# Retrieve content

In [13]:
from collections import namedtuple, Counter
import numpy as np

# Parallel lists of lemmas and surface forms for one category.
Dataset = namedtuple("Dataset", "lemmas forms")
# One count per split file, in train/dev/test order.
Samples = namedtuple("Samples", "train dev test")
# A loaded corpus: per-category datasets plus per-category split counts.
Corpus = namedtuple("Corpus", "dataset samples")

def get_lemmas(file: str = "./dataset/main/dev.txt") -> dict:
    """Collect lemmas and tokens from a tab-separated file, grouped by tag.

    The first line is the header; its first column name is discarded and the
    remaining names are paired positionally with the fields of each data row.
    A line starting with ``#[TAG]`` switches the current category to the text
    following that marker. Rows whose field count differs from the number of
    retained header names are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        A dict with the keys ``"positive"`` and ``"negative"``, each mapping
        to a ``Dataset`` whose ``lemmas`` and ``forms`` lists hold the
        ``lemma`` and ``token`` fields of the rows read under that category.

    Raises:
        ValueError: If a data row appears before any ``#[TAG]`` line.
        KeyError: If a tag names a category other than the two above, or if
            the header has no ``lemma`` / ``token`` column.
    """
    out = {
        "positive": Dataset([], []),
        "negative": Dataset([], [])
    }
    headers = []
    cat = None  # no category is known until the first #[TAG] line

    with open(file) as f:
        for idx, orig_line in enumerate(f):
            if idx == 0:
                headers = orig_line.strip().split("\t")[1:]
                continue
            # Recognise tag lines before looking at the column count, so a
            # tag can never be mistaken for a data row.
            if orig_line.startswith("#[TAG]"):
                cat = orig_line.strip()[6:]
                continue
            # Drop only the line terminator: the column count is unchanged,
            # but the last field no longer carries a trailing "\n".
            fields = orig_line.rstrip("\n").split("\t")
            if len(fields) != len(headers):
                continue
            if cat is None:
                raise ValueError(
                    f"{file}: data row on line {idx + 1} precedes any #[TAG] line"
                )
            row = dict(zip(headers, fields))
            out[cat].lemmas.append(row["lemma"])
            out[cat].forms.append(row["token"])
    return out

def get_corpus(folder="./dataset/main/"):
    """Load ``train.txt``, ``dev.txt`` and ``test.txt`` from *folder*.

    Lemmas and forms of the three files are concatenated per category, and
    the number of ``#[TAG]`` lines of each category is counted per file.

    Args:
        folder: Directory holding the three split files. A trailing path
            separator is optional.

    Returns:
        A ``Corpus`` whose ``dataset`` maps ``"positive"`` / ``"negative"``
        to the merged ``Dataset`` and whose ``samples`` maps the same keys
        to a ``Samples(train, dev, test)`` of tag counts.

    Raises:
        KeyError: If a tag names a category other than the two above.
    """
    import os  # local import: only needed to join the folder and file name

    merged = {
        "positive": Dataset([], []),
        "negative": Dataset([], []),
    }
    counts = {
        "positive": [0, 0, 0],
        "negative": [0, 0, 0]
    }
    for idx, split_name in enumerate(("train.txt", "dev.txt", "test.txt")):
        # os.path.join inserts the separator only when it is missing, so
        # both "./dataset/main" and "./dataset/main/" resolve correctly.
        file = os.path.join(folder, split_name)

        for key, dataset in get_lemmas(file).items():
            # A Dataset is a (lemmas, forms) pair; extend field by field.
            for target, source in zip(merged[key], dataset):
                target.extend(source)

        # Second pass over the file: one #[TAG] line is counted as one sample.
        with open(file) as f:
            for orig_line in f:
                if orig_line.startswith("#[TAG]"):
                    cat = orig_line.strip()[6:]
                    counts[cat][idx] += 1
    return Corpus(merged, {key: Samples(*val) for key, val in counts.items()})

# Load the corpus from get_corpus's default folder ("./dataset/main/").
Main = get_corpus()

# MTLD and other metric functions

In [14]:
# from https://github.com/kristopherkyle/lexical_diversity/blob/master/lexical_diversity/lex_div.py


def safe_divide(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0


def ttr(text):
    """Return the type-token ratio of *text*: distinct items over total items.

    An empty *text* yields 0, because ``safe_divide`` returns 0 for a zero
    denominator.
    """
    return safe_divide(len(set(text)), len(text))

def mtld(input, min = 10):
    """Return the MTLD score of a token sequence.

    This is the original MTLD described in Jarvis & McCarthy. The sequence is
    scanned once forwards and once reversed, and the two scores are averaged.

    Args:
        input: Sequence of tokens (it is sliced and reversed).
        min: Minimum segment length before a factor may be closed.
    """
    def mtlder(text):
        factor_count = 0
        covered = 0
        start = 0
        last = len(text) - 1
        for pos in range(len(text)):
            segment = text[start:pos + 1]
            if pos == last:
                # End of text: credit the unfinished segment as a partial
                # factor, proportional to how far its TTR has fallen from 1
                # towards the .72 threshold.
                factor_count += safe_divide((1 - ttr(segment)), (1 - .72))
                covered += len(segment)
            elif ttr(segment) < .720 and len(segment) >= min:
                # TTR dropped below the threshold: close a full factor and
                # start the next segment right after this token.
                factor_count += 1
                covered += len(segment)
                start = pos + 1

        return safe_divide(covered, factor_count)

    backwards = list(reversed(input))
    return safe_divide((mtlder(input) + mtlder(backwards)), 2)

# Our own code
def partage(k, j):
    """Return the percentage of distinct items of *k* that also occur in *j*.

    Args:
        k: Iterable of hashable items; its distinct items are the reference.
        j: Iterable of hashable items to compare against.

    Returns:
        A value between 0 and 100. When *k* is empty the result is 0, the
        same convention ``safe_divide`` uses for a zero denominator, instead
        of raising ``ZeroDivisionError``.
    """
    types_k = set(k)  # built once, reused for the intersection and the total
    if not types_k:
        return 0
    return len(types_k.intersection(j)) / len(types_k) * 100

def distribution(data):
    """Return the standard deviation (``np.std``) of the item frequencies in *data*."""
    frequencies = Counter(data)
    return np.std(list(frequencies.values()))

## Déviation standard

In [17]:
def get_pos_negs(theCorpus):
    """Compute the descriptive metrics of both categories of a corpus.

    Args:
        theCorpus: A ``Corpus`` whose ``dataset`` and ``samples`` mappings
            have the keys ``"positive"`` and ``"negative"``.

    Returns:
        A ``(positives, negatives)`` pair of dicts with identical keys. The
        ``Partage(...)`` entries of each dict are measured against the other
        category.
    """
    def summarise(own, other):
        mine = theCorpus.dataset[own]
        theirs = theCorpus.dataset[other]
        return {
            "Taille": len(mine.forms),
            "Richesse": len(set(mine.forms)),
            "MTLD(Lemmes)": mtld(mine.lemmas),
            "MTLD(Formes)": mtld(mine.forms),
            "Distribution(Lemmes)": distribution(mine.lemmas),
            "Distribution(Formes)": distribution(mine.forms),
            "Partage(Lemmes)": partage(mine.lemmas, theirs.lemmas),
            "Partage(Formes)": partage(mine.forms, theirs.forms),
            "Documents": theCorpus.samples[own],
        }

    positives = summarise("positive", "negative")
    negatives = summarise("negative", "positive")
    return positives, negatives

In [28]:
from pandas import DataFrame
# One column per category, one row per metric, rendered as a LaTeX table.
df_main = DataFrame(list(get_pos_negs(Main)), index=["Positifs", "Négatifs"]).T
print(df_main.to_latex(float_format="{:.2f}".format))

\begin{tabular}{lll}
\toprule
{} &          Positifs &             Négatifs \\
\midrule
Taille               &             44964 &               525220 \\
Richesse             &             13305 &                80153 \\
MTLD(Lemmes)         &            153.90 &               114.70 \\
MTLD(Formes)         &            384.53 &               286.56 \\
Distribution(Lemmes) &             62.46 &               393.70 \\
Distribution(Formes) &             40.69 &               191.22 \\
Partage(Lemmes)      &             78.12 &                22.39 \\
Partage(Formes)      &             64.55 &                10.72 \\
Documents            &  (2013, 252, 251) &  (19940, 2493, 2491) \\
\bottomrule
\end{tabular}



In [20]:
# Load the other three corpora. Each folder is read exactly once; the
# duplicated "inversed-metaphors" load only re-parsed the same files.
Partial = get_corpus("./dataset/main-partial/")
Metaphors = get_corpus("./dataset/metaphors/")
Literals = get_corpus("./dataset/inversed-metaphors/")

In [27]:
# Same metric table as for the main corpus, computed on the partial corpus.
df_partial = DataFrame(list(get_pos_negs(Partial)), index=["Positifs", "Négatifs"]).T
print(df_partial.to_latex(float_format="{:.2f}".format))


\begin{tabular}{lll}
\toprule
{} &         Positifs &            Négatifs \\
\midrule
Taille               &            16162 &              187673 \\
Richesse             &             6625 &               41581 \\
MTLD(Lemmes)         &           155.93 &              113.20 \\
MTLD(Formes)         &           397.46 &              295.16 \\
Distribution(Lemmes) &            28.92 &              178.27 \\
Distribution(Formes) &            20.52 &               94.54 \\
Partage(Lemmes)      &            76.42 &               20.85 \\
Partage(Formes)      &            56.62 &                9.02 \\
Documents            &  (420, 252, 251) &  (3970, 2493, 2491) \\
\bottomrule
\end{tabular}



In [31]:
# Metric-by-metric difference (partial minus main); the "Documents" row holds
# tuples of counts, so it is dropped before subtracting.
df_partial.drop(index="Documents") - df_main.drop(index="Documents")

Unnamed: 0,Positifs,Négatifs
Taille,-28802.0,-337547.0
Richesse,-6680.0,-38572.0
MTLD(Lemmes),2.025764,-1.507228
MTLD(Formes),12.929786,8.599699
Distribution(Lemmes),-33.547779,-215.43442
Distribution(Formes),-20.165063,-96.677287
Partage(Lemmes),-1.703276,-1.539512
Partage(Formes),-7.935811,-1.694809


In [30]:
# Show the per-split sample counts of every loaded corpus.
for loaded in (Partial, Main, Literals, Metaphors):
    print(loaded.samples)
from pandas import MultiIndex

# Display labels mapped to the loaded corpora and to the keys of
# Corpus.samples; dict order fixes the row order before sorting.
corpus_by_label = {
    "Général": Main,
    "Partiel": Partial,
    "Litéral": Literals,
    "Métaphores": Metaphors,
}
key_by_label = {"Positif": "positive", "Négatif": "negative"}

indexes = MultiIndex.from_tuples(
    [
        (corpus_label, set_label)
        for corpus_label in corpus_by_label
        for set_label in key_by_label
    ],
    names=["Corpus", "Set"],
)

# One row per (corpus, set) pair, built in the same order as the index tuples.
rows = [
    loaded.samples[key]
    for loaded in corpus_by_label.values()
    for key in key_by_label.values()
]
df = DataFrame(rows, index=indexes).sort_index(level="Set")

print(df.to_latex())

{'positive': Samples(train=420, dev=252, test=251), 'negative': Samples(train=3970, dev=2493, test=2491)}
{'positive': Samples(train=2013, dev=252, test=251), 'negative': Samples(train=19940, dev=2493, test=2491)}
{'positive': Samples(train=1439, dev=618, test=459), 'negative': Samples(train=15701, dev=1745, test=7478)}
{'positive': Samples(train=413, dev=46, test=2057), 'negative': Samples(train=15701, dev=1745, test=7478)}
\begin{tabular}{llrrr}
\toprule
        &         &  train &   dev &  test \\
Corpus & Set &        &       &       \\
\midrule
Général & Négatif &  19940 &  2493 &  2491 \\
Litéral & Négatif &  15701 &  1745 &  7478 \\
Métaphores & Négatif &  15701 &  1745 &  7478 \\
Partiel & Négatif &   3970 &  2493 &  2491 \\
Général & Positif &   2013 &   252 &   251 \\
Litéral & Positif &   1439 &   618 &   459 \\
Métaphores & Positif &    413 &    46 &  2057 \\
Partiel & Positif &    420 &   252 &   251 \\
\bottomrule
\end{tabular}

