# Interprétation des résultats sur des entraînements parcellaires

**Avant**:
1. Réduction du corpus d'entraînement à une fraction du corpus total (0.001, 0.005, etc.)
2. Entraînement sur deux modèles (couches RNN différentes)
3. Test de chacun des entraînements sur le corpus de test le plus gros (80% du vrai corpus)

**Ici**:

1. Parser le log
2. Transformer en dataframe
3. Analyser

## Parsage

In [14]:
import regex

def parse_task(line):
    """Return the task name announced by an evaluation-report header line.

    Args:
        line: One line of the evaluation log.

    Returns:
        The word following ``"::: Evaluation report for task: "`` (for
        example ``"lemma"``), or ``None`` when the line holds no such header.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    match = regex.search(r"::: Evaluation report for task: (\w+)", line)
    if match is None:
        return None
    return match.group(1)
    
def parse_model(line):
    """Extract the corpus fraction and the layer marker from a model line.

    The fraction is the first ``<digit>,<digits>`` token found in the line
    (decimal comma, e.g. ``0,075``), converted to a float. The layer marker
    is 1 when the line contains ``"simpler"`` and 2 otherwise (presumably
    the number of RNN layers; confirm against the training setup).

    Args:
        line: A log line naming an archived model.

    Returns:
        A ``(percent, layer)`` tuple.

    Raises:
        IndexError: If the line contains no decimal-comma number.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    fractions = regex.findall(r"(\d,\d+)", line)
    percent = float(fractions[0].replace(",", "."))
    layer = 1 if "simpler" in line else 2
    return (percent, layer)

def parse_accuracy(line):
    """Convert an ``"  accuracy: <value>"`` log line into a float.

    The first ``len("  accuracy: ")`` characters are dropped without being
    checked; the remainder is stripped of whitespace and parsed.
    """
    prefix = "  accuracy: "
    raw_value = line[len(prefix):]
    return float(raw_value.strip())

# Build one record per archived model found in the log:
#   {"RNN": <layer>, "% Corpus": <fraction>, <task>: <accuracy>, ...}
# A model line opens a new record; each task header followed by an accuracy
# line adds one score to the most recently opened record.
models = []
with open("percent.log") as f:
    # Task named by the latest report header whose accuracy has not been
    # read yet. None means accuracy lines are ignored.
    current_task = None
    for line in f:
        if line.startswith("::: Evaluation report for task: "):
            current_task = parse_task(line)
        elif line.startswith("archived-models/"):
            size, layer = parse_model(line)
            models.append({"RNN": layer, "% Corpus": size})
        elif current_task and line.startswith("  accuracy: "):
            # Assumes a model line was seen before the first accuracy line;
            # otherwise models[-1] raises IndexError.
            models[-1][current_task] = parse_accuracy(line)
            # Only the first accuracy line after a header is kept.
            current_task = None

# Stable sort: records sharing a corpus fraction keep their log order.
models = sorted(models, key=lambda x: x["% Corpus"])
models

[{'RNN': 2, '% Corpus': 0.005, 'lemma': 0.0433, 'pos': 0.6921, 'Gend': 0.8311},
 {'RNN': 2, '% Corpus': 0.01, 'lemma': 0.0, 'pos': 0.2829, 'Gend': 0.7535},
 {'RNN': 1, '% Corpus': 0.01, 'lemma': 0.7408, 'pos': 0.7907, 'Gend': 0.8508},
 {'RNN': 2, '% Corpus': 0.05, 'lemma': 0.8796, 'pos': 0.9002, 'Gend': 0.9124},
 {'RNN': 1, '% Corpus': 0.05, 'lemma': 0.8895, 'pos': 0.8868, 'Gend': 0.9025},
 {'RNN': 2, '% Corpus': 0.075, 'lemma': 0.9209, 'pos': 0.9206, 'Gend': 0.9302},
 {'RNN': 1, '% Corpus': 0.075, 'lemma': 0.8994, 'pos': 0.9015, 'Gend': 0.9141},
 {'RNN': 2, '% Corpus': 0.1, 'lemma': 0.9363, 'pos': 0.9288, 'Gend': 0.9332},
 {'RNN': 1, '% Corpus': 0.1, 'lemma': 0.9165, 'pos': 0.9169, 'Gend': 0.9246},
 {'RNN': 2, '% Corpus': 0.2, 'lemma': 0.9492, 'pos': 0.9474, 'Gend': 0.9484},
 {'RNN': 1, '% Corpus': 0.2, 'lemma': 0.9399, 'pos': 0.9385, 'Gend': 0.9432},
 {'RNN': 2, '% Corpus': 0.4, 'lemma': 0.9679, 'pos': 0.9609, 'Gend': 0.9609},
 {'RNN': 1, '% Corpus': 0.4, 'lemma': 0.954, 'pos': 0.952

## Dataframe et dump csv

In [15]:
import pandas as pd
import pandasql as ps

TASKS = ("lemma", "pos", "Gend")

DF = pd.DataFrame(models)
# For each corpus fraction, keep only the RNN configuration whose lemma
# accuracy is strictly higher than the other configuration's. A fraction
# that has a single configuration, or a tie, yields no row because the
# comparison is then NULL or false.
# `Gend` is spelled exactly like the DataFrame column so that the name of the
# result column does not depend on how SQLite echoes unaliased columns.
DF = ps.sqldf("""SELECT 
                RNN, `% Corpus`, lemma, pos, Gend
            FROM 
                DF as orig
            WHERE (
                RNN = 1 AND lemma > (
                    SELECT
                        lemma 
                    FROM 
                        DF as comp
                    WHERE 
                        comp.RNN = 2 AND comp.`% Corpus` = orig.`% Corpus`
                    LIMIT 1
                )
            ) OR (
                RNN = 2 AND lemma > (
                    SELECT
                        lemma 
                    FROM 
                        DF as comp
                    WHERE 
                        comp.RNN = 1 AND comp.`% Corpus` = orig.`% Corpus`
                    LIMIT 1
                )
            )
            ORDER BY orig.`% Corpus`
            """)

# Row-to-row accuracy gain per task; the first row has no predecessor and
# gets 0.
gain = {}
for task in TASKS:
    previous = None
    deltas = []
    for value in DF[task]:
        # Compare against None explicitly: a previous accuracy of 0.0 is a
        # real value and must not be mistaken for "no previous row".
        deltas.append(0 if previous is None else value - previous)
        previous = value
    gain[f"Δ({task})"] = deltas

# Place each Δ column immediately after the task column it describes
# (lemma is column 2, so its Δ goes to 3, then 5, then 7).
for idx, (column, deltas) in enumerate(gain.items()):
    DF.insert(3 + idx * 2, column, deltas, allow_duplicates=True)
print(DF.to_latex())
print(str(DF))

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  RNN &  \% Corpus &   lemma &  Δ(lemma) &     pos &  Δ(pos) &    Gend &  Δ(Gend) \\
\midrule
0 &    1 &     0.010 &  0.7408 &    0.0000 &  0.7907 &  0.0000 &  0.8508 &   0.0000 \\
1 &    1 &     0.050 &  0.8895 &    0.1487 &  0.8868 &  0.0961 &  0.9025 &   0.0517 \\
2 &    2 &     0.075 &  0.9209 &    0.0314 &  0.9206 &  0.0338 &  0.9302 &   0.0277 \\
3 &    2 &     0.100 &  0.9363 &    0.0154 &  0.9288 &  0.0082 &  0.9332 &   0.0030 \\
4 &    2 &     0.200 &  0.9492 &    0.0129 &  0.9474 &  0.0186 &  0.9484 &   0.0152 \\
5 &    2 &     0.400 &  0.9679 &    0.0187 &  0.9609 &  0.0135 &  0.9609 &   0.0125 \\
6 &    2 &     0.600 &  0.9702 &    0.0023 &  0.9637 &  0.0028 &  0.9645 &   0.0036 \\
7 &    2 &     0.800 &  0.9748 &    0.0046 &  0.9660 &  0.0023 &  0.9697 &   0.0052 \\
\bottomrule
\end{tabular}

   RNN  % Corpus   lemma  Δ(lemma)     pos  Δ(pos)    Gend  Δ(Gend)
0    1     0.010  0.7408    0.0000  0.7907  0.0000  0.8508   0.0000
1    1 

## Export des résultats de parsage

In [16]:
import csv

# Save the current contents of DF to "out-percent.csv" (path relative to the
# working directory).
DF.to_csv("out-percent.csv")

## Parsage sur Vulgate

In [17]:
import regex

def parse_task(line):
    """Return the task name announced by an evaluation-report header line.

    Args:
        line: One line of the evaluation log.

    Returns:
        The word following ``"::: Evaluation report for task: "`` (for
        example ``"lemma"``), or ``None`` when the line holds no such header.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    match = regex.search(r"::: Evaluation report for task: (\w+)", line)
    if match is None:
        return None
    return match.group(1)
    
def parse_model(line):
    """Extract the corpus fraction and the layer marker from a model line.

    The fraction is the first ``<digit>,<digits>`` token found in the line
    (decimal comma, e.g. ``0,075``), converted to a float. The layer marker
    is 1 when the line contains ``"simpler"`` and 2 otherwise (presumably
    the number of RNN layers; confirm against the training setup).

    Args:
        line: A log line naming an archived model.

    Returns:
        A ``(percent, layer)`` tuple.

    Raises:
        IndexError: If the line contains no decimal-comma number.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    fractions = regex.findall(r"(\d,\d+)", line)
    percent = float(fractions[0].replace(",", "."))
    layer = 1 if "simpler" in line else 2
    return (percent, layer)

def parse_accuracy(line):
    """Convert an ``"  accuracy: <value>"`` log line into a float.

    The first ``len("  accuracy: ")`` characters are dropped without being
    checked; the remainder is stripped of whitespace and parsed.
    """
    prefix = "  accuracy: "
    raw_value = line[len(prefix):]
    return float(raw_value.strip())

# Parse the second evaluation log into one record per archived model:
#   {"RNN": <layer>, "% Corpus": <fraction>, <task>: <accuracy>, ...}
results_latin_tardif = []
with open("percent-full-test.log") as f:
    # Task named by the latest report header whose accuracy has not been
    # read yet. None means accuracy lines are ignored.
    current_task = None
    for line in f:
        if line.startswith("::: Evaluation report for task: "):
            current_task = parse_task(line)
        elif line.startswith("archived-models/"):
            size, layer = parse_model(line)
            results_latin_tardif.append({"RNN": layer, "% Corpus": size})
        elif current_task and line.startswith("  accuracy: "):
            results_latin_tardif[-1][current_task] = parse_accuracy(line)
            current_task = None

# Stable sort: records sharing a corpus fraction keep their log order.
results_latin_tardif = sorted(results_latin_tardif, key=lambda x: x["% Corpus"])
nb_rows, _ = DF.shape

# One slot per DF row; rows with no matching model in this log stay at 0.0.
perf = {
    "θ lemma": [0.0] * nb_rows,
    "θ pos": [0.0] * nb_rows,
    "θ Gend": [0.0] * nb_rows,
    "Δ(θ lemma)": [0.0] * nb_rows,
    "Δ(θ pos)": [0.0] * nb_rows,
    "Δ(θ Gend)": [0.0] * nb_rows,
}
for model in results_latin_tardif:
    # One combined mask: chaining DF[mask_a][mask_b] would apply a mask built
    # on the full frame to an already filtered one.
    same_model = (DF["RNN"] == model["RNN"]) & (DF["% Corpus"] == model["% Corpus"])
    data = DF[same_model]
    if not data.empty:
        # The index label is used as a list position (assumes DF carries a
        # default 0..n-1 index).
        row_index = int(data.index[0])
        for task in ("pos", "lemma", "Gend"):
            perf[f"θ {task}"][row_index] = model[task]
            if row_index != 0:
                # Uses the θ already stored for the previous DF row, so it
                # assumes rows are filled in ascending order.
                perf[f"Δ(θ {task})"][row_index] = model[task] - perf[f"θ {task}"][row_index - 1]

# Only the lemma scores are added to the table; "pos" and "Gend" are computed
# above but deliberately left out. Each task's θ pair lands right after its
# existing (score, Δ) pair, hence the stride of 4 columns.
for idx, task in enumerate(("lemma",)):
    DF.insert(4 + 4 * idx, f"θ {task}", perf[f"θ {task}"], allow_duplicates=True)
    DF.insert(4 + 4 * idx + 1, f"Δ(θ {task})", perf[f"Δ(θ {task})"], allow_duplicates=True)
print(str(DF))

   RNN  % Corpus   lemma  Δ(lemma)  θ lemma  Δ(θ lemma)     pos  Δ(pos)  \
0    1     0.010  0.7408    0.0000   0.7246      0.0000  0.7907  0.0000   
1    1     0.050  0.8895    0.1487   0.8575      0.1329  0.8868  0.0961   
2    2     0.075  0.9209    0.0314   0.8554     -0.0021  0.9206  0.0338   
3    2     0.100  0.9363    0.0154   0.8943      0.0389  0.9288  0.0082   
4    2     0.200  0.9492    0.0129   0.8818     -0.0125  0.9474  0.0186   
5    2     0.400  0.9679    0.0187   0.9225      0.0407  0.9609  0.0135   
6    2     0.600  0.9702    0.0023   0.9240      0.0015  0.9637  0.0028   
7    2     0.800  0.9748    0.0046   0.9104     -0.0136  0.9660  0.0023   

     Gend  Δ(Gend)  
0  0.8508   0.0000  
1  0.9025   0.0517  
2  0.9302   0.0277  
3  0.9332   0.0030  
4  0.9484   0.0152  
5  0.9609   0.0125  
6  0.9645   0.0036  
7  0.9697   0.0052  




In [24]:
# Render DF as a LaTeX table (no index, three decimals), then rewrite the
# column labels for publication. Substitutions run in the listed order on the
# whole LaTeX string.
latex_table = DF.to_latex(index=False, float_format="{:0.3f}".format)
for source_text, latex_text in (
    ("Δ", "$\\Delta$"),
    ("θ", "$\\theta$"),
    ("lemma", "Lemme"),
    ("pos", "POS"),
    ("Gend", "Genre"),
):
    latex_table = latex_table.replace(source_text, latex_text)
print(latex_table)

\begin{tabular}{rrrrrrrrrr}
\toprule
 RNN &  \% Corpus &  Lemme &  $\Delta$(Lemme) &  $\theta$ Lemme &  $\Delta$($\theta$ Lemme) &   POS &  $\Delta$(POS) &  Genre &  $\Delta$(Genre) \\
\midrule
   1 &     0.010 &  0.741 &     0.000 &    0.725 &       0.000 & 0.791 &   0.000 & 0.851 &    0.000 \\
   1 &     0.050 &  0.889 &     0.149 &    0.858 &       0.133 & 0.887 &   0.096 & 0.902 &    0.052 \\
   2 &     0.075 &  0.921 &     0.031 &    0.855 &      -0.002 & 0.921 &   0.034 & 0.930 &    0.028 \\
   2 &     0.100 &  0.936 &     0.015 &    0.894 &       0.039 & 0.929 &   0.008 & 0.933 &    0.003 \\
   2 &     0.200 &  0.949 &     0.013 &    0.882 &      -0.012 & 0.947 &   0.019 & 0.948 &    0.015 \\
   2 &     0.400 &  0.968 &     0.019 &    0.922 &       0.041 & 0.961 &   0.013 & 0.961 &    0.012 \\
   2 &     0.600 &  0.970 &     0.002 &    0.924 &       0.002 & 0.964 &   0.003 & 0.965 &    0.004 \\
   2 &     0.800 &  0.975 &     0.005 &    0.910 &      -0.014 & 0.966 &   0.002 & 0.