# Libraries

In [74]:
import os
import pandas as pd
import numpy as np
import re, operator
from operator import itemgetter
from itertools import islice

# stats

from scipy import stats
from scipy import special
from scipy.spatial.distance import cosine
from scipy.stats import zipf

# dataviz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Utils

In [46]:
def read_fcu(filename, corpus_name):
    
    '''
    INPUT :
    OUTPUT :
    '''
    
    # description du corpus
    print("========> Counting bsv..")
    total = 0
    for root, dirs, files in os.walk(corpus_name):
        total += len(files)
    print("\nTotal : {} bsv dans le corpus".format(total))

    
    # create df
    print("\n========> Creating a dataframe..")
    df = pd.read_csv(filename, delimiter="\t", encoding='utf-8')

    
    # short descriptions
    print("\n========> Making short descriptions..")
    print("\nPrefLabel :\n")
    print(df['prefLabel'].describe())
    print("\n\nBSV :\n")
    print(df['bsv'].describe())
    #print("\n\nLength :\n")
    #print(df['length'].describe())
    
    # save dictionary of extracted labels
    label_counter = df['prefLabel'].value_counts()
    label_dict = label_counter.to_dict()
    df_freq = pd.DataFrame.from_dict(label_dict,orient='index', columns=['count'])
    #df_freq.reset_index(level='prefLabel')
    df_freq = df_freq.reset_index().rename({'index':'prefLabel'}, axis = 'columns')
    #df_freq['prefLabel'] = df_freq.index
    print("\nTop 10 labels :\n{}".format(df_freq[:10]))
    


    return df, df_freq



def show_frequencies(df_freq):
    
    '''
    INPUT :
    OUTPUT :
    '''
    
    # show scatter graph
    fig = px.scatter(df_freq, x=df_freq['prefLabel'], y=df_freq['count'])
    fig.show()
    
    # show tree graph
    fig = px.treemap(df_freq, path=df_freq , values='count')
    fig.show()
    
    return True




def show_differences(df1, df2):
    
    '''
    INPUT :
    OUTPUT :
    '''
    
    print(df1['prefLabel'].isin(df2['prefLabel']).value_counts())
    print(df1[~df1['prefLabel'].isin(df2['prefLabel'])])
    
    return True



def scatter_matrix(df, list_of_dimensions):
    
    '''
    Scatter plots show how much one variable is affected by another or the relationship between them with the help of dots in two dimensions. 
    Scatter plots are very much like line graphs in the concept that they use horizontal and vertical axes to plot data points
    
    INPUT :
    OUTPUT :
    '''
    
    fig = px.scatter_matrix(df,
    dimensions=list_of_dimensions,
    color=df.index,
    title="Scatter matrix",
    labels={col:col.replace('_', ' ') for col in df.columns}) # remove underscore
    fig.update_traces(diagonal_visible=True)
    fig.show()
    return True



def cosine_similarity(col1, col2):
    '''
    The cosine measure similarity is a similarity metric that depends on envisioning user preferences as points in space.  
    The cosine of a small angle is near 1, and the cosine of a large angle near 180 degrees is close to –1. 
    This is good, because small angles should map to high similarity, near 1, and large angles should map to near –1
    
    INPUT :
    OUTPUT :
    '''
    
    result = (1 - cosine(col1, col2))
    print(result)
    return result

# Dataviz

### FCU

#### D2KAB

In [47]:
d2kab_fcu, d2kab_freq = read_fcu("../output/d2kab/fcu/fcu_d2kab.csv", "../resources/Corpus-v-12-04-21/train/")


Total : 230 bsv dans le corpus



PrefLabel :

count      9174
unique      167
top       vigne
freq       1072
Name: prefLabel, dtype: object


BSV :

count                                                  9174
unique                                                  230
top       BSV_legumes_allium_pomme_de_terre_AURA_2019-14...
freq                                                    251
Name: bsv, dtype: object

Top 10 labels :
         prefLabel  count
0            vigne   1072
1            colza    655
2     chou potager    407
3             chou    407
4              blé    384
5             maïs    377
6           tomate    346
7  arbuste à baies    286
8          carotte    254
9             pois    235


In [48]:
# show graphs
show_frequencies(d2kab_freq)

True

#### VESPA

In [49]:
vespa_fcu, vespa_freq = read_fcu("../output/vespa/fcu/fcu_vespa.csv", "../resources/corpusVespa.html")


Total : 497 bsv dans le corpus



PrefLabel :

count       12773
unique        196
top       pommier
freq          742
Name: prefLabel, dtype: object


BSV :

count                  12773
unique                   444
top       BSV_legumes_11_002
freq                     414
Name: bsv, dtype: object

Top 10 labels :
                 prefLabel  count
0                  pommier    742
1                    colza    696
2  arboriculture fruitière    561
3                     pois    494
4                      blé    394
5             chou potager    380
6                     chou    380
7                  céréale    354
8             floriculture    347
9                  poirier    339


In [50]:
show_frequencies(vespa_freq)

True

#### Differences

In [52]:
fig = make_subplots(rows=1, cols=2)


fig.add_trace(
    go.Scatter(
        x=d2kab_freq['prefLabel'],
        y=d2kab_freq['count']
    ),
    row=1, col=1
)


fig.add_trace(
    go.Scatter(
        x=vespa_freq['prefLabel'],
        y=vespa_freq['count']
    ),
    row=1, col=2
)



fig.update_layout(height=600, width=1000, title_text="Term frequency D2KAB vs. VESPA")
fig.show()

In [53]:
# words in d2kab that are not in vespa
show_differences(d2kab_freq, vespa_freq)

True     153
False     14
Name: prefLabel, dtype: int64
            prefLabel  count
60     grande culture     22
65        pois chiche     20
95             cardon      9
103             ricin      8
115          pastèque      5
119        potimarron      5
125           basilic      4
128           oseille      4
132           houblon      4
134           potiron      4
152  courge butternut      2
154  sorgho fourrager      1
155   abricotier pays      1
156     légume racine      1


True

In [54]:
# words in vespa that are not in d2kab
show_differences(vespa_freq, d2kab_freq)

True     153
False     43
Name: prefLabel, dtype: int64
                    prefLabel  count
63                  actinidia     45
64               chrysanthème     44
96                     seigle     14
97                   cerfeuil     14
101  horticulture ornementale     13
104  pomme de terre féculière     12
110                  photinie     10
111    olivier fruit à noyaux     10
115               groseillier     10
117         bananier plantain     10
123                poinsettia      8
127             canne à sucre      8
132                 limettier      7
134                    pomelo      6
135            plante à fibre      6
138                  myrtille      6
139              lin à fibres      6
145                    millet      5
149               mandarinier      4
153                    tangor      4
154                cassissier      4
157                   gerbera      3
158                 calebasse      3
159         cerfeuil tubéreux      3
160               c

True

In [55]:
# concatenate frequence matrices and keep all values
result = pd.concat([d2kab_freq.set_index('prefLabel').rename({'count': 'count_d2kab'}, axis='columns'), vespa_freq.set_index('prefLabel').rename({'count': 'count_vespa'}, axis='columns')], axis=1, sort=False)

# if no values were not found, then it the value is set to 0
result = result.fillna(0).astype(int)

# show
result[:3]

Unnamed: 0,count_d2kab,count_vespa
vigne,1072,314
colza,655,696
chou potager,407,380


In [56]:
scatter_matrix(df = result, list_of_dimensions = ["count_d2kab", "count_vespa"])

True

In [57]:
# idf is irrelevant when doing a Pearson mean correlation. All tf values for the same term will be multiplied by the same idf value yielding the final tf-idf. The PMC is invariant with respect to scaling of the input, so the idf is canceled out here. Hence all that matters in your proposed idea is the tf.
# If the coefficient value lies between ± 0.50 and ± 1, then it is said to be a strong correlation
# Negative coefficients represent cases when the value of one variable increases, the value of the other variable tends to decrease.
# Examples :
# 0.5 - 0.7 indicate variables which can be considered moderately correlated
# 0: No relationship. As one value increases, there is no tendency for the other value to change in a specific direction.
# -1: A perfect negative relationship.
# -0.8: A fairly strong negative relationship.
# -0.6 A moderate negative relationship.
result["count_d2kab"].corr(result["count_vespa"], method ='pearson')

0.6372818035002872

# ToMap

### D2KAB

In [58]:
d2kab_fcu_tomap, d2kab_freq_tomap = read_fcu("../output/d2kab-fcu-tomap.csv", "../resources/Corpus-v-12-04-21/train/")



Total : 230 bsv dans le corpus



PrefLabel :

count       15686
unique        347
top       carotte
freq          634
Name: prefLabel, dtype: object


BSV :

count                                15686
unique                                 581
top       bsv_gc_mp_n27_16052019_cle046a11
freq                                   500
Name: bsv, dtype: object

Top 10 labels :
                          prefLabel  count
0                           carotte    634
1  culture légumière de plein champ    571
2                             colza    422
3                              maïs    324
4                             vigne    321
5                      chou potager    304
6                              pois    294
7                              chou    260
8                               blé    255
9                            tomate    250


In [59]:
d2kab_fcu_tomap[:12554]['bsv'].describe()

count                                12554
unique                                 228
top       bsv_gc_mp_n27_16052019_cle046a11
freq                                   500
Name: bsv, dtype: object

In [61]:
# save dictionary of extracted labels
label_counter = d2kab_fcu_tomap[:12554]['prefLabel'].value_counts()
label_dict = label_counter.to_dict()
d2kab_freq_tomap = pd.DataFrame.from_dict(label_dict,orient='index', columns=['count'])
d2kab_freq_tomap = d2kab_freq_tomap.reset_index().rename({'index':'prefLabel'}, axis = 'columns')
show_frequencies(d2kab_freq_tomap)

True

In [62]:
# words that are in baseline d2kab but not in tomap
show_differences(d2kab_freq, d2kab_freq_tomap)

True     149
False     18
Name: prefLabel, dtype: int64
           prefLabel  count
67           olivier     19
93            agrume      9
101         manguier      8
113           ananas      5
116    bananier musa      5
121          figuier      5
122              riz      5
126         phacélie      4
137        pissenlit      3
139         roquette      3
141        noisetier      3
144         géranium      2
155  abricotier pays      1
156    légume racine      1
157      châtaignier      1
161           rosier      1
162          chanvre      1
165     clémentinier      1


True

In [63]:
show_differences(d2kab_freq_tomap, d2kab_freq)

True     149
False    107
Name: prefLabel, dtype: int64
                        prefLabel  count
9    choux-navets à chair blanche    218
11                    petits pois    208
13                   pois potager    164
15        arbre à fruits à coques    157
17                 légumes fleurs    145
..                            ...    ...
251           moutarde fourragère      1
252          moutarde oléagineuse      1
253                rosier odorant      1
254            avoine nue d'hiver      1
255                moutarde brune      1

[107 rows x 2 columns]


True

In [64]:
# concatenate frequence matrices and keep all values
result_d2kab_tomap = pd.concat([d2kab_freq_tomap.set_index('prefLabel').rename({'count': 'count_tomap'}, axis='columns'), d2kab_freq.set_index('prefLabel').rename({'count': 'count_baseline'}, axis='columns')], axis=1, sort=False)

# if no values were not found, then it the value is set to 0
result_d2kab_tomap = result_d2kab_tomap.fillna(0).astype(int)

# show
result_d2kab_tomap[:10]

Unnamed: 0,count_tomap,count_baseline
carotte,630,254
culture légumière de plein champ,540,16
colza,409,655
maïs,308,377
vigne,305,1072
pois,257,235
chou potager,249,407
tomate,247,346
blé,240,384
choux-navets à chair blanche,218,0


In [65]:
scatter_matrix(result_d2kab_tomap, list_of_dimensions = ["count_tomap", "count_baseline"])

True

In [66]:
result_d2kab_tomap["count_tomap"].corr(result_d2kab_tomap["count_baseline"], method ='pearson')

0.588192131954007

In [78]:
d2kab_fcu_tomap[:18024]["word"].describe()
counts = d2kab_fcu_tomap[:18024]["word"].str.lower().value_counts()
dico = counts.to_dict()
dico

{'huile': 636,
 'carotte': 616,
 'chou': 554,
 'pois d': 476,
 'colza': 379,
 'grains de pois': 336,
 'légumes': 325,
 'maïs': 272,
 'plein': 254,
 'blé': 232,
 'vigne': 230,
 'plein champ': 220,
 'pois': 207,
 'graine': 198,
 'tomate': 177,
 'sec': 163,
 "huile essentielle d'orange douce": 156,
 'grain de pois': 140,
 'orge': 128,
 'choux': 128,
 'verte': 127,
 'poireau': 119,
 'feuilles du maïs': 117,
 'céleri': 116,
 'oignon': 115,
 'céréales': 110,
 'fleurs': 110,
 'bois': 102,
 'feuilles du colza': 99,
 'pavot': 99,
 'rose': 96,
 'tournesol': 91,
 'blé tendre': 87,
 'chou jean': 86,
 'chou bio': 86,
 'grande': 85,
 'feuille': 82,
 'baie': 80,
 'pousse': 77,
 'protéagineux': 76,
 'aubergine': 72,
 'blette': 72,
 'paille': 69,
 'tête': 69,
 'lin': 67,
 'colza de la surveillance biologique': 66,
 'grain de blé': 65,
 'pomme': 65,
 'asperge': 62,
 'ray': 56,
 'petit pois': 56,
 'massif': 56,
 'jardins': 56,
 'précédent colza': 55,
 'haricot': 54,
 'arbre': 52,
 'culture de haricot': 5

## Vespa

In [67]:
vespa_fcu_tomap, vespa_freq_tomap = read_fcu("../output/vespa-fcu-tomap.csv", "../resources/corpusVespa.html")


Total : 497 bsv dans le corpus



PrefLabel :

count      17356
unique       350
top       jardin
freq         759
Name: prefLabel, dtype: object


BSV :

count                  21149
unique                   788
top       BSV_legumes_11_002
freq                     672
Name: bsv, dtype: object

Top 10 labels :
                          prefLabel  count
0                            jardin    759
1                 jardins familiaux    656
2                           carotte    650
3  culture légumière de plein champ    559
4                              pois    445
5                             colza    377
6                        bigaradier    362
7                            oignon    356
8                           pommier    342
9                      protéagineux    276


In [68]:
vespa_fcu_tomap[:18024]['bsv'].describe()

count                  18024
unique                   432
top       BSV_legumes_11_002
freq                     672
Name: bsv, dtype: object

In [69]:
# save dictionary of extracted labels
label_counter = vespa_fcu_tomap[:18024]['prefLabel'].value_counts()
label_dict = label_counter.to_dict()
vespa_freq_tomap = pd.DataFrame.from_dict(label_dict,orient='index', columns=['count'])
vespa_freq_tomap = vespa_freq_tomap.reset_index().rename({'index':'prefLabel'}, axis = 'columns')
show_frequencies(vespa_freq_tomap)

True

In [70]:
# words that are in baseline vespa but not in tomap
show_differences(vespa_freq, vespa_freq_tomap)

True     177
False     19
Name: prefLabel, dtype: int64
              prefLabel  count
59               agrume     47
105        clémentinier     12
114              ananas     10
122            manguier      8
145              millet      5
147           pissenlit      4
149         mandarinier      4
152           noisetier      4
153              tangor      4
154          cassissier      4
157             gerbera      3
172             figuier      2
180             combava      1
182            phacélie      1
183  oranger du Mexique      1
184             arroche      1
187            échalion      1
191       bananier musa      1
193             chanvre      1


True

In [71]:
# words that are in tomap but not in baseline vespa 
show_differences(vespa_freq_tomap, vespa_freq)

True     177
False    113
Name: prefLabel, dtype: int64
                        prefLabel  count
6                      bigaradier    351
15         arbre fruitier à noyau    188
16        arbre à fruits à coques    180
19                    petits pois    157
23                  haricot mungo    149
..                            ...    ...
284         chicorée industrielle      1
285               trèfle de Perse      1
286                       vétiver      1
287  moutarde blanche oléagineuse      1
289                        manioc      1

[113 rows x 2 columns]


True

In [72]:
# concatenate frequence matrices and keep all values
result_vespa_tomap = pd.concat([vespa_freq_tomap.set_index('prefLabel').rename({'count': 'count_tomap'}, axis='columns'), vespa_freq.set_index('prefLabel').rename({'count': 'count_baseline'}, axis='columns')], axis=1, sort=False)

# if no values were not found, then it the value is set to 0
result_vespa_tomap = result_vespa_tomap.fillna(0).astype(int)

# show
result_vespa_tomap[:10]

Unnamed: 0,count_tomap,count_baseline
jardin,720,127
carotte,646,305
jardins familiaux,602,16
culture légumière de plein champ,528,5
pois,410,494
colza,364,696
bigaradier,351,0
oignon,324,236
pommier,314,742
protéagineux,224,117


In [75]:
scatter_matrix(result_vespa_tomap, list_of_dimensions = ["count_tomap", "count_baseline"])

True

In [76]:
result_vespa_tomap["count_tomap"].corr(result_vespa_tomap["count_baseline"], method ='pearson')

0.5499686604339064

In [77]:
vespa_fcu_tomap[:18024]["word"].describe()
counts = vespa_fcu_tomap[:18024]["word"].str.lower().value_counts()
dico = counts.to_dict()
dico

{'page': 1788,
 'carotte': 600,
 'vergers': 552,
 'chou': 496,
 'arbre': 364,
 'pois': 348,
 'colza': 344,
 'page 2': 342,
 'page 3': 306,
 'plein': 268,
 'jardins': 264,
 'fleurs': 252,
 'page 1': 234,
 'plein champ': 198,
 'blé': 195,
 'protéagineux': 180,
 'sec': 180,
 'maïs': 170,
 'pois destinés': 168,
 "pois destinés à l'alimentation": 168,
 'page 5': 153,
 'légumes': 140,
 'lin de printemps': 136,
 'céréales': 126,
 'pomme': 122,
 'graine': 120,
 'pommier': 119,
 'grande': 116,
 'page 6': 111,
 'bastian': 108,
 'orge': 105,
 'pousse': 104,
 'féverole': 102,
 'lin oléagineux nord': 102,
 'pomme de terre': 99,
 'oignon': 99,
 'oléagineux': 98,
 'lin': 97,
 'rose': 92,
 'poirier': 91,
 'verger': 91,
 'céleri': 90,
 'fleur': 85,
 'huile': 84,
 'pois de conserve': 84,
 'page 7': 84,
 "pois destinés à l'alimentation animale": 84,
 'pois de printemps semés': 84,
 'pois de printemps': 82,
 'page 4': 76,
 'choux': 75,
 'poireau': 71,
 'bois': 70,
 'jardin': 70,
 'vigne': 69,
 "lin d'hive

In [79]:
# ToMap D2KAB vs ToMap Vespa

# concatenate frequence matrices and keep all values
result_all_tomap = pd.concat([d2kab_freq_tomap.set_index('prefLabel').rename({'count': 'count_d2kab'}, axis='columns'), vespa_freq_tomap.set_index('prefLabel').rename({'count': 'count_vespa'}, axis='columns')], axis=1, sort=False)

# if no values were not found, then it the value is set to 0
result_all_tomap = result_all_tomap.fillna(0).astype(int)

# show
result_all_tomap[:10]
result_all_tomap["count_d2kab"].corr(result_all_tomap["count_vespa"], method ='pearson')

0.6804166821131811

# BM25

#### D2KAB

In [80]:
gc = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabGC.html")
maraichage = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabMaraichage.html")
viticulture = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabViti.html")
corpus_len = len(viticulture) + len(maraichage) + len(gc)

In [81]:
bm25_d2kab = pd.read_csv("../output/d2kab/scores/bm25_d2kab.csv", delimiter="\t", encoding='utf-8')
bm25_d2kab.columns = ['bsv', 'prefLabel', 'score']
bm25_bsv = bm25_d2kab['bsv'].tolist()
bm25_theme = []

for fic in bm25_bsv:
    fic = fic + ".html"
    if fic in gc:
        bm25_theme.append("GC")
    if fic in maraichage:
        bm25_theme.append("Maraichage")
    if fic in viticulture:
        bm25_theme.append("Viticulture")
        
bm25_d2kab['theme'] = bm25_theme
bm25_d2kab[:3]

Unnamed: 0,bsv,prefLabel,score,theme
0,20180911_bsv_grandes_cultures-26_cle0f15a8,fourrage annuel,6.527884,GC
1,20180911_bsv_grandes_cultures-26_cle0f15a8,prairie,5.230377,GC
2,20180911_bsv_grandes_cultures-26_cle0f15a8,tournesol,3.814877,GC


In [82]:
bm25_d2kab['prefLabel'].describe()

count      2078
unique      167
top       vigne
freq         83
Name: prefLabel, dtype: object

In [83]:
grouped = bm25_d2kab.groupby("theme")
for name,group in grouped:
    print(f"First 10 entries for {name!r}")
    print("------------------------")
    print(group.head(10), end="\n\n")

First 10 entries for 'GC'
------------------------
                                          bsv        prefLabel     score theme
0  20180911_bsv_grandes_cultures-26_cle0f15a8  fourrage annuel  6.527884    GC
1  20180911_bsv_grandes_cultures-26_cle0f15a8          prairie  5.230377    GC
2  20180911_bsv_grandes_cultures-26_cle0f15a8        tournesol  3.814877    GC
3  20180911_bsv_grandes_cultures-26_cle0f15a8           pêcher  3.560245    GC
4  20180911_bsv_grandes_cultures-26_cle0f15a8             orge  2.616416    GC
5  20180911_bsv_grandes_cultures-26_cle0f15a8             maïs  2.596026    GC
6  20180911_bsv_grandes_cultures-26_cle0f15a8              blé  2.375255    GC
7  20180911_bsv_grandes_cultures-26_cle0f15a8            colza  1.762106    GC
8  20180911_bsv_grandes_cultures-26_cle0f15a8          céréale  1.383466    GC
9  20181002_bsv_grandes_cultures_29_cle0423a6         géranium  6.890343    GC

First 10 entries for 'Maraichage'
------------------------
                    

#### VESPA

In [84]:
bm25_vespa = pd.read_csv("../output/vespa/scores/bm25_vespa.csv", delimiter="\t", encoding='utf-8')
bm25_vespa.columns = ['bsv', 'prefLabel', 'score']
bm25_vespa[:3]

Unnamed: 0,bsv,prefLabel,score
0,01F57Zd3vFWg1itsp44SIT8E,agrume,7.529179
1,01F57Zd3vFWg1itsp44SIT8E,arboriculture fruitière,2.496212
2,02_bsv_raisin_29032012_V3,vigne de table,9.543358


In [85]:
bm25_vespa['prefLabel'].describe()

count             3252
unique             196
top       floriculture
freq               150
Name: prefLabel, dtype: object

#### D2KAB vs VESPA

In [86]:
# agreggate
bm25_d2kab_aggreg = bm25_d2kab.groupby('prefLabel')['score'].aggregate(['min', np.mean, np.median, max])
bm25_vespa_aggreg = bm25_vespa.groupby('prefLabel')['score'].aggregate(['min', np.mean, np.median, max])

In [87]:
# concatenate scores
result_aggreg = pd.concat([bm25_d2kab_aggreg.rename({'min' : 'min_d2kab', 'mean': 'mean_d2kab', 'median' : 'median_d2kab', 'max' : 'max_d2kab'}, axis='columns'), 
                         bm25_vespa_aggreg.rename({'min' : 'min_vespa', 'mean': 'mean_vespa', 'median' : 'median_vespa', 'max' : 'max_vespa'}, axis='columns')], 
                        axis=1, sort=False)
# fill NaN with 0
result_aggreg = result_aggreg.fillna(0)
# show
result_aggreg[:3]

Unnamed: 0,min_d2kab,mean_d2kab,median_d2kab,max_d2kab,min_vespa,mean_vespa,median_vespa,max_vespa
Chou cabus blanc,2.225008,3.888231,4.226845,4.874226,6.300582,7.751765,7.721532,9.233179
abricotier,4.411773,5.203023,5.419163,5.696046,5.899739,6.928682,7.137679,7.78864
abricotier pays,6.310889,6.310889,6.310889,6.310889,0.0,0.0,0.0,0.0


In [88]:
fig = px.bar(result_aggreg, x=result_aggreg.index, y='mean_d2kab')
fig.show()

In [89]:
fig = go.Figure(data=[
    go.Bar(name='d2kab', x=result_aggreg.index, y=result_aggreg['mean_d2kab']),
    go.Bar(name='vespa', x=result_aggreg.index, y=result_aggreg['mean_vespa'])
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

#### Differences

In [90]:
# mean
scatter_matrix(df = result_aggreg, list_of_dimensions = ["mean_d2kab", "mean_vespa"])

True

In [91]:
# median
scatter_matrix(df = result_aggreg, list_of_dimensions = ["median_d2kab", "median_vespa"])

True

#### Similarities

In [92]:
# bm25 mean
cos_mean = cosine_similarity(result_aggreg["mean_d2kab"], result_aggreg["mean_vespa"])

0.7331602910187622


In [93]:
# bm25 median
cos_median = cosine_similarity(result_aggreg["median_d2kab"], result_aggreg["median_vespa"])

0.7423678823386226


# Other

In [None]:
fig = px.bar(result, x=result.index, y=["count_d2kab", "count_vespa"], title="Term Frequency")
fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1000,
    autosize=False)

fig.show()

In [None]:
fig = px.line(result, x=result.index, y=["count_d2kab", "count_vespa"], title='Term frequency')
fig.show()

In [None]:
fig = px.line(result_mean, x=result_mean.index, y=result_mean.columns, title='BM25')
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=result_mean.index,
    y=result_mean['d2kab'],
    name='D2KAB',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=result_mean.index,
    y=result_mean['vespa'],
    name='Vespa',
    marker_color='lightsalmon'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1500, height=1000,
    autosize=False)
fig.show()

In [None]:
fig = px.funnel(result_mean, x=result_mean.columns, y=result_mean.index)
fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1500,
    autosize=False)
fig.show()

In [None]:
fig = px.bar(result_mean, x=result_mean.index, y=result_mean.columns, title='BM25')

fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1000,
    autosize=False, barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
#zipf law :
#the most frequent word occurs twice as often as the second most frequent work
#three times as often as the subsequent word, and so on until the least frequent word

import matplotlib.pyplot as plt
from scipy.stats import zipf

inv_map = {v: k for k, v in counts_d2kab.items()}
lst = list(inv_map.items())

plt.bar([key for val, key in lst], [val for val, key in lst], color='limegreen')
alpha = 1.37065874
total = sum([p for p, c in lst])
plt.plot(range(len(lst)), [zipf.pmf(p, alpha) * total for p in range(1, len(lst) + 1)], color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})
# plot has more area below the expected zipf curve in higher rank words, and inverse for those that are more important
plt.show()

In [None]:
# At the extreme right end of the plotted line, there are apparent steps, 
# which means that a lot of words have quite low occurences. 
# At the extreme left end, the line is quite flat, 
# which means the frequencies of the first few most frequent words are close that they do not drop as quickly as the ones ranking in the middle.
    
zipfFit(counts_d2kab, 'd2kab', toPlot=True, pr=True, ret=False)

In [None]:
def zipfFit(fdist, name, toPlot=False, pr=False, ret=True):
    fsort_tuple = sorted(fdist.items(), key=operator.itemgetter(1),
    reverse=True)
    y_vals = np.array([t[1] for t in fsort_tuple])
    x_vals = np.array(range(1, len(y_vals) + 1))
    if toPlot:
        plt.title(name)
        plt.rcParams['figure.figsize'] = (50.0, 50.0)
        plt.plot([np.log(x) for x in x_vals], 
            [np.log(y) for y in y_vals], 'ro')
        plt.xlabel('log(rank)')
        plt.ylabel('log(count)')
        plt.show()
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        np.log(x_vals.astype(float)),
        np.log(y_vals.astype(float)))
    if pr:
        print("log-log r-squared for " + name + ":", r_value**2)
    if ret:
        return([slope, intercept, r_value**2])

In [None]:
inv_map = {v: k for k, v in counts_vespa.items()}
lst = list(inv_map.items())

plt.bar([key for val, key in lst], [val for val, key in lst], color='limegreen')
alpha = 1.37065874
total = sum([p for p, c in lst])
plt.plot(range(len(lst)), [zipf.pmf(p, alpha) * total for p in range(1, len(lst) + 1)], color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})
plt.show()
# At the extreme right end of the plotted line, there are apparent steps, 
# which means that a lot of words have quite low occurences. 
# At the extreme left end, the line is quite flat, 
# which means the frequencies of the first few most frequent words are close that they do not drop as quickly as the ones ranking in the middle.

    
zipfFit(counts_vespa, 'vespa', toPlot=True, pr=True, ret=False)

In [None]:
#inv_map1 = {v: k for k, v in counts_d2kab.items()}
alpha = 1.37065874
inv_map1 = dict(zip(result.d2kab, result.prefLabel))
lst1 = list(inv_map1.items())
plt.bar([key for val, key in lst1], [val for val, key in lst1], color='limegreen')
total1 = sum([p for p, c in lst1])
plt.plot(range(len(lst1)), [zipf.pmf(p, alpha) * total1 for p in range(1, len(lst1) + 1)], color='cyan', lw=3)



#inv_map2 = {v: k for k, v in counts_vespa.items()}
inv_map2 = dict(zip(result.vespa, result.prefLabel))
lst2 = list(inv_map2.items())
plt.bar([key for val, key in lst2], [val for val, key in lst2], color='red')
total2 = sum([p for p, c in lst2])
plt.plot(range(len(lst2)), [zipf.pmf(p, alpha) * total2 for p in range(1, len(lst2) + 1)], color='crimson', lw=3)

plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})
plt.show()