In [1]:
# standard library
import os
import re, operator
from operator import itemgetter
from itertools import islice

# data handling
import pandas as pd
import numpy as np

# stats
from scipy import stats
from scipy import special
from scipy.spatial.distance import cosine
from scipy.stats import zipf

# machine learning
from sklearn.decomposition import PCA

# dataviz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots




In [2]:
def read_fcu(filename, corpus_name):
    '''
    Load an extraction file and summarise how often each label occurs.

    Prints the number of files found under the corpus directory, a
    `describe()` summary of the `prefLabel` and `bsv` columns, and the ten
    most frequent labels.

    INPUT :
        filename    -- path of a tab-separated, UTF-8 file with a header row
                       that contains at least the columns `prefLabel` and `bsv`
        corpus_name -- directory walked recursively; every file below it is
                       counted (no filtering on name or extension)
    OUTPUT :
        (df, df_freq) -- the dataframe read from `filename`, and a dataframe
                         with the columns `prefLabel` and `count`, ordered by
                         decreasing count
    '''

    # corpus description: count every file below the corpus directory
    print("========> Counting bsv..")
    total = sum(len(files) for _, _, files in os.walk(corpus_name))
    print("\nTotal : {} bsv dans le corpus".format(total))

    # create df
    print("\n========> Creating a dataframe..")
    df = pd.read_csv(filename, delimiter="\t", encoding='utf-8')

    # short descriptions
    print("\n========> Making short descriptions..")
    print("\nPrefLabel :\n")
    print(df['prefLabel'].describe())
    print("\n\nBSV :\n")
    print(df['bsv'].describe())

    # label frequencies as a two-column frame; naming the axis and the values
    # explicitly keeps the column names stable across pandas versions
    df_freq = (
        df['prefLabel']
        .value_counts()
        .rename_axis('prefLabel')
        .reset_index(name='count')
    )
    print("\nTop 10 labels :\n{}".format(df_freq[:10]))

    return df, df_freq



def show_frequencies(df_freq):
    '''
    Plot label frequencies as a scatter plot and as a treemap.

    INPUT :
        df_freq -- dataframe with the columns `prefLabel` and `count`
    OUTPUT :
        True, once both figures have been shown
    '''

    # scatter graph: one point per label
    fig = px.scatter(df_freq, x='prefLabel', y='count')
    fig.show()

    # tree graph: `path` lists the hierarchy columns. Passing the dataframe
    # itself would be read as all of its column names, turning `count` into
    # a hierarchy level as well as the tile size.
    fig = px.treemap(df_freq, path=['prefLabel'], values='count')
    fig.show()

    return True




def show_differences(df1, df2):
    '''
    Report which `prefLabel` values of df1 are absent from df2.

    Prints the True/False tally of "label of df1 also occurs in df2",
    followed by the rows of df1 whose label does not occur in df2.

    INPUT :
        df1, df2 -- dataframes that both have a `prefLabel` column
    OUTPUT :
        True
    '''

    # True where the label of the df1 row is also present in df2
    found_in_df2 = df1['prefLabel'].isin(df2['prefLabel'])

    print(found_in_df2.value_counts())
    print(df1[~found_in_df2])

    return True



def scatter_matrix(df, list_of_dimensions):
    '''
    Show a plotly scatter matrix: one scatter plot for every pair of the
    requested dimensions, with points coloured by the dataframe index.

    INPUT :
        df                 -- dataframe holding the data to plot
        list_of_dimensions -- columns of `df` to use as matrix dimensions
    OUTPUT :
        True, once the figure has been shown
    '''

    # axis titles: column labels with underscores replaced by spaces;
    # str() keeps non-string column labels from raising AttributeError
    readable_labels = {col: str(col).replace('_', ' ') for col in df.columns}

    fig = px.scatter_matrix(
        df,
        dimensions=list_of_dimensions,
        color=df.index,
        title="Scatter matrix",
        labels=readable_labels,
    )
    fig.update_traces(diagonal_visible=True)
    fig.show()
    return True



def cosine_similarity(col1, col2):
    '''
    Cosine similarity of two vectors, computed as one minus SciPy's cosine
    distance: vectors pointing the same way give 1, orthogonal vectors 0,
    and opposite vectors -1.

    INPUT :
        col1, col2 -- the two vectors to compare, as accepted by
                      `scipy.spatial.distance.cosine`
    OUTPUT :
        the similarity, which is also printed
    '''

    distance = cosine(col1, col2)
    similarity = 1 - distance
    print(similarity)
    return similarity


def load_features(filepath, verbose=True):
    '''
    Read a (document, label, score) file and reshape it into a
    label x document score matrix.

    INPUT :
        filepath -- tab-separated, UTF-8 file with three columns read as
                    `bsv`, `prefLabel` and `score`; the column names are
                    supplied here, so the first line of the file is data
        verbose  -- when True (default, the historical behaviour) print every
                    document name followed by its rows; pass False to keep
                    the notebook output short
    OUTPUT :
        dataframe indexed by `prefLabel`, with one column per `bsv`, holding
        the score of each pair and 0 where the file has no row for the pair
    '''
    # read csv file
    df = pd.read_csv(filepath, sep='\t', encoding='utf8', names=['bsv', 'prefLabel', 'score'])

    if verbose:
        for name, group in df.groupby("bsv"):
            print(name)
            print(group)

    # reshape to labels (rows) x documents (columns); pairs missing from the
    # file come out of the pivot as NaN and are replaced by 0
    features = df.pivot(index='prefLabel', columns='bsv', values='score').fillna(0)

    return features

# Titles vs bm25 scores

In [25]:
# Load the baseline extraction table (`fcu`) and its per-label frequency table
# (`fcu_freq`); the second argument is the corpus directory whose files are counted.
fcu, fcu_freq  = read_fcu("../output/d2kab/fcu-baseline.csv", "../resources/Corpus-v-12-04-21/train/")


Total : 230 bsv dans le corpus



PrefLabel :

count      7861
unique      162
top       vigne
freq       1051
Name: prefLabel, dtype: object


BSV :

count                                                  7861
unique                                                  230
top       BSV_legumes_allium_pomme_de_terre_AURA_2019-14...
freq                                                    191
Name: bsv, dtype: object

Top 10 labels :
           prefLabel  count
0              vigne   1051
1              colza    654
2               chou    369
3               maïs    360
4             tomate    346
5    arbuste à baies    286
6            carotte    254
7  culture légumière    237
8         maraîchage    219
9            poireau    204


In [87]:
# Flag every row whose `html tag` starts with "H" (presumably heading tags such
# as H1/H2 - confirm against the extraction output). The flag is stored as the
# strings 'true'/'false' because later cells compare against those strings.
in_heading = fcu['html tag'].str.startswith('H', na=False)
fcu['header'] = in_heading.map({True: 'true', False: 'false'})

# despite its name, `header_true` keeps ALL rows, restricted to four columns
header_true = fcu[['bsv', 'number of words', 'prefLabel', 'header']]
header_true

Unnamed: 0,bsv,number of words,prefLabel,header
0,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
1,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,maïs,false
2,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
3,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
4,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
...,...,...,...,...
7856,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7857,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7858,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7859,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false


In [104]:
# One score per (bsv, prefLabel) row of the baseline file; column names are
# supplied explicitly through `names`.
scores = pd.read_csv(
    "../output/d2kab/bm25-baseline.csv",
    sep='\t',
    encoding='utf8',
    names=['bsv', 'prefLabel', 'score'],
)

# inner join: attach the score to every occurrence row with a matching pair
result = header_true.merge(scores, on=["bsv", "prefLabel"])
result

Unnamed: 0,bsv,number of words,prefLabel,header,score
0,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
1,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
2,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
3,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
4,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
...,...,...,...,...,...
7856,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,céréale à paille,false,2.865633
7857,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499
7858,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499
7859,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499


In [114]:
# For every (bsv, prefLabel, score) triple, count the occurrences per header
# flag. unstack/stack materialises the flag values that never occur for a
# triple as explicit rows with count 0.
pair_keys = ['bsv', 'prefLabel', 'score']
test = (
    result
    .groupby(pair_keys + ['header'])
    .size()
    .unstack(fill_value=0)
    .stack()
    .rename('count')
    .reset_index()
)

# occurrences of the triple over all header flags, and each flag's share in %
test['total'] = test.groupby(pair_keys)['count'].transform('sum')
test['percentage'] = test['count'] / test['total'] * 100
test


Unnamed: 0,bsv,prefLabel,score,header,count,total,percentage
0,20180802_bsvmaraichage_19_cle04c2cf,asperge,3.886483,false,2,2,100.000000
1,20180802_bsvmaraichage_19_cle04c2cf,asperge,3.886483,true,0,2,0.000000
2,20180802_bsvmaraichage_19_cle04c2cf,aubergine,3.543583,false,1,3,33.333333
3,20180802_bsvmaraichage_19_cle04c2cf,aubergine,3.543583,true,2,3,66.666667
4,20180802_bsvmaraichage_19_cle04c2cf,carotte,3.288598,false,4,7,57.142857
...,...,...,...,...,...,...,...
3843,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,pomme de terre potagère,6.427251,true,0,1,0.000000
3844,ge_houblon_no6_du_19-06-19_cle41558d,floriculture,1.904196,false,1,1,100.000000
3845,ge_houblon_no6_du_19-06-19_cle41558d,floriculture,1.904196,true,0,1,0.000000
3846,ge_houblon_no6_du_19-06-19_cle41558d,houblon,10.288002,false,3,4,75.000000


In [184]:
# mean score and mean share for the rows flagged as header occurrences
df1 = test.query('header == "true"').sort_values(by=['score', 'percentage'])
df1[["score", "percentage"]].mean()


score          3.441290
percentage    16.970755
dtype: float64

In [162]:
# mean score and mean share for everything else (rows not flagged as header)
df2 = test.query('header == "false"')
df2[["score", "percentage"]].mean()

score          3.441290
percentage    83.029245
dtype: float64

In [241]:
# Score against share of occurrences, one trace per header flag.
fig = go.Figure()

fig.add_trace(go.Scatter(mode="markers", x=df1["percentage"], y=df1["score"], name="header"))
fig.add_trace(go.Scatter(mode="markers", x=df2["percentage"], y=df2["score"], name="not header"))
# On a log axis `range` is expressed in exponents: [0, 2] spans 10^0 = 1 to 10^2 = 100.
# NOTE(review): rows with percentage == 0 have no position on a log axis and
# should therefore be missing from the figure - confirm this is intended.
fig.update_xaxes(type="log", range=[0, 2], title_text="percentage")
fig.update_yaxes(range=[0, 15], title_text="score")  # linear range
fig.show()

In [206]:
# 3-D overview of `test`: label x document x score, marker size taken from
# `percentage` and colour from the `header` flag
fig = px.scatter_3d(
    test,
    x='prefLabel',
    y='bsv',
    z='score',
    size='percentage',
    color='header',
)
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})  # no outer margins
fig.show()

# BM25

## D2KAB

In [265]:
# Label x document score matrix for the D2KAB output.
# NOTE: `bm25` is rebound by the VESPA and ALEA sections further down; rerun
# this cell before rerunning the D2KAB PCA cells.
bm25 = load_features("../output/d2kab/bm25.csv")

20180802_bsvmaraichage_19_cle04c2cf
                                     bsv   prefLabel     score
757  20180802_bsvmaraichage_19_cle04c2cf    roquette  8.094595
758  20180802_bsvmaraichage_19_cle04c2cf       radis  5.283447
759  20180802_bsvmaraichage_19_cle04c2cf      panais  5.268137
760  20180802_bsvmaraichage_19_cle04c2cf       noyer  4.467411
761  20180802_bsvmaraichage_19_cle04c2cf   concombre  4.452254
762  20180802_bsvmaraichage_19_cle04c2cf      laitue  4.400949
763  20180802_bsvmaraichage_19_cle04c2cf    fraisier  4.314839
764  20180802_bsvmaraichage_19_cle04c2cf       mâche  4.058745
765  20180802_bsvmaraichage_19_cle04c2cf     asperge  3.943059
766  20180802_bsvmaraichage_19_cle04c2cf       melon  3.663444
767  20180802_bsvmaraichage_19_cle04c2cf   aubergine  3.582590
768  20180802_bsvmaraichage_19_cle04c2cf     poireau  3.343956
769  20180802_bsvmaraichage_19_cle04c2cf     carotte  3.306926
770  20180802_bsvmaraichage_19_cle04c2cf      oignon  3.262259
771  20180802_bsvma

                                        bsv          prefLabel     score
1701  bsv_maraichage_n15_23082018_cle0775f5             céleri  3.728058
1702  bsv_maraichage_n15_23082018_cle0775f5            carotte  3.598586
1703  bsv_maraichage_n15_23082018_cle0775f5         maraîchage  3.576431
1704  bsv_maraichage_n15_23082018_cle0775f5            poireau  3.446529
1705  bsv_maraichage_n15_23082018_cle0775f5             oignon  3.256465
1706  bsv_maraichage_n15_23082018_cle0775f5             salade  3.125221
1707  bsv_maraichage_n15_23082018_cle0775f5               chou  2.521732
1708  bsv_maraichage_n15_23082018_cle0775f5  culture légumière  2.312222
1709  bsv_maraichage_n15_23082018_cle0775f5       floriculture  1.716905
bsv_maraichage_n19_10102019_cle038cd4
                                        bsv   prefLabel     score
1710  bsv_maraichage_n19_10102019_cle038cd4     carotte  3.586800
1711  bsv_maraichage_n19_10102019_cle038cd4      céleri  3.582231
1712  bsv_maraichage_n19_10102019_

In [266]:
# Cumulative share of variance explained as principal components are added
# (D2KAB matrix). PCA is imported with the other dependencies in the first cell.
pca = PCA().fit(bm25)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
    title="D2KAB - cumulative explained variance",
)

In [267]:
# random_state pins the projection in case scikit-learn's automatic solver
# choice falls on the randomized SVD, so reruns give the same figure.
pca = PCA(n_components=50, random_state=0)
components = pca.fit_transform(bm25)

# only the first two components are drawn; one point per label (the matrix index)
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=bm25.index,
    title="D2KAB - first two principal components",
)

fig.show()

## VESPA

In [259]:
# Label x document score matrix for the VESPA output.
# NOTE: this rebinds `bm25`, replacing the matrix loaded in the previous section.
bm25 = load_features("../output/vespa/bm25.csv")

01F57Zd3vFWg1itsp44SIT8E
                        bsv                prefLabel     score
0  01F57Zd3vFWg1itsp44SIT8E             clémentinier  8.964703
1  01F57Zd3vFWg1itsp44SIT8E                   agrume  7.507696
2  01F57Zd3vFWg1itsp44SIT8E                tournesol  3.453663
3  01F57Zd3vFWg1itsp44SIT8E  arboriculture fruitière  2.483649
4  01F57Zd3vFWg1itsp44SIT8E        culture fruitière  2.352980
02_bsv_raisin_29032012_V3
                         bsv          prefLabel     score
5  02_bsv_raisin_29032012_V3     vigne de table  9.560080
6  02_bsv_raisin_29032012_V3              vigne  3.558094
7  02_bsv_raisin_29032012_V3  culture fruitière  2.352980
04_bsv_raisin_12042012_V3
                          bsv          prefLabel     score
8   04_bsv_raisin_12042012_V3     vigne de table  9.381179
9   04_bsv_raisin_12042012_V3            semence  3.968530
10  04_bsv_raisin_12042012_V3              vigne  3.516966
11  04_bsv_raisin_12042012_V3  zone non agricole  3.415090
12  04_bsv_raisin_

1432  2.709703  
BSV_Legumes_n_09_du_20_decembre_2012
                                       bsv           prefLabel     score
1433  BSV_Legumes_n_09_du_20_decembre_2012  betterave potagère  7.449923
1434  BSV_Legumes_n_09_du_20_decembre_2012              oignon  5.637833
1435  BSV_Legumes_n_09_du_20_decembre_2012               radis  5.253666
1436  BSV_Legumes_n_09_du_20_decembre_2012       navet potager  4.766200
1437  BSV_Legumes_n_09_du_20_decembre_2012   culture légumière  4.164248
1438  BSV_Legumes_n_09_du_20_decembre_2012      pomme de terre  4.032472
1439  BSV_Legumes_n_09_du_20_decembre_2012              poirée  3.977523
1440  BSV_Legumes_n_09_du_20_decembre_2012             carotte  3.460076
1441  BSV_Legumes_n_09_du_20_decembre_2012             céréale  2.755995
1442  BSV_Legumes_n_09_du_20_decembre_2012             pommier  2.587137
BSV_Legumes_no13ERRATUM_cle816453
                                    bsv                      prefLabel  \
1443  BSV_Legumes_no13ERRATUM_cle81

2990  bsv_12_gc_10_mai_12_cle8df51d                    floriculture  1.257166
bsv_13_pdt_25_juin_13_cle453463
                                  bsv                 prefLabel     score
2991  bsv_13_pdt_25_juin_13_cle453463  pomme de terre féculière  7.528596
2992  bsv_13_pdt_25_juin_13_cle453463                  roquette  6.653877
2993  bsv_13_pdt_25_juin_13_cle453463            pomme de terre  3.606674
2994  bsv_13_pdt_25_juin_13_cle453463                       blé  2.804930
bsv_2014_arbo_39_cle4cea53
                             bsv                 prefLabel     score
2995  bsv_2014_arbo_39_cle4cea53                abricotier  5.774403
2996  bsv_2014_arbo_39_cle4cea53               framboisier  5.520812
2997  bsv_2014_arbo_39_cle4cea53                maraîchage  4.438903
2998  bsv_2014_arbo_39_cle4cea53                   prunier  4.378514
2999  bsv_2014_arbo_39_cle4cea53                  fraisier  4.104452
3000  bsv_2014_arbo_39_cle4cea53                   poirier  3.935048
3001  bsv_

In [260]:
# Cumulative share of variance explained as principal components are added
# (VESPA matrix). PCA is imported with the other dependencies in the first cell.
pca = PCA().fit(bm25)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
    title="VESPA - cumulative explained variance",
)

In [264]:
# random_state pins the projection in case scikit-learn's automatic solver
# choice falls on the randomized SVD, so reruns give the same figure.
pca = PCA(n_components=100, random_state=0)
components = pca.fit_transform(bm25)

# only the first two components are drawn; one point per label (the matrix index)
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=bm25.index,
    title="VESPA - first two principal components",
)

fig.show()

## ALEA

In [268]:
# Label x document score matrix for the ALEA output.
# NOTE: this rebinds `bm25`, replacing the matrix loaded in the previous section.
bm25 = load_features("../output/alea/bm25.csv")

01_TB_Als_cle8c6253
                   bsv         prefLabel     score
0  01_TB_Als_cle8c6253        miscanthus  7.780013
1  01_TB_Als_cle8c6253             tabac  5.888391
2  01_TB_Als_cle8c6253  céréale à paille  3.328540
05_GC_Als_cle8c3f61
                   bsv         prefLabel     score
3  05_GC_Als_cle8c3f61              orge  3.699944
4  05_GC_Als_cle8c3f61  céréale à paille  3.699944
5  05_GC_Als_cle8c3f61           poireau  3.586775
6  05_GC_Als_cle8c3f61              maïs  3.147835
7  05_GC_Als_cle8c3f61           céréale  2.773716
8  05_GC_Als_cle8c3f61               blé  2.660163
9  05_GC_Als_cle8c3f61             colza  2.067062
05bsv_jevi_20190502_cle8f22b6
                              bsv                      prefLabel     score
10  05bsv_jevi_20190502_cle8f22b6              jardins familiaux  5.337574
11  05bsv_jevi_20190502_cle8f22b6                         rosier  5.282588
12  05bsv_jevi_20190502_cle8f22b6                           fève  4.591436
13  05bsv_jevi_201

In [269]:
# Cumulative share of variance explained as principal components are added
# (ALEA matrix). PCA is imported with the other dependencies in the first cell.
pca = PCA().fit(bm25)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
    title="ALEA - cumulative explained variance",
)

In [270]:
# random_state pins the projection in case scikit-learn's automatic solver
# choice falls on the randomized SVD, so reruns give the same figure.
pca = PCA(n_components=40, random_state=0)
components = pca.fit_transform(bm25)

# only the first two components are drawn; one point per label (the matrix index)
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=bm25.index,
    title="ALEA - first two principal components",
)

fig.show()