In [1]:
import os
import re, operator
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd

# stats

from scipy import stats
from scipy import special
from scipy.spatial.distance import cosine
from scipy.stats import zipf

# machine learning
from sklearn.decomposition import PCA

# dataviz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots




In [2]:
def read_fcu(filename, corpus_name):

    '''
    Load an annotation table and build its label frequency table,
    printing a short report along the way.

    INPUT :
        filename    - path of a tab-separated, UTF-8 file whose header row
                      contains at least the 'prefLabel' and 'bsv' columns
        corpus_name - directory walked recursively; every file found in it
                      is counted and the total is printed
    OUTPUT :
        (df, df_freq) - df is the table read from filename;
                        df_freq has one row per distinct prefLabel with the
                        columns 'prefLabel' and 'count', most frequent first
    '''

    # corpus description: number of files below corpus_name, sub-folders included
    print("========> Counting bsv..")
    total = sum(len(files) for _, _, files in os.walk(corpus_name))
    print("\nTotal : {} bsv dans le corpus".format(total))

    # create df
    print("\n========> Creating a dataframe..")
    df = pd.read_csv(filename, sep="\t", encoding='utf-8')

    # short descriptions
    print("\n========> Making short descriptions..")
    for heading, column in (("\nPrefLabel :\n", 'prefLabel'), ("\n\nBSV :\n", 'bsv')):
        print(heading)
        print(df[column].describe())

    # frequency of every extracted label, as a two-column frame
    df_freq = (
        df['prefLabel']
        .value_counts()
        .rename_axis('prefLabel')
        .reset_index(name='count')
    )
    print("\nTop 10 labels :\n{}".format(df_freq.head(10)))

    return df, df_freq



def show_frequencies(df_freq):
    
    '''
    Display a label frequency table as a scatter plot and as a treemap.

    INPUT :
        df_freq - DataFrame with a 'prefLabel' column (label) and a
                  'count' column (number of occurrences of that label)
    OUTPUT :
        True, once both figures have been shown
    '''
    
    # show scatter graph: one point per label, label on x and count on y
    fig = px.scatter(df_freq, x='prefLabel', y='count')
    fig.show()
    
    # show tree graph: one tile per label, sized by its count.
    # `path` takes the column names that form the hierarchy; passing the whole
    # frame made both 'prefLabel' and 'count' hierarchy levels, so 'count'
    # was used as a nesting level as well as the tile value.
    fig = px.treemap(df_freq, path=['prefLabel'], values='count')
    fig.show()
    
    return True




def show_differences(df1, df2):
    
    '''
    Print how the labels of df1 compare with the labels of df2.

    INPUT :
        df1, df2 - DataFrames that both have a 'prefLabel' column
    OUTPUT :
        True; as a side effect, prints
          1. how many prefLabel values of df1 are / are not present in df2
          2. the rows of df1 whose prefLabel is absent from df2
    '''
    
    # True where the label of df1 also exists in df2
    shared = df1['prefLabel'].isin(df2['prefLabel'])

    print(shared.value_counts())
    print(df1[~shared])
    
    return True



def scatter_matrix(df, list_of_dimensions):
    
    '''
    Show a plotly scatter matrix: one scatter plot for every pair of the
    requested columns, to inspect how the variables relate to each other.

    INPUT :
        df                 - DataFrame holding the data; each point is
                             coloured by its index value
        list_of_dimensions - names of the columns of df to plot against
                             one another
    OUTPUT :
        True, once the figure has been shown
    '''
    
    # axis labels: column names with underscores turned into spaces
    axis_labels = {}
    for column in df.columns:
        axis_labels[column] = column.replace('_', ' ')

    figure = px.scatter_matrix(
        df,
        dimensions=list_of_dimensions,
        color=df.index,
        title="Scatter matrix",
        labels=axis_labels,
    )
    # keep the diagonal cells (a variable against itself) visible
    figure.update_traces(diagonal_visible=True)
    figure.show()
    return True



def cosine_similarity(col1, col2):
    '''
    Cosine similarity of two vectors, i.e. the cosine of the angle between them:
    close to 1 when they point the same way, 0 when they are orthogonal and
    close to -1 when they point in opposite directions.

    INPUT :
        col1, col2 - two 1-D sequences of numbers of the same length
    OUTPUT :
        the similarity, which is also printed
    '''
    
    # scipy's `cosine` is a distance (1 - similarity), so invert it back
    distance = cosine(col1, col2)
    similarity = 1 - distance
    print(similarity)
    return similarity


def load_features(filepath, verbose=True):
    '''
    Read a (bsv, prefLabel, score) table and reshape it into a
    label x document score matrix.

    INPUT :
        filepath - path of a tab-separated, UTF-8 file; its columns are
                   named 'bsv', 'prefLabel' and 'score' in that order
                   (the names are supplied here, not read from the file)
        verbose  - when True (default, the historical behaviour) print the
                   rows of every bsv; pass False to keep the output short
    OUTPUT :
        DataFrame indexed by prefLabel with one column per bsv; a cell holds
        the score of that label in that bsv, or 0 when the pair is absent
    RAISES :
        ValueError (from DataFrame.pivot) when the same (prefLabel, bsv)
        pair occurs on more than one row
    '''
    # read csv file
    df = pd.read_csv(filepath, sep = '\t', encoding = 'utf8', names = ['bsv', 'prefLabel', 'score'])

    # optional dump of the rows, one bsv at a time
    if verbose:
        for name, group in df.groupby("bsv"):
            print(name)
            print(group)

    # reshape the long table into a prefLabel x bsv matrix of scores
    features = df.pivot(index='prefLabel', columns='bsv', values='score')
    # a (label, bsv) pair missing from the file has no score: replace NaN with 0
    features = features.fillna(0)

    return features

# Titles vs bm25 scores

In [25]:
# Baseline annotations and the frequency of each label they contain
fcu, fcu_freq = read_fcu(
    filename="../output/d2kab/fcu-baseline.csv",
    corpus_name="../resources/Corpus-v-12-04-21/train/",
)


Total : 230 bsv dans le corpus



PrefLabel :

count      7861
unique      162
top       vigne
freq       1051
Name: prefLabel, dtype: object


BSV :

count                                                  7861
unique                                                  230
top       BSV_legumes_allium_pomme_de_terre_AURA_2019-14...
freq                                                    191
Name: bsv, dtype: object

Top 10 labels :
           prefLabel  count
0              vigne   1051
1              colza    654
2               chou    369
3               maïs    360
4             tomate    346
5    arbuste à baies    286
6            carotte    254
7  culture légumière    237
8         maraîchage    219
9            poireau    204


In [87]:
# 'header' is the string 'true' when the 'html tag' value starts with 'H'
# (presumably heading tags such as H1, H2 - confirm against the data) and
# 'false' otherwise, missing tags included
fcu['header'] = fcu['html tag'].str.startswith('H', na=False).map({True: 'true', False: 'false'})

# keep only the columns needed for the comparison with the scores
header_true = fcu[['bsv', 'number of words', 'prefLabel', 'header']]
header_true

Unnamed: 0,bsv,number of words,prefLabel,header
0,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
1,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,maïs,false
2,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
3,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
4,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false
...,...,...,...,...
7856,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7857,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7858,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false
7859,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false


In [104]:
# BM25 scores; the column names are supplied here rather than read from the file
scores = pd.read_csv(
    "../output/d2kab/bm25-baseline.csv",
    sep='\t',
    encoding='utf8',
    names=['bsv', 'prefLabel', 'score'],
)

# attach the score of its (bsv, prefLabel) pair to every annotation row
result = header_true.merge(scores, on=["bsv", "prefLabel"])
result

Unnamed: 0,bsv,number of words,prefLabel,header,score
0,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
1,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
2,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
3,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
4,20180911_bsv_grandes_cultures-26_cle0f15a8,2432,colza,false,1.760917
...,...,...,...,...,...
7856,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,céréale à paille,false,2.865633
7857,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499
7858,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499
7859,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,4306,gel,false,7.465499


In [114]:
# One row per (bsv, prefLabel, score, header) combination with its number of
# annotations; unstack/stack makes sure both header values exist for every
# (bsv, prefLabel, score), with a count of 0 when one of them never occurs.
pair_keys = ['bsv', 'prefLabel', 'score']
test = (
    result.groupby(pair_keys + ['header'])
    .size()
    .unstack(fill_value=0)
    .stack()
    .reset_index(name='count')
)

# annotations of the pair regardless of header, then the share of each header value
test['total'] = test.groupby(pair_keys)['count'].transform('sum')
test['percentage'] = test['count'] / test['total'] * 100
test


Unnamed: 0,bsv,prefLabel,score,header,count,total,percentage
0,20180802_bsvmaraichage_19_cle04c2cf,asperge,3.886483,false,2,2,100.000000
1,20180802_bsvmaraichage_19_cle04c2cf,asperge,3.886483,true,0,2,0.000000
2,20180802_bsvmaraichage_19_cle04c2cf,aubergine,3.543583,false,1,3,33.333333
3,20180802_bsvmaraichage_19_cle04c2cf,aubergine,3.543583,true,2,3,66.666667
4,20180802_bsvmaraichage_19_cle04c2cf,carotte,3.288598,false,4,7,57.142857
...,...,...,...,...,...,...,...
3843,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,pomme de terre potagère,6.427251,true,0,1,0.000000
3844,ge_houblon_no6_du_19-06-19_cle41558d,floriculture,1.904196,false,1,1,100.000000
3845,ge_houblon_no6_du_19-06-19_cle41558d,floriculture,1.904196,true,0,1,0.000000
3846,ge_houblon_no6_du_19-06-19_cle41558d,houblon,10.288002,false,3,4,75.000000


In [140]:
# keep the rows whose 'header' value is the string "true"
df = test.loc[test['header'] == "true"]
df

Unnamed: 0,bsv,prefLabel,score,header,count,total,percentage
1,20180802_bsvmaraichage_19_cle04c2cf,asperge,3.886483,true,0,2,0.000000
3,20180802_bsvmaraichage_19_cle04c2cf,aubergine,3.543583,true,2,3,66.666667
5,20180802_bsvmaraichage_19_cle04c2cf,carotte,3.288598,true,3,7,42.857143
7,20180802_bsvmaraichage_19_cle04c2cf,chou,2.352617,true,2,6,33.333333
9,20180802_bsvmaraichage_19_cle04c2cf,ciboulette,2.932009,true,0,1,0.000000
...,...,...,...,...,...,...,...
3839,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,luzerne,6.699416,true,2,10,20.000000
3841,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,pomme de terre,3.761328,true,0,2,0.000000
3843,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,pomme de terre potagère,6.427251,true,0,1,0.000000
3845,ge_houblon_no6_du_19-06-19_cle41558d,floriculture,1.904196,true,0,1,0.000000


In [144]:
# 3D view of the table: label x document x score, marker size = percentage,
# marker colour = header value
fig = px.scatter_3d(
    test,
    x='prefLabel',
    y='bsv',
    z='score',
    size='percentage',
    color='header',
)
# no margin around the plot
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

## Filtered ToMap

In [146]:
# label x document matrix of the scores stored in the ToMap file
bm25 = load_features(filepath="../output/d2kab/bm25-tomap.csv")
bm25

20180911_bsv_grandes_cultures-26_cle0f15a8
                                          bsv       prefLabel     score
0  20180911_bsv_grandes_cultures-26_cle0f15a8  grande culture  1.570518
20180920_bsvmaraichage_cle0649bf
                                  bsv          prefLabel     score
240  20180920_bsvmaraichage_cle0649bf             tomate  5.302501
241  20180920_bsvmaraichage_cle0649bf  culture légumière  3.667930
20181002_bsv_grandes_cultures_29_cle0423a6
                                          bsv         prefLabel     score
1  20181002_bsv_grandes_cultures_29_cle0423a6  céréale à paille  3.376802
2  20181002_bsv_grandes_cultures_29_cle0423a6    grande culture  1.553544
20181011_bsvmaraichage_27_cle09c363
                                     bsv          prefLabel     score
242  20181011_bsvmaraichage_27_cle09c363        céleri-rave  7.946300
243  20181011_bsvmaraichage_27_cle09c363             tomate  4.662793
244  20181011_bsvmaraichage_27_cle09c363  culture légumière  3.65394

bsv,20180911_bsv_grandes_cultures-26_cle0f15a8,20180920_bsvmaraichage_cle0649bf,20181002_bsv_grandes_cultures_29_cle0423a6,20181011_bsvmaraichage_27_cle09c363,20181106_bsv_grandes_cultures_34_cle08159b,20190227_BSV_grandes_cultures_Auvergne_N02_cle0189db,20190228_BSV_grandes_cultures_Rhone-Alpes_N_02_cle46d5a8,20190320_LOR_BSV_Grandes_Cultures_cle83816d,20190417_ALS_BSV_Grandes_Cultures_cle8c93cf,20190417_CHA_BSV_Grandes_Cultures_cle8333d7,...,bsv_viti_lr_n17_30072019_cle8cc4c8,bsv_viti_mp_aveyron_n16_30072019_cle0dd53b,bsv_viti_mp_cahors_n17_30072019_cle434e53,bsv_viti_mp_fronton_tarnetgaronne_n13_25062019_cle48ca82,bsv_viti_mp_fronton_tarnetgaronne_n18_30072019_cle4d7faa,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,char_gdes_Cultures_no11_du_30-04-19_cle096f9c,char_gdes_Cultures_no14_du_22-05-19_cle0b1586,char_gdes_Cultures_no26_du_14-08-19_cle0fb929,ge_houblon_no6_du_19-06-19_cle41558d
prefLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chou cabus blanc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
asperge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
betterave potagère,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
betterave sucrière,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blé dur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blé tendre,0.0,0.0,0.0,0.0,2.565197,0.0,0.0,3.103215,3.584617,3.514032,...,0.0,0.0,0.0,0.0,0.0,0.0,3.47196,3.168484,0.0,0.0
blé tendre d'hiver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.833309,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.07009,3.494049,0.0,0.0
chou cabus rouge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chou chinois,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chou de Bruxelles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
# PCA is imported with the other dependencies in the first cell.
# Fit a PCA that keeps every component, then plot how the explained
# variance accumulates with the number of components.
pca = PCA()
pca.fit(bm25)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [150]:
# reduce every row of bm25 (one label) to 20 principal components
pca = PCA(n_components=20)
components = pca.fit_transform(bm25)

# plot the first two components, one colour per label
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=bm25.index,
)
fig.show()

In [151]:
# ToMap annotations and the frequency of each label they contain
d2kab_fcu_tomap, d2kab_fcu_tomap_freq = read_fcu(
    filename="../output/d2kab/fcu-tomap.csv",
    corpus_name="../resources/Corpus-v-12-04-21/train/",
)


Total : 230 bsv dans le corpus



PrefLabel :

count               1260
unique                54
top       grande culture
freq                 459
Name: prefLabel, dtype: object


BSV :

count                                 1260
unique                                 137
top       bsv_gc_mp_n27_16052019_cle046a11
freq                                    78
Name: bsv, dtype: object

Top 10 labels :
                        prefLabel  count
0                  grande culture    459
1               culture légumière    137
2                  pomme de terre     95
3                      blé tendre     92
4                         blé dur     58
5                céréale à paille     56
6  pois protéagineux de printemps     42
7                          tomate     32
8               orge de printemps     27
9                    orge d'hiver     27


In [152]:
# scatter plot and treemap of the ToMap label frequencies
show_frequencies(df_freq=d2kab_fcu_tomap_freq)

True

## Baseline

In [153]:
# baseline annotations and the frequency of each label they contain
d2kab_fcu_baseline, d2kab_fcu_baseline_freq = read_fcu(
    filename="../output/d2kab/fcu-baseline.csv",
    corpus_name="../resources/Corpus-v-12-04-21/train/",
)


Total : 230 bsv dans le corpus



PrefLabel :

count      7861
unique      162
top       vigne
freq       1051
Name: prefLabel, dtype: object


BSV :

count                                                  7861
unique                                                  230
top       BSV_legumes_allium_pomme_de_terre_AURA_2019-14...
freq                                                    191
Name: bsv, dtype: object

Top 10 labels :
           prefLabel  count
0              vigne   1051
1              colza    654
2               chou    369
3               maïs    360
4             tomate    346
5    arbuste à baies    286
6            carotte    254
7  culture légumière    237
8         maraîchage    219
9            poireau    204


In [154]:
# scatter plot and treemap of the baseline label frequencies
show_frequencies(df_freq=d2kab_fcu_baseline_freq)

True

## Differences

In [155]:
# labels found by ToMap that the baseline never produced
show_differences(
    df1=d2kab_fcu_tomap_freq,
    df2=d2kab_fcu_baseline_freq,
)

True     49
False     5
Name: prefLabel, dtype: int64
             prefLabel  count
20         topinambour      8
22  betterave sucrière      7
25         petits pois      5
29   culture fruitière      4
48    moutarde blanche      1


True

In [156]:
# put the two frequency tables side by side, indexed by label;
# a label present in only one of them is kept
result = pd.concat(
    [
        d2kab_fcu_baseline_freq.set_index('prefLabel').rename(columns={'count': 'count_baseline'}),
        d2kab_fcu_tomap_freq.set_index('prefLabel').rename(columns={'count': 'count_tomap'}),
    ],
    axis=1,
    sort=True,
)

# a label missing from one table has no count there: set it to 0
result = result.fillna(0).astype(int)

# show
result.head(3)

Unnamed: 0,count_baseline,count_tomap
Chou cabus blanc,5,3
abricotier,5,0
abricotier pays,1,0


In [157]:
# compare, label by label, the ToMap count with the baseline count
scatter_matrix(
    df=result,
    list_of_dimensions=["count_tomap", "count_baseline"],
)

True