In [28]:
import os
import pandas as pd
import numpy as np
import re, operator
from operator import itemgetter
from itertools import islice

# stats

from scipy import stats
from scipy import special
from scipy.spatial.distance import cosine
from scipy.stats import zipf

# dataviz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


## FCU

In [2]:
# Ambiguous words excluded from the annotations: read_fcu() drops every row
# whose 'word' value appears in this list.
stop_list = ["soleil",
             "jachères",
             "gel",
             "marron",
             "orange",
             "fruit",
             "fruits",
             "semence",
             "côte",
             "horticulture",
             "pépinière"]

In [204]:
def read_fcu(filename, corpus_name, stop_list = stop_list):
    
    '''
    Load a tab-separated FCU annotation file and summarise its labels.

    INPUT :
        filename    -- path to the tab-separated, UTF-8 annotation file; it
                       must provide the 'word', 'prefLabel' and 'bsv' columns
                       ('length' is described too when it is present)
        corpus_name -- root directory of the corpus; every file found below
                       it is counted as one bsv
        stop_list   -- ambiguous words; rows whose 'word' is one of them are
                       dropped
    OUTPUT :
        df      -- the annotation dataframe without the stop-listed rows
        df_freq -- dataframe with the columns 'prefLabel' and 'count', ordered
                   by decreasing count
    '''
    
    # description of the corpus: every file below the root counts as one bsv
    print("Counting bsv..")
    total = sum(len(files) for _, _, files in os.walk(corpus_name))
    print("\nTotal : {} bsv dans le corpus".format(total))

    
    # create df
    print("\nCreating a dataframe..")
    df = pd.read_csv(filename, delimiter="\t", encoding='utf-8')

    # filter ambiguous words
    print("\nDeleting stopwords..")
    df = df[~df['word'].isin(stop_list)]
    
    # frequency table of the extracted labels; naming the index and the values
    # explicitly keeps the column names independent of the pandas version
    df_freq = (df['prefLabel']
               .value_counts()
               .rename_axis('prefLabel')
               .reset_index(name='count'))
    print("\nPrinting top 10 labels :\n{}".format(df_freq[:10]))
    
    # short descriptions
    print("\nPrefLabel :\n")
    print(df['prefLabel'].describe())
    print("\n\nBSV :\n")
    print(df['bsv'].describe())
    # NOTE(review): the VESPA run in this notebook prints no "Length" section,
    # so the column is treated as optional - confirm against the input files.
    if 'length' in df.columns:
        print("\n\nLength :\n")
        print(df['length'].describe())

    return df, df_freq

In [205]:
def show_frequencies(df_freq):
    
    '''
    Display the label frequencies as a scatter plot and as a treemap.

    INPUT :
        df_freq -- dataframe with a 'prefLabel' column and a 'count' column
                   (the second value returned by read_fcu)
    OUTPUT :
        True once both figures have been shown
    '''
    
    # show scatter graph
    fig = px.scatter(df_freq, x='prefLabel', y='count')
    fig.show()
    
    # show tree graph: tiles are named after the label itself; the dataframe
    # index only holds row numbers since read_fcu resets it
    fig = px.treemap(df_freq, path=['prefLabel'], values='count')
    fig.show()
    
    return True

In [206]:
def show_differences(df1, df2):
    
    '''
    Report which labels of df1 also occur in df2.

    INPUT :
        df1, df2 -- dataframes providing a 'prefLabel' column
    OUTPUT :
        True; prints the True/False counts of the membership test, then the
        rows of df1 whose 'prefLabel' is absent from df2
    '''
    
    # True where the label of df1 is also a label of df2
    shared = df1['prefLabel'].isin(df2['prefLabel'])
    only_in_df1 = df1[~shared]

    print(shared.value_counts())
    print(only_in_df1)
    
    return True

### D2KAB

In [207]:
# Load the D2KAB annotations; the second argument is the corpus folder whose files are counted as bsv
d2kab_fcu, d2kab_freq = read_fcu("../output/d2kab/fcu/fcu_d2kab.csv", "../resources/Corpus-v-12-04-21/train/")

Counting bsv..

Total : 230 bsv dans le corpus

Creating a dataframe..

Deleting stopwords..

Printing top 10 labels :
                 prefLabel  count
0                    vigne   1278
1                    colza    984
2                     chou    595
3             chou potager    595
4                      blé    591
5                  pommier    553
6                     maïs    540
7                   tomate    467
8                  carotte    409
9  arboriculture fruitière    401

PrefLabel :

count     16171
unique      196
top       vigne
freq       1278
Name: prefLabel, dtype: object


BSV :

count                          16171
unique                           378
top       BSV_legumes_15_cle8b2b14-1
freq                             308
Name: bsv, dtype: object


Length :

count    16171.000000
mean     25823.952384
std      17116.169705
min        913.000000
25%      13968.000000
50%      20822.000000
75%      33318.000000
max      78254.000000
Name: length, dtype: float64

In [176]:
# show graphs: scatter plot and treemap of the D2KAB label frequencies
show_frequencies(d2kab_freq)

True

### VESPA

In [177]:
# Load the VESPA annotations; the second argument is the corpus folder whose files are counted as bsv
vespa_fcu, vespa_freq = read_fcu("../output/vespa/fcu/fcu_vespa.csv", "../resources/corpusVespa.html")

Counting bsv..

Total : 497 bsv dans le corpus

Creating a dataframe..

Deleting stopwords..

Printing top 10 labels :
                 prefLabel  count
0                  pommier    742
1                    colza    696
2  arboriculture fruitière    561
3                     pois    494
4                      blé    394
5                     chou    380
6             chou potager    380
7                  céréale    354
8             floriculture    347
9                  poirier    339

PrefLabel :

count       13259
unique        202
top       pommier
freq          742
Name: prefLabel, dtype: object


BSV :

count                  13259
unique                   444
top       BSV_legumes_11_002
freq                     443
Name: bsv, dtype: object


In [178]:
# show graphs: scatter plot and treemap of the VESPA label frequencies
show_frequencies(vespa_freq)

True

### D2KAB versus VESPA

In [179]:
# labels extracted from d2kab that are not among the labels extracted from vespa
show_differences(d2kab_freq, vespa_freq)

True     175
False     21
Name: prefLabel, dtype: int64
                             prefLabel  count
92                      grande culture     23
102                        pois chiche     20
121                             cardon     10
130                              ricin      9
146  arboriculture fruitière tropicale      6
151                            potiron      5
153                         potimarron      5
155                           pastèque      5
159                            basilic      4
160                            oseille      4
162                            houblon      4
169                      légume racine      3
177                            ciboule      3
178                   sorgho fourrager      2
179                   courge butternut      2
181                              gesse      2
185                         miscanthus      1
187                    abricotier pays      1
189                    vesce fourrager      1
191                 prai

True

In [180]:
# labels extracted from vespa that are not among the labels extracted from d2kab
show_differences(vespa_freq, d2kab_freq)

True     175
False     27
Name: prefLabel, dtype: int64
                    prefLabel  count
105  pomme de terre féculière     12
118                  photinie     10
124             canne à sucre      8
133                 limettier      7
138            plante à fibre      6
139   arbre à fruits à coques      6
141              lin à fibres      6
142                    pomelo      6
155                    tangor      4
160               mandarinier      4
163               chou commun      3
166                   gerbera      3
168         cerfeuil tubéreux      3
169                 calebasse      3
171                  rhubarbe      2
172                   lavande      2
177                Chou navet      2
183              christophine      2
184        oranger du Mexique      1
185    lin fibre de printemps      1
186                 chou vert      1
188                     brède      1
189            laitue romaine      1
191                  fenugrec      1
197                

True

In [185]:
# join the frequency tables on prefLabel (default inner join: only labels found
# in both corpora are kept); count_x comes from d2kab, count_y from vespa
result = pd.merge(d2kab_freq, vespa_freq, on="prefLabel")
result

Unnamed: 0,prefLabel,count_x,count_y
0,vigne,1278,314
1,colza,984,696
2,chou,595,380
3,chou potager,595,380
4,blé,591,394
...,...,...,...
170,clémentinier,1,12
171,cognassier,1,2
172,millet,1,5
173,échalion,1,1


In [210]:
# Bar chart of both frequency columns per label (count_x = d2kab, count_y = vespa)
fig = px.bar(result, x="prefLabel", y=["count_x", "count_y"], title="Term Frequency")
fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1000,
    autosize=False)

fig.show()

In [202]:
# Scatter plot of both frequency columns per label (count_x = d2kab, count_y = vespa)
fig = px.scatter(result, x="prefLabel", y=["count_x", "count_y"], title='Term frequency')
fig.show()

In [239]:
# NOTE(review): make_subplots is imported here but not used in this cell.
from plotly.subplots import make_subplots

# Scatter matrix of the two frequency columns, one colour per label
fig = px.scatter_matrix(result,
    dimensions=["count_x", "count_y"],
    color="prefLabel",
    title="Scatter matrix of term Frequency",
    labels={col:col.replace('_', ' ') for col in result.columns}) # displayed names: underscores replaced by spaces
fig.update_traces(diagonal_visible=True)
fig.show()




In [199]:
# Line chart of both frequency columns per label (count_x = d2kab, count_y = vespa)
fig = px.line(result, x="prefLabel", y=["count_x", "count_y"], title='Term frequency')
fig.show()

## Score BM25

### D2KAB

In [None]:
# File names of each thematic sub-corpus. The paths end in ".html" but are
# passed to os.listdir, so they have to be directories.
gc = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabGC.html")
maraichage = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabMaraichage.html")
viticulture = os.listdir("../resources/Corpus-v-12-04-21/train/CorpusTestD2kabViti.html")
# total number of entries over the three sub-corpora
corpus_len = len(viticulture) + len(maraichage) + len(gc)

In [None]:
# BM25 scores of the D2KAB corpus, one row per (bsv, label) pair
bm25_d2kab = pd.read_csv("../output/d2kab/scores/bm25_d2kab.csv", delimiter="\t", encoding='utf-8')
bm25_d2kab.columns = ['bsv', 'prefLabel', 'score']
bm25_bsv = bm25_d2kab['bsv'].tolist()

# File name -> theme lookup (constant-time per row instead of scanning three
# lists). The sub-corpora are listed in reverse priority so that, should a file
# sit in several folders, GC wins over Maraichage, which wins over Viticulture.
theme_by_file = {
    file_name: theme
    for theme, file_names in (("Viticulture", viticulture),
                              ("Maraichage", maraichage),
                              ("GC", gc))
    for file_name in file_names
}

# Exactly one theme per row: a bsv found in no sub-corpus is tagged "Unknown"
# instead of being skipped, which would make the list shorter than the
# dataframe and break the column assignment below.
bm25_theme = [theme_by_file.get(fic + ".html", "Unknown") for fic in bm25_bsv]
        
bm25_d2kab['theme'] = bm25_theme
bm25_d2kab

In [None]:
# Summary of the label column of the D2KAB scores
bm25_d2kab['prefLabel'].describe()

In [None]:
# Summary statistics of the D2KAB score table
bm25_d2kab.describe()

In [None]:
# Treemap nested by theme, then label, then score (no `values` column is given)
fig = px.treemap(bm25_d2kab, path=['theme','prefLabel', 'score'])
fig.show()

In [None]:
# Same treemap without the theme level: label, then score
fig = px.treemap(bm25_d2kab, path=['prefLabel', 'score'])
fig.show()

In [None]:
# Preview the first ten score rows of each theme
grouped = bm25_d2kab.groupby("theme")
for name,group in grouped:
    print(f"First 10 entries for {name!r}")
    print("------------------------")
    print(group.head(10), end="\n\n")

### VESPA

In [None]:
# BM25 scores of the VESPA corpus; the columns are renamed to match the D2KAB table
bm25vespa = pd.read_csv("../output/vespa/scores/bm25_vespa.csv", delimiter="\t", encoding='utf-8')
bm25vespa.columns = ['bsv', 'prefLabel', 'score']
bm25vespa

In [None]:
# Summary of the label column of the VESPA scores
bm25vespa['prefLabel'].describe()

In [None]:
# Summary statistics of the VESPA score table
bm25vespa.describe()

In [None]:
# Treemap of the VESPA scores nested by label, then score
fig = px.treemap(bm25vespa, path=['prefLabel', 'score'])
fig.show()

### Scores D2KAB vs VESPA

In [None]:
# Mean BM25 score per label in each corpus (one-column dataframes, column 'mean').
# The aggregation is named by string: the pd.np alias no longer exists in
# current pandas releases.
bm25_d2kab_mean = bm25_d2kab.groupby('prefLabel')['score'].agg(['mean'])
bm25_vespa_mean = bm25vespa.groupby('prefLabel')['score'].agg(['mean'])

In [None]:
# Labels scored in both corpora, with their mean BM25 score in each one.
# The d2kab table is the left operand so that the first column really is the
# d2kab mean and the second the vespa mean, matching the names assigned below.
result_mean = pd.merge(bm25_d2kab_mean, bm25_vespa_mean, on="prefLabel")
result_mean.columns = [ 'd2kab', 'vespa']
result_mean = result_mean.sort_values(['d2kab', 'vespa'], ascending=[False, False])
result_mean

In [None]:
# Stacked bars of the mean BM25 scores per label, labels ordered by decreasing total
fig = px.bar(result_mean, x=result_mean.index, y=result_mean.columns, title='BM25')

fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1000,
    autosize=False, barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
# Line chart of the mean BM25 scores per label, one line per corpus
fig = px.line(result_mean, x=result_mean.index, y=result_mean.columns, title='BM25')
fig.show()

In [None]:
# Grouped bars of the mean BM25 score per label: one trace per corpus
fig = go.Figure()
for column, trace_name, colour in (('d2kab', 'D2KAB', 'indianred'),
                                   ('vespa', 'Vespa', 'lightsalmon')):
    fig.add_trace(go.Bar(
        x=result_mean.index,
        y=result_mean[column],
        name=trace_name,
        marker_color=colour
    ))

# The x-axis tick labels are rotated by -45 degrees.
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1500, height=1000,
    autosize=False)
fig.show()

In [None]:
# Funnel chart of the mean BM25 scores, one row per label
fig = px.funnel(result_mean, x=result_mean.columns, y=result_mean.index)
fig.update_layout(
    margin=dict(t=200, r=200, b=200, l=200),
    showlegend=False,
    width=1000, height=1500,
    autosize=False)
fig.show()

In [None]:
# corpus similarity: scipy's cosine() is a distance, so 1 - distance gives the
# cosine similarity between the two mean-score vectors
print(1 - cosine(result_mean["d2kab"], result_mean["vespa"]))

In [None]:
# Zipf's law:
# the most frequent word occurs about twice as often as the second most frequent word,
# three times as often as the third one, and so on down to the least frequent word

# NOTE(review): counts_d2kab is not defined in the visible part of this
# notebook; presumably a {label: count} mapping for the D2KAB corpus - confirm.
# (count, label) pairs ranked by decreasing count. The pairs are built directly
# from the mapping: inverting it to {count: label} would silently drop every
# label that shares its count with another one.
lst = [(count, label) for label, count in sorted(counts_d2kab.items(), key=itemgetter(1), reverse=True)]

# figure size and font size only apply to figures created after they are set
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})

plt.bar([key for val, key in lst], [val for val, key in lst], color='limegreen')
alpha = 1.37065874
total = sum([p for p, c in lst])
# expected frequency of each rank under a Zipf distribution with exponent alpha
plt.plot(range(len(lst)), [zipf.pmf(p, alpha) * total for p in range(1, len(lst) + 1)], color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
# plot has more area below the expected zipf curve in higher rank words, and inverse for those that are more important
plt.show()

In [None]:
# At the extreme right end of the plotted line there are visible steps,
# which means that many words have a very low number of occurrences.
# At the extreme left end the line is quite flat,
# which means the frequencies of the few most frequent words are close to each other: they do not drop as quickly as those ranked in the middle.
# NOTE(review): zipfFit is defined in a later cell; that cell has to be run first.
    
zipfFit(counts_d2kab, 'd2kab', toPlot=True, pr=True, ret=False)

In [None]:
def zipfFit(fdist, name, toPlot=False, pr=False, ret=True):
    '''
    Fit a straight line to log(count) against log(rank) for a frequency table.

    INPUT :
        fdist  -- mapping {term: count}
        name   -- used as plot title and in the printed message
        toPlot -- if True, show the log-log scatter plot
        pr     -- if True, print the r-squared of the fit
        ret    -- if True, return the fit
    OUTPUT :
        [slope, intercept, r_squared] when ret is True, otherwise None
    RAISES :
        ValueError when fewer than two terms have a positive count
    '''
    # counts by decreasing value; rank 1 is the most frequent term
    y_vals = np.array(sorted(fdist.values(), reverse=True), dtype=float)
    # log(0) is -inf and would turn the whole regression into NaN, so terms
    # without any occurrence are left out (they sit at the last ranks anyway)
    y_vals = y_vals[y_vals > 0]
    if y_vals.size < 2:
        raise ValueError("zipfFit needs at least two terms with a positive count")
    log_rank = np.log(np.arange(1, y_vals.size + 1, dtype=float))
    log_count = np.log(y_vals)
    if toPlot:
        # set before the first pyplot call so that the size applies to this figure
        plt.rcParams['figure.figsize'] = (50.0, 50.0)
        plt.title(name)
        plt.plot(log_rank, log_count, 'ro')
        plt.xlabel('log(rank)')
        plt.ylabel('log(count)')
        plt.show()
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        log_rank, log_count)
    if pr:
        print("log-log r-squared for " + name + ":", r_value**2)
    if ret:
        return([slope, intercept, r_value**2])

In [None]:
# NOTE(review): counts_vespa is not defined in the visible part of this
# notebook; presumably a {label: count} mapping for the VESPA corpus - confirm.
# (count, label) pairs ranked by decreasing count. The pairs are built directly
# from the mapping: inverting it to {count: label} would silently drop every
# label that shares its count with another one.
lst = [(count, label) for label, count in sorted(counts_vespa.items(), key=itemgetter(1), reverse=True)]

# figure size and font size only apply to figures created after they are set
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})

plt.bar([key for val, key in lst], [val for val, key in lst], color='limegreen')
alpha = 1.37065874
total = sum([p for p, c in lst])
# expected frequency of each rank under a Zipf distribution with exponent alpha
plt.plot(range(len(lst)), [zipf.pmf(p, alpha) * total for p in range(1, len(lst) + 1)], color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.show()
# At the extreme right end of the plotted line there are visible steps,
# which means that many words have a very low number of occurrences.
# At the extreme left end the line is quite flat,
# which means the frequencies of the few most frequent words are close to each other: they do not drop as quickly as those ranked in the middle.

    
zipfFit(counts_vespa, 'vespa', toPlot=True, pr=True, ret=False)

In [None]:
alpha = 1.37065874

# figure size and font size only apply to figures created after they are set
plt.rcParams["figure.figsize"] = (100,50)
plt.rcParams.update({'font.size': 72})

# `result` is the merged frequency table: count_x holds the d2kab counts and
# count_y the vespa counts. The (count, label) pairs are taken row by row;
# going through a {count: label} dict would drop labels sharing the same count.
lst1 = list(zip(result["count_x"], result["prefLabel"]))
plt.bar([key for val, key in lst1], [val for val, key in lst1], color='limegreen')
total1 = sum([p for p, c in lst1])
plt.plot(range(len(lst1)), [zipf.pmf(p, alpha) * total1 for p in range(1, len(lst1) + 1)], color='cyan', lw=3)



# NOTE(review): the rows follow the order of `result`, which is not the rank
# order of the vespa counts, so the vespa bars are not sorted along the red
# Zipf curve - confirm that this overlay is what is wanted.
lst2 = list(zip(result["count_y"], result["prefLabel"]))
plt.bar([key for val, key in lst2], [val for val, key in lst2], color='red')
total2 = sum([p for p, c in lst2])
plt.plot(range(len(lst2)), [zipf.pmf(p, alpha) * total2 for p in range(1, len(lst2) + 1)], color='crimson', lw=3)

plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.show()