# Annotation of Plant Health Bulletins

In [25]:
# packages
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px

## Corpus

In [52]:
# Count every file found anywhere under the training corpus directory.
total = sum(
    len(filenames)
    for _dirpath, _dirnames, filenames in os.walk("../resources/corpus/Corpus-v-12-04-21/train/")
)
print("\nTotal : {} bulletins found".format(total))


Total : 230 bulletins found


In [53]:
# Directory listing of each crop-type sub-corpus (field crops, viticulture,
# market gardening).
grandes_cultures, viticulture, maraichage = [
    os.listdir(corpus_dir)
    for corpus_dir in (
        "../resources/corpus/Corpus-v-12-04-21/train/CorpusTestD2kabGC.html",
        "../resources/corpus/Corpus-v-12-04-21/train/CorpusTestD2kabViti.html",
        "../resources/corpus/Corpus-v-12-04-21/train/CorpusTestD2kabMaraichage.html",
    )
]

# Report how many entries each listing holds.
for label, entries in (
    ("Grandes cultures", grandes_cultures),
    ("Viticulture", viticulture),
    ("Maraichage", maraichage),
):
    print("{} : {}".format(label, len(entries)))


Grandes cultures : 94
Viticulture : 77
Maraichage : 59


## Workflow
### French Crop Usage

## Workflow
### Phenological stages

In [83]:
# Load the tab-separated annotation export and tag every row with the name of
# the corpus it comes from.
# NOTE(review): the file's first column has no header and is loaded as a data
# column named "Unnamed: 0" (see the displayed frame) — consider index_col=0.
stages = (
    pd.read_csv(
        "../output/frenchCropUsage_20210525/d2kab/stages.csv",
        sep="\t",
        encoding="utf-8",
    )
    .assign(corpus="D2KAB (test)")
)
stages.head(3)  # preview the first 3 rows

Unnamed: 0,bsv,number of words,html tag,context before,form,lemma,prefLabel,context after,location,features,corpus
0,20180911_bsv_grandes_cultures-26_cle0f15a8,2433,html,uvette jaune dès la,levée,levée,stade secondaire BBCH 009,pour suivre les pet,97-102,"type=RDFProjector, uri=http://ontology.inrae.f...",D2KAB (test)
1,20180911_bsv_grandes_cultures-26_cle0f15a8,2433,html,% 8% 8% 8% 0% Semis,Germination,germination,stade principal BBCH 0,levée cotylédons ét,942-953,"type=RDFProjector, uri=http://ontology.inrae.f...",D2KAB (test)
2,20180911_bsv_grandes_cultures-26_cle0f15a8,2433,html,r le semis pour une,levée,levée,stade secondaire BBCH 009,rapide et un bon dé,3024-3029,"type=RDFProjector, uri=http://ontology.inrae.f...",D2KAB (test)


In [47]:
# Summary statistics (count / unique / top / freq) of the prefLabel column.
pref_labels = stages["prefLabel"]
pref_labels.describe()

count                            491
unique                            42
top       stade secondaire BBCH 009 
freq                             287
Name: prefLabel, dtype: object

In [74]:
# Ten most frequent prefLabels, most frequent first.
stages["prefLabel"].value_counts().head(10)

stade secondaire BBCH 009     287
stade principal BBCH 0         72
stade secondaire BBCH 89       16
stade secondaire BBCH 30       10
stade secondaire BBCH 14        8
stade secondaire BBCH 57        6
stade secondaire BBCH 00        6
stade secondaire BBCH 75        5
stade secondaire BBCH 61        5
stade secondaire BBCH 16        5
Name: prefLabel, dtype: int64

In [49]:
# Summary statistics of the bulletin names held in the `bsv` column.
bulletin_names = stages["bsv"]
bulletin_names.describe()

count                                          491
unique                                         119
top       BSV_GC_NA_Limousin_13_20190521_cle0cb17e
freq                                            23
Name: bsv, dtype: object

In [84]:
# add types of cultures to df

# Map every bulletin name (file name with ".html" removed) to its culture code.
# The listings are visited in the order GC, Viti, Mar, so a name present in
# several listings ends up with the last code, exactly as successive
# overwrites would leave it.
culture_by_bsv = {
    filename.replace(".html", ""): code
    for code, filenames in (
        ("GC", grandes_cultures),
        ("Viti", viticulture),
        ("Mar", maraichage),
    )
    for filename in filenames
}

# Label all rows in a single vectorized lookup keyed on the `bsv` value itself,
# so the result does not depend on the frame having a default 0..n-1 index.
# Bulletins found in no listing get NaN, and the column is always (re)created,
# even when nothing matches, so later cells can rely on it existing.
stages['culture'] = stages['bsv'].map(culture_by_bsv)

In [88]:
# Count how many times each prefLabel occurs per (corpus, culture, bulletin),
# then order the resulting rows by prefLabel.
# NOTE(review): groupby leaves out rows whose key is NaN by default, so rows
# without a culture label would not be counted — confirm this is intended.
group_keys = ['corpus', 'culture', 'bsv', 'prefLabel']
grouped_stages = (
    stages[group_keys]
    .groupby(group_keys)['prefLabel']
    .count()
    .reset_index(name='count')
    .sort_values('prefLabel')
)

grouped_stages.head(3)  # preview the first 3 rows of the new frame

Unnamed: 0,corpus,culture,bsv,prefLabel,count
0,D2KAB (test),GC,20180911_bsv_grandes_cultures-26_cle0f15a8,stade principal BBCH 0,2
154,D2KAB (test),Mar,BSV_Legumes_n_08_du_17_07_19_cle838d16,stade principal BBCH 0,1
149,D2KAB (test),Mar,2019_BSV_Normandie_leg_22_sem35_cle43a9ca,stade principal BBCH 0,1


In [98]:
# Per-culture summary (count / unique / top / freq) of bulletins and prefLabels.
per_culture = grouped_stages[["bsv", "prefLabel", "culture"]].groupby("culture")
per_culture.describe()

Unnamed: 0_level_0,bsv,bsv,bsv,bsv,prefLabel,prefLabel,prefLabel,prefLabel
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
culture,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
GC,144,69,BSV_GC_NA_Limousin_13_20190521_cle0cb17e,15,144,34,stade secondaire BBCH 009,59
Mar,37,23,BSV_NA_LEGUMES_PC__INDUSTRIE_08_20190704_cle03...,7,37,11,stade principal BBCH 0,12
Viti,37,27,bsv_viti_lr_n03_16042019_cle82d51a,6,37,16,stade principal BBCH 0,10


In [96]:
# visualize
# Sunburst with rings corpus -> culture -> prefLabel, each wedge sized by
# the `count` column; drawn on a fixed 1000x1000 canvas without a legend.
fig = px.sunburst(
    grouped_stages,
    path=['corpus', 'culture', 'prefLabel'],
    values='count',
).update_layout(
    showlegend=False,
    font_size=25,
    width=1000,
    height=1000,
    autosize=False,
)

fig.show()