In [1]:
import glob
import os
import random
from pathlib import Path

import pandas as pd
import spacy
from spacy import displacy
from spacy.training import Example
from spacy.util import minibatch, compounding

import climdist.ner.doccano_transformations as dt

In [2]:
# Load the manually entered records from an Excel workbook. Most of them
# describe storms, but a fair number of other phenomena are included as well.
# The chain below then:
#   1. keeps only the populated columns (the remaining ones are empty),
#   2. drops the records that have no source excerpt (EXC),
#   3. adds a 'len' column holding the length of each excerpt (len(str)).
annotation_columns = ['YEAR', 'MONTH', 'DATE_BEGIN', 'DATE_END', 'CAT_ID',
                      'COMP_ID', 'LOD_ID', 'EXC', 'COMMENT', 'LINK']

df = (
    pd.read_excel('../data/processed/storms_for_annotation.xlsx')
    .loc[:, annotation_columns]
    .dropna(subset=['EXC'])
    .assign(len=lambda frame: frame['EXC'].apply(len))
)
df.head()

Unnamed: 0,YEAR,MONTH,DATE_BEGIN,DATE_END,CAT_ID,COMP_ID,LOD_ID,EXC,COMMENT,LINK,len
0,1881,12.0,30,30.0,"1:1:5:4:, 1:1:4:2:",,lv:ml:riga:,Am Vor- und Nachmittag Regen und Sturm,,http://periodika.lv/periodika2-viewer/view/ind...,38
2,1812,12.0,18,18.0,"1:1:5:4:, 1:1:4:5:",,lv:ml:riga:,Sturm aus N,,http://periodika.lv/periodika2-viewer/view/ind...,11
3,1855,12.0,22,,"1:1:5:4:, 1:1:4:2:",,ee:ne:tallinn:,Bei gelinder Witterung liefen im vorigen Monat...,Tallinna kirjasaatja 2. jaanuaril,http://periodika.lv/periodika2-viewer/view/ind...,395
4,1891,12.0,25,27.0,1:1:5:4:,,lv:wl:riga:,"Die stürmische Witterung, welche am Morgen des...",,http://periodika.lv/periodika2-viewer/view/ind...,149
5,1891,12.0,25,26.0,1:1:5:4:,,lv:wl:vp:pope:,"Aus Popen (Kurland) wird uns geschrieben, daß ...",,http://periodika.lv/periodika2-viewer/view/ind...,219


In [3]:
# Keep only the records whose excerpt is longer than 99 characters. The cut-off
# was chosen to give a bit more context and to filter out descriptions that are
# very terse from an NLP point of view; it is entirely arbitrary at this point.
# The selection is then put into chronological order.
is_long_enough = df['len'].gt(99)
shortdf = df.loc[is_long_enough].sort_values(by=['YEAR', 'MONTH', 'DATE_BEGIN'])

In [4]:
# Display summary statistics for the filtered, chronologically sorted records.
shortdf.describe()

Unnamed: 0,YEAR,MONTH,len
count,567.0,562.0,567.0
mean,1866.77425,7.208185,242.645503
std,27.515852,3.350218,166.816967
min,1603.0,1.0,100.0
25%,1847.0,4.0,142.0
50%,1880.0,8.0,191.0
75%,1885.0,10.0,275.5
max,1899.0,12.0,1525.0


In [None]:
# Write all selected excerpts to a .txt file, each one followed by a blank line.

with open('storms_spacy.txt', mode='w', encoding='utf8') as myfile:
    myfile.writelines(txt + '\n\n' for txt in shortdf['EXC'])

In [None]:
# I decided to experiment with spaCy, but it remains to be checked whether models for historical German exist, e.g. in NLTK.
nlp = spacy.load("de_core_news_md")

In [None]:
# Read the excerpt file back as a single string and run the pipeline over the
# whole text; the trailing expression displays the result.
with open('storms_spacy.txt', encoding='utf8') as myfile:  # text read mode is the default
    data = myfile.read()

doc = nlp(data)
doc

In [None]:
# A quick exploration of the named entities. Surprisingly, spaCy performs quite
# well on the LOC category. It also recognises a number of meteorological
# phenomena (see #100, #103, #108-110, #112, #119, #146 etc.), but labels them
# as ORG, LOC or PER.

ents_dico = {'text': [], 'label': []}
for ent in doc.ents:
    ents_dico['text'].append(ent.text)
    ents_dico['label'].append(ent.label_)

ents_df = pd.DataFrame(ents_dico, columns=['text', 'label'])
ents_df[100:115]

In [32]:
# Load the doccano annotation export and post-process it with the project's
# helper functions from climdist.ner.doccano_transformations.
train_data = dt.doccano_to_spacy('../pipeline/03_ner/01_doccano/storms_annotated_19.03v2.jsonl')
# NOTE(review): doccano_strip is applied twice in a row - confirm whether the
# second pass is required or is a copy-paste leftover.
train_data = dt.doccano_strip(train_data)
train_data = dt.doccano_strip(train_data)
# presumably relabels WEA entities as NAT, judging by the helper's name - TODO confirm
train_data = dt.wea_to_nat(train_data)
#print(train_data)

In [None]:
# Register every entity label found in the training data with the pipeline's
# NER component (the label is the third item of each entity entry), then
# display the labels the component knows about.
ner = nlp.get_pipe("ner")

training_labels = (span[2] for record in train_data for span in record["entities"])
for label in training_labels:
    ner.add_label(label)

ner.labels

In [None]:
# Fine-tune the NER component on the annotated excerpts for 20 epochs.
# Every other pipeline component is disabled while the loop runs.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):  # only train NER
    for itn in range(20):
        print("iteration: "+str(itn))
        losses = {}
        # Shuffle a copy so each epoch sees the examples in a different order
        # without reordering train_data itself.
        shuffled = list(train_data)
        random.shuffle(shuffled)
        batches = minibatch(shuffled, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(record["text"]), record)
                for record in batch
            ]
            # One update per complete batch. Previously this call sat inside
            # the per-example loop, so the first examples of every batch were
            # trained on repeatedly.
            nlp.update(examples, losses=losses)
        # Report the accumulated loss so training progress is visible.
        print("losses:", losses)
print("training is finished")

In [None]:
# Relative path of the directory used for the trained NER pipeline.
output_dir = Path("storms_ner_model")

In [None]:
# Make sure the target directory exists, then serialise the pipeline into it.
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

In [None]:
# Load the pipeline stored in output_dir from disk under a separate name, nlp_test.
print("Loading from", output_dir)
nlp_test = spacy.load(output_dir)
print("Loading finished")

In [None]:
# Entity label -> highlight colour used when rendering with displaCy.
displacy_color_code = dict(
    WEA='#4cafd9',
    PER='#ffb366',
    DAT='#bf80ff',
    LOC='#a88676',
    MISC='grey',
    MEA='#85e085',
    ORG='#5353c6',
)

# Render exactly the labels that have a colour assigned, in the same order.
displacy_options = {
    'ents': list(displacy_color_code),
    'colors': displacy_color_code,
}

In [None]:
def nlp_test_generator(filepath):
    """Parse every file matching a glob pattern with ``nlp_test``.

    Each matching file is read as UTF-8, its newlines are replaced by single
    spaces, and the resulting text is passed to ``nlp_test``. The document
    name is printed as each file is processed.

    Parameters
    ----------
    filepath : str
        Glob pattern selecting the input files, e.g. ``'./dir/*.txt'``.

    Yields
    ------
    tuple
        ``(docname, parsed)`` where ``docname`` is the file's base name
        without its extension and ``parsed`` is the value returned by
        ``nlp_test`` for the file's text.
    """
    # glob returns matches in arbitrary order; sort so the output is reproducible.
    for file in sorted(glob.glob(filepath)):
        # The notebook imports ``os`` but not a bare ``path`` name, so the
        # previous ``path.splitext(...)`` raised NameError.
        docname = os.path.splitext(os.path.basename(file))[0]
        print(docname)
        with open(file, 'r', encoding='utf8') as f:
            doctext = f.read()
        doctext = doctext.replace('\n', ' ')

        yield docname, nlp_test(doctext)

In [None]:
# Parse every TP test file via nlp_test_generator and save the entity
# rendering of all parsed documents as a single stand-alone HTML page.
tp_tests_gen = nlp_test_generator('./test_data/TP_tests/*.txt')
tp_tests = [parsed for _name, parsed in tp_tests_gen]

html = displacy.render(
    tp_tests,
    style='ent',
    page=True,
    jupyter=False,
    options=displacy_options,
)
with open('./test_data/TP_tests/tp_tests.html', mode='w', encoding='utf8') as f:
    f.write(html)
    print('finished')

In [None]:
# Parse every FP test file via nlp_test_generator and save the entity
# rendering of all parsed documents as a single stand-alone HTML page.
fp_tests_gen = nlp_test_generator('./test_data/FP_tests/*.txt')
fp_tests = [parsed for _name, parsed in fp_tests_gen]

html = displacy.render(
    fp_tests,
    style='ent',
    page=True,
    jupyter=False,
    options=displacy_options,
)
with open('./test_data/FP_tests/fp_tests.html', mode='w', encoding='utf8') as f:
    f.write(html)
    print('finished')

In [None]:
# Parse one FP test file with nlp_test, after replacing its line breaks with spaces.
with open('./test_data/FP_tests/LZ_Nr002_1862.txt', encoding='utf8') as f:
    data = f.read().replace('\n', ' ')

doc_lz = nlp_test(data)

In [None]:
# Show the entities of doc_lz inline in the notebook, using the colour options defined for displaCy.
displacy.render(doc_lz, style='ent', jupyter = True, options = displacy_options)

In [None]:
# Parse a single sample passage with nlp_test (the long-s spelling is part of the input text).
doc2 = nlp_test('Hamburg, 3. Jan. Als Vorläuferin der Unglücksbot ſchaften, welche wir von der See her nach dem letzten Sturm zu erwarten haben, iſt die heute eingetroffene Nachricht, daß das Packetſchiff „George Canning“, Hrn. R. M. Slomann')

In [None]:
# Show the entities of doc2 inline in the notebook, using the colour options defined for displaCy.
displacy.render(doc2, style='ent', jupyter = True, options = displacy_options)

In [None]:
# Display the component names currently in the nlp pipeline.
nlp.pipe_names

In [None]:
# Attach a rule-based entity ruler to the pipeline; overwrite_ents is enabled
# in its config. Adding a component whose name already exists raises an error,
# so reuse the existing ruler when this cell is run again on the same kernel.
config = {"overwrite_ents": True }
if 'entity_ruler' in nlp.pipe_names:
    ruler = nlp.get_pipe('entity_ruler')
else:
    ruler = nlp.add_pipe('entity_ruler', config=config)

In [None]:
# Display the component names in the nlp pipeline again.
nlp.pipe_names

In [None]:
# Give the entity ruler one NAT pattern per listed phenomenon.
natural_phenomena = ["Sturm", "Hagel"]
ruler.add_patterns([{"label": "NAT", "pattern": n} for n in natural_phenomena])

In [None]:
# Run the nlp pipeline on a short sample sentence containing both ruler patterns.
doc3 = nlp('Heute haben wir einen starken Sturm un vielen Hagel gehabt.')

In [None]:
# Print the text and label of every entity found in doc3.
for ent in doc3.ents:
    print(ent.text, ent.label_)