In [19]:
import pandas as pd
import spacy
import os
import doccano_transformations as dt
from pathlib import Path
from spacy.util import minibatch, compounding 
from spacy.training import Example
from spacy import displacy
import glob
from os import path

In [2]:
# Load the hand-entered records from an Excel file.
# Most of them are storms, but a fair number of other phenomena are included.
# Only the listed columns are kept (the remaining ones are empty), and rows
# without a source excerpt (EXC) are dropped.
KEPT_COLUMNS = ['YEAR', 'MONTH', 'DATE_BEGIN', 'DATE_END', 'CAT_ID', 'COMP_ID',
                'LOD_ID', 'EXC', 'COMMENT', 'LINK']

df = (
    pd.read_excel(r'storms_for_spacy.xlsx')
    .loc[:, KEPT_COLUMNS]
    .dropna(subset=['EXC'])
    .copy()  # own the data so the column assignment below is not a write to a slice
)

# Add a column holding the character length of each excerpt.
df['len'] = df['EXC'].str.len()
df.head()

Unnamed: 0,YEAR,MONTH,DATE_BEGIN,DATE_END,CAT_ID,COMP_ID,LOD_ID,EXC,COMMENT,LINK,len
0,1881,12.0,30,30.0,"1:1:5:4:, 1:1:4:2:",,lv:ml:riga:,Am Vor- und Nachmittag Regen und Sturm,,http://periodika.lv/periodika2-viewer/view/ind...,38
2,1812,12.0,18,18.0,"1:1:5:4:, 1:1:4:5:",,lv:ml:riga:,Sturm aus N,,http://periodika.lv/periodika2-viewer/view/ind...,11
3,1855,12.0,22,,"1:1:5:4:, 1:1:4:2:",,ee:ne:tallinn:,Bei gelinder Witterung liefen im vorigen Monat...,Tallinna kirjasaatja 2. jaanuaril,http://periodika.lv/periodika2-viewer/view/ind...,395
4,1891,12.0,25,27.0,1:1:5:4:,,lv:wl:riga:,"Die stürmische Witterung, welche am Morgen des...",,http://periodika.lv/periodika2-viewer/view/ind...,149
5,1891,12.0,25,26.0,1:1:5:4:,,lv:wl:vp:pope:,"Aus Popen (Kurland) wird uns geschrieben, daß ...",,http://periodika.lv/periodika2-viewer/view/ind...,219


In [3]:
# Build a new dataframe holding only the entries whose excerpt exceeds a given length.
# (100 characters were chosen to give a bit more context and to drop descriptions
# that are too terse from an NLP point of view.)
# The limit is entirely arbitrary at this point.
long_enough = df['len'] > 99

# Put the selected rows in chronological order.
shortdf = df.loc[long_enough].sort_values(by=['YEAR', 'MONTH', 'DATE_BEGIN'])

In [5]:
# Write all selected excerpts to a .txt file, each followed by a blank line.
with open('storms_spacy.txt', 'w', encoding = 'utf8') as myfile:
    myfile.writelines(txt + '\n\n' for txt in shortdf['EXC'])

In [20]:
# I decided to experiment with spaCy, but it remains to be checked whether models
# for historical German exist, e.g. in NLTK.
nlp = spacy.load("de_core_news_md")

In [7]:
# Read the excerpt file back and run its whole content through the pipeline
# as a single document.
data = Path('storms_spacy.txt').read_text(encoding='utf8')

doc = nlp(data)
doc

1603 den 4 und 5 octob zu Riga ein großer Sturm gewesen, daß etzliche Strußen bey dem Bolwerck in der Düna zu gründe gegangen, und etzliche schiffe die mästen in der Düna für der Stadt haben hauen müssen.

Bollwerk wird zerstört durch Sturm, 80 Kanonen und Schiff werden zerstört, weitere Schiffe beschädigt

in diesem 1716ten Jahre wehete bis in den Janius starke Sturmwinde, und in den Hundstagen war es so kalt, als es sonst im Oktobermonat zu sein pflegt; auch dan ganzen Sommer durch war bei Stadt ein so hoch aufgeschwollener Storm, daß man die Brücke nicht zu Stande bringen konnte

fing an die neue gegossene Glocke, in dem Peters Thurm, ordentlich die Stunden zu schlagen; was seit dem ... Brand 1721 den 10. Mai, da von einem zündenden Donner-Wetter ... nicht geschehen

den 6. Decbr. fror es so stark, daß man schon am 6ten morgens über den Strom gehen konnte, am 14ten kam Sturm und Regen, der das Eis wieder mürbe machte; den 18ten gieng es aus

1774, den 19. April, nachmittags erhob si

In [9]:
# A quick exploration of the named entities. Surprisingly, spaCy does fairly well on the LOC category.
# It also picks up a number of meteorological phenomena (see #100, #103, #108-110, #112, #119, #146 etc.),
# but labels them as ORG, LOC or PER.
ent_texts = []
ent_labels = []
for span in doc.ents:
    ent_texts.append(span.text)
    ent_labels.append(span.label_)

ents_dico = {'text': ent_texts, 'label': ent_labels}

ents_df = pd.DataFrame(ents_dico, columns=['text', 'label'])
ents_df.iloc[100:115]

Unnamed: 0,text,label
100,uns Gewitter,ORG
101,Wesenberg,LOC
102,Kirche,ORG
103,Blitzstrahl,PER
104,Ockt\n\nReval,MISC
105,Jerwen,LOC
106,Wierland,LOC
107,Carolenschen,LOC
108,Gewitter,LOC
109,Gewitter,LOC


In [19]:
# Load the Doccano JSONL annotations through the project helper module and
# post-process them with doccano_strip. Later cells read each item's "text"
# and "entities" keys.
# NOTE(review): what doccano_strip changes is not visible here (presumably it
# trims whitespace around annotated spans) — confirm in doccano_transformations.
train_data = dt.doccano_to_spacy('storms_annotated_19.03v2.jsonl')
train_data = dt.doccano_strip(train_data)

In [14]:
# Register every label that occurs in the annotations with the NER component.
ner = nlp.get_pipe("ner")

# The third item of each entity entry is passed on as the label.
annotated_labels = (
    span[2]
    for sample in train_data
    for span in sample["entities"]
)
for label in annotated_labels:
    ner.add_label(label)

ner.labels

('DAT', 'LOC', 'MEA', 'MISC', 'ORG', 'PER', 'WEA')

In [14]:
# Fine-tune the NER component on the annotated excerpts for 20 passes.
# Every other pipeline component is disabled while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):  # only train NER
    for itn in range(20):
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(ba["text"]), ba)
                for ba in batch
            ]
            # Update once per complete batch. Calling update inside the
            # example-building loop would retrain on a growing partial batch.
            # Passing `losses` lets the per-component loss accumulate.
            nlp.update(examples, losses=losses)
        print("iteration: "+str(itn), losses)
print("training is finished")

iteration: 0




iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
training is finished


In [7]:
# Directory (relative to the working directory) that the pipeline is written to
# and later reloaded from.
output_dir = Path("storms_ner_model")

In [15]:
# Persist the in-memory pipeline to output_dir.
# mkdir with exist_ok/parents replaces the separate existence check and also
# works if output_dir is ever changed to a nested path.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to storms_ner_model


In [9]:
# Reload the saved pipeline from disk under a separate name (nlp_test), which
# the test cells below use for annotation.
print("Loading from", output_dir)
nlp_test = spacy.load(output_dir)
print("Loading finished")

Loading from storms_ner_model
Loading finished


In [14]:
# Colour assigned to each entity label in displaCy renderings.
displacy_color_code = dict(
    WEA='#4cafd9',
    PER='#ffb366',
    DAT='#bf80ff',
    LOC='#a88676',
    MISC='grey',
    MEA='#85e085',
    ORG='#5353c6',
)

# Render exactly the labels that have a colour, in the order given above.
displacy_options = {'ents': list(displacy_color_code), 'colors': displacy_color_code}

In [None]:
# NOTE(review): unexecuted cell holding a pasted dict literal. Each integer key
# maps the seven entity labels to a list of four integers. What the keys and the
# four counts stand for is not shown here (possibly per-document TP/FP/TN/FN —
# confirm). Consider binding it to a name or moving it to a data file.
{13437: {'DAT': [0, 0, 192, 0],
  'LOC': [15, 4, 170, 7],
  'MEA': [0, 0, 192, 0],
  'MISC': [0, 1, 192, 0],
  'ORG': [0, 1, 192, 0],
  'PER': [0, 0, 192, 0],
  'WEA': [1, 0, 191, 0]},
 10711: {'DAT': [0, 0, 204, 0],
  'LOC': [16, 8, 186, 2],
  'MEA': [0, 0, 204, 0],
  'MISC': [1, 1, 203, 0],
  'ORG': [1, 2, 202, 1],
  'PER': [0, 0, 199, 5],
  'WEA': [1, 0, 203, 0]},
 24081: {'DAT': [7, 0, 225, 0],
  'LOC': [7, 4, 221, 4],
  'MEA': [0, 0, 232, 0],
  'MISC': [0, 0, 232, 0],
  'ORG': [1, 1, 231, 0],
  'PER': [0, 0, 230, 2],
  'WEA': [1, 0, 231, 0]},
 969: {'DAT': [2, 0, 276, 0],
  'LOC': [4, 1, 271, 3],
  'MEA': [0, 0, 278, 0],
  'MISC': [0, 0, 278, 0],
  'ORG': [0, 0, 278, 0],
  'PER': [1, 0, 276, 1],
  'WEA': [1, 1, 277, 0]},
 26801: {'DAT': [1, 0, 291, 1],
  'LOC': [6, 1, 285, 2],
  'MEA': [0, 0, 293, 0],
  'MISC': [4, 0, 287, 2],
  'ORG': [1, 0, 291, 1],
  'PER': [1, 0, 292, 0],
  'WEA': [2, 1, 290, 1]},
 19923: {'DAT': [0, 0, 169, 0],
  'LOC': [1, 3, 168, 0],
  'MEA': [0, 0, 169, 0],
  'MISC': [1, 0, 168, 0],
  'ORG': [0, 2, 169, 0],
  'PER': [7, 0, 158, 4],
  'WEA': [0, 2, 169, 0]},
 24113: {'DAT': [0, 0, 244, 0],
  'LOC': [0, 1, 244, 0],
  'MEA': [0, 0, 244, 0],
  'MISC': [0, 1, 244, 0],
  'ORG': [0, 1, 244, 0],
  'PER': [0, 0, 244, 0],
  'WEA': [3, 1, 241, 0]},
 371: {'DAT': [2, 0, 282, 1],
  'LOC': [17, 1, 261, 7],
  'MEA': [0, 0, 285, 0],
  'MISC': [0, 0, 285, 0],
  'ORG': [0, 0, 285, 0],
  'PER': [15, 0, 264, 6],
  'WEA': [0, 2, 285, 0]},
 11159: {'DAT': [0, 0, 194, 0],
  'LOC': [24, 2, 164, 6],
  'MEA': [0, 0, 194, 0],
  'MISC': [0, 0, 194, 0],
  'ORG': [0, 0, 194, 0],
  'PER': [1, 2, 193, 0],
  'WEA': [1, 0, 193, 0]},
 20285: {'DAT': [4, 7, 259, 0],
  'LOC': [7, 2, 256, 0],
  'MEA': [0, 2, 263, 0],
  'MISC': [0, 0, 263, 0],
  'ORG': [1, 1, 262, 0],
  'PER': [18, 0, 234, 11],
  'WEA': [0, 2, 263, 0]},
 6855: {'DAT': [1, 0, 329, 0],
  'LOC': [0, 16, 330, 0],
  'MEA': [0, 0, 330, 0],
  'MISC': [0, 0, 330, 0],
  'ORG': [0, 0, 330, 0],
  'PER': [4, 2, 314, 12],
  'WEA': [0, 2, 330, 0]},
 4806: {'DAT': [0, 0, 90, 7],
  'LOC': [0, 0, 97, 0],
  'MEA': [0, 1, 97, 0],
  'MISC': [0, 0, 97, 0],
  'ORG': [0, 1, 97, 0],
  'PER': [0, 0, 97, 0],
  'WEA': [6, 1, 87, 4]},
 31670: {'DAT': [0, 0, 46, 0],
  'LOC': [1, 2, 45, 0],
  'MEA': [0, 0, 46, 0],
  'MISC': [0, 0, 46, 0],
  'ORG': [0, 0, 46, 0],
  'PER': [0, 0, 46, 0],
  'WEA': [0, 0, 45, 1]},
 9485: {'DAT': [6, 0, 131, 0],
  'LOC': [10, 2, 127, 0],
  'MEA': [0, 0, 137, 0],
  'MISC': [0, 0, 136, 1],
  'ORG': [0, 0, 137, 0],
  'PER': [0, 0, 137, 0],
  'WEA': [1, 1, 136, 0]},
 31185: {'DAT': [2, 0, 262, 0],
  'LOC': [12, 2, 246, 6],
  'MEA': [0, 0, 263, 1],
  'MISC': [0, 1, 264, 0],
  'ORG': [0, 0, 264, 0],
  'PER': [0, 0, 264, 0],
  'WEA': [11, 1, 253, 0]},
 2644: {'DAT': [2, 0, 223, 1],
  'LOC': [10, 0, 213, 3],
  'MEA': [0, 0, 226, 0],
  'MISC': [0, 0, 226, 0],
  'ORG': [0, 0, 226, 0],
  'PER': [0, 1, 226, 0],
  'WEA': [8, 0, 218, 0]},
 4296: {'DAT': [4, 1, 276, 0],
  'LOC': [5, 5, 275, 0],
  'MEA': [0, 1, 280, 0],
  'MISC': [1, 0, 279, 0],
  'ORG': [0, 1, 280, 0],
  'PER': [1, 1, 272, 7],
  'WEA': [1, 3, 279, 0]},
 30345: {'DAT': [0, 0, 190, 6],
  'LOC': [1, 0, 195, 0],
  'MEA': [1, 0, 165, 30],
  'MISC': [0, 1, 196, 0],
  'ORG': [0, 8, 196, 0],
  'PER': [0, 0, 196, 0],
  'WEA': [4, 0, 192, 0]},
 11798: {'DAT': [0, 0, 203, 0],
  'LOC': [31, 7, 166, 6],
  'MEA': [0, 0, 203, 0],
  'MISC': [0, 0, 203, 0],
  'ORG': [0, 1, 202, 1],
  'PER': [0, 0, 202, 1],
  'WEA': [1, 2, 202, 0]},
 832: {'DAT': [1, 0, 124, 0],
  'LOC': [4, 0, 120, 1],
  'MEA': [0, 0, 125, 0],
  'MISC': [0, 0, 125, 0],
  'ORG': [0, 0, 125, 0],
  'PER': [0, 0, 125, 0],
  'WEA': [0, 0, 125, 0]}}

In [122]:
def nlp_test_generator(filepath):
    """Lazily run the reloaded pipeline over every file matching a glob pattern.

    Each matching file is read as UTF-8, its newlines are replaced by single
    spaces, and the resulting text is passed to ``nlp_test``. The file's base
    name (without extension) is printed as a progress marker.

    Args:
        filepath: glob pattern, e.g. ``'./test_data/TP_tests/*.txt'``.

    Yields:
        ``(docname, processed)`` pairs, where ``docname`` is the file name
        without directory and extension and ``processed`` is whatever
        ``nlp_test`` returns for the file's text. Files are visited in sorted
        path order.
    """
    # glob returns matches in arbitrary order; sort so that the rendered
    # output is the same on every run and platform.
    for file in sorted(glob.glob(filepath)):
        docname = path.splitext(path.basename(file))[0]
        print(docname)
        with open(file, 'r', encoding='utf8') as f:
            doctext = f.read().replace('\n', ' ')

        yield docname, nlp_test(doctext)

In [146]:
# Annotate every .txt file in the TP_tests folder and save the entity
# visualisation as one standalone HTML page next to them.
tp_tests_gen = nlp_test_generator('./test_data/TP_tests/*.txt')

tp_tests = [annotated for _, annotated in tp_tests_gen]

html = displacy.render(tp_tests, style='ent', page=True, jupyter=False, options=displacy_options)
Path('./test_data/TP_tests/tp_tests.html').write_text(html, encoding='utf8')
print('finished')

DZ_Nr098_1900
DZ_Nr210_1900
DZ_Nr238_1901
RZ_Nr109_1907
RZ_Nr246_1907
finished


In [145]:
# Annotate every .txt file in the FP_tests folder and save the entity
# visualisation as one standalone HTML page next to them.
fp_tests_gen = nlp_test_generator('./test_data/FP_tests/*.txt')

fp_tests = [annotated for _, annotated in fp_tests_gen]

html = displacy.render(fp_tests, style='ent', page=True, jupyter=False, options=displacy_options)
Path('./test_data/FP_tests/fp_tests.html').write_text(html, encoding='utf8')
print('finished')

DZ_Nr096_1901
DZ_Nr103_1901
DZ_Nr127_1901
DZ_Nr251_1901
DZ_Nr284_1901
finished


In [12]:
# Read a single test file, flatten its line breaks to spaces and annotate it
# with the reloaded pipeline.
with open('./test_data/FP_tests/LZ_Nr002_1862.txt', 'r', encoding='utf8') as f:
    data = f.read().replace('\n', ' ')

doc_lz = nlp_test(data)

In [15]:
# Show the entities of doc_lz inline in the notebook, using the project colour scheme.
displacy.render(doc_lz, style='ent', jupyter = True, options = displacy_options)

In [19]:
# Annotate a hand-pasted sample sentence (it contains long-s characters and a
# split word) with the reloaded pipeline.
doc2 = nlp_test('Hamburg, 3. Jan. Als Vorläuferin der Unglücksbot ſchaften, welche wir von der See her nach dem letzten Sturm zu erwarten haben, iſt die heute eingetroffene Nachricht, daß das Packetſchiff „George Canning“, Hrn. R. M. Slomann')

In [20]:
# Show the entities of doc2 inline in the notebook, using the project colour scheme.
displacy.render(doc2, style='ent', jupyter = True, options = displacy_options)

In [5]:
# List the components currently in the `nlp` pipeline.
nlp.pipe_names

['tok2vec',
 'tagger',
 'morphologizer',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [9]:
# Add a rule-based entity ruler to the end of the pipeline. With
# overwrite_ents enabled, its matches take precedence over overlapping
# entities set by earlier components.
config = {"overwrite_ents": True}
if "entity_ruler" in nlp.pipe_names:
    # The cell was already run on this kernel: reuse the component, because
    # adding a second pipe under the same name raises an error.
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe('entity_ruler', config=config)

In [10]:
# List the pipeline components again to confirm the entity ruler was added.
nlp.pipe_names

['tok2vec',
 'tagger',
 'morphologizer',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler']

In [13]:
# Exact-match terms that the entity ruler should tag with the NAT label.
natural_phenomena = ["Sturm", "Hagel"]
ruler.add_patterns(
    [{"label": "NAT", "pattern": term} for term in natural_phenomena]
)

In [15]:
# Try the extended pipeline on a short made-up sentence mentioning both terms.
doc3 = nlp('Heute haben wir einen starken Sturm un vielen Hagel gehabt.')

In [18]:
# Print each entity found in doc3 together with its label.
for span in doc3.ents:
    print(f"{span.text} {span.label_}")

Sturm NAT
Hagel NAT


In [24]:
from spellchecker import SpellChecker

In [62]:
# Spell checker backed by the German word list.
# NOTE(review): pyspellchecker's documentation describes case_sensitive as only
# taking effect when no language dictionary is loaded — confirm whether the flag
# does anything in combination with language='de'.
spell = SpellChecker(language='de', case_sensitive=True)

In [27]:
# Load one TP test file and turn its line breaks into spaces.
with open('./test_data/TP_tests/DZ_Nr098_1900.txt', 'r', encoding='utf8') as f:
    text = f.read().replace('\n', ' ')

In [34]:
# Run the pipeline over the loaded file. This rebinds doc3, replacing the
# sample sentence processed earlier.
doc3 = nlp(text)

In [56]:
# Collect the text of every token that the spell checker reports as unknown.
misspelled = [tok.text for tok in doc3 if spell.unknown([tok.text])]
        

In [63]:
# Show each unknown word next to the corrections the spell checker proposes.
for word in misspelled:
    suggestions = spell.candidates(word)
    print(word, suggestions)

.Arensb {'karens'}
Wochenbl {'wochen'}
Segler {'segle'}
.Johannes {'johannes'}
Capitain {'captain'}
Zerelschen {'Zerelschen'}
Hafen {'haften', 'hagen', 'haken', 'halfen', 'haven', 'haufen', 'haben', 'hauen'}
ConrS {'connors', 'cents', 'cos', 'course', 'corn', 'doors', 'cars', 'ones', 'tongs', 'cora', 'vors', 'bones', 'yours', 'hours', 'longs', 'conrad', 'cops', 'contra', 'sons', 'mones', 'coles', 'conners', 'corners', 'comes', 'songs', 'jonas', 'colors', 'hors', 'fonds', 'con', 'tors'}
Windau {'linda', 'winde', 'windig', 'winds', 'wind', 'windet', 'winden', 'window'}
Bord {'bond', 'nord', 'word', 'lord', 'bird', 'born', 'borg', 'boyd', 'board', 'mord'}
fich {'fick', 'lich', 'nich', 'mich', 'rich', 'fisch', 'sich', 'ficht', 'eich', 'dich', 'finch', 'ich', 'fish', 'fach', 'wich'}
Capitain {'captain'}
Matrose {'marode', 'maßlose'}
nnd {'end', 'ind', 'ned', 'nd', 'nid', 'and', 'und'}
Lhserort {'Lhserort'}
Sturm {'storm', 'sture', 'turm', 'stur', 'stumm'}
Mannsanden {'Mannsanden'}
denTodtn 

In [48]:
# NOTE(review): `asd` is not assigned in any visible cell, so this looks like
# leftover scratch work that depends on hidden kernel state. Define it here or
# remove the cell.
asd == set()

False