In [1]:
import spacy
from spacy import displacy
import pandas as pd
import numpy as np

In [2]:
# Adds an NER component to the blank 'de' model to which vectors have already
# been added from the command line (left commented out; presumably a one-off setup step).

# nlp = spacy.load('../data/models/main/')
# nlp.add_pipe('ner')
# nlp.to_disk('../data/models/main/')

In [70]:
# Load the spaCy pipeline saved under main_2/model-best; example() runs documents through this `nlp`.
nlp = spacy.load('../data/models/main_2/model-best/')

In [7]:
nlp.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1fd8fc58fa0>)]

In [59]:
# Load the 'de_core_news_md' pipeline by name.
# NOTE(review): `de_core_news` is not referenced by any other visible cell — confirm it is still needed.
de_core_news = spacy.load('de_core_news_md')

In [9]:
# Load the sample corpus; example() reads its `full_text` and `year` columns.
df = pd.read_parquet('../data/processed/RZ_sample.parquet')

In [65]:
# Highlight colour (hex) for each entity label rendered by displaCy.
displacy_color_code = dict(
    WEA='#4cafd9',
    PER='#ffb366',
    DAT='#bf80ff',
    LOC='#a88676',
)

# Render options: show exactly the labels that have a colour, in the same order.
displacy_options = dict(ents=list(displacy_color_code), colors=displacy_color_code)

def example(max_tries=1000):
    """Render a randomly chosen document that contains at least one 'WEA' entity.

    Rows of the module-level ``df`` are sampled at random (with replacement)
    and run through ``nlp`` until a document with a 'WEA' entity turns up.
    The row's ``year`` is then printed and the document's entities are
    rendered inline with displaCy using ``displacy_options``.

    Parameters
    ----------
    max_tries : int or None, default 1000
        Upper bound on the number of documents sampled. Pass ``None`` to
        search without a bound (the previous behaviour, which never returns
        if no document contains a 'WEA' entity).

    Raises
    ------
    ValueError
        If ``df`` is empty.
    RuntimeError
        If no document with a 'WEA' entity is found within ``max_tries`` samples.
    """
    if len(df) == 0:
        raise ValueError('df is empty; there is no document to sample')

    tries = 0
    while max_tries is None or tries < max_tries:
        tries += 1
        i = np.random.randint(len(df))
        doc = nlp(df.full_text.iloc[i])
        # A generator lets any() stop at the first matching entity.
        if any(ent.label_ == 'WEA' for ent in doc.ents):
            print(df.year.iloc[i], '\n')
            displacy.render(doc, style='ent', jupyter=True, options=displacy_options)
            return

    raise RuntimeError(f"no document with a 'WEA' entity found in {max_tries} tries")

In [81]:
example()

1830 



In [46]:
import gensim
import pandas as pd
import json
import itertools

In [3]:
# Load the word2vec vectors from a text-format file (load_word2vec_format reads text unless binary=True).
model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format('../data/models/word2vec_251021/w2v_model.txt')

In [14]:
def create_similarity_df(words, model, topn=10):
    """Build a table of the nearest neighbours of each query word.

    For every entry of ``words`` the result gets two columns: ``<word>`` with
    the neighbour tokens and ``<word> sim`` with their similarity scores
    rounded to three decimals, in the order returned by ``model.most_similar``.

    Parameters
    ----------
    words : iterable of str
        Query words.
    model : object
        Anything exposing ``most_similar(word, topn=...)`` that returns
        ``(token, score)`` pairs, e.g. a gensim ``KeyedVectors``.
    topn : int, default 10
        Number of neighbours requested per word (previously hard-coded to 10).

    Returns
    -------
    pandas.DataFrame
        One row per neighbour rank, two columns per query word.

    Notes
    -----
    Any exception raised by ``model.most_similar`` (for example for a word
    the model does not know) propagates unchanged.
    """
    data = {}
    for word in words:
        similars = model.most_similar(word, topn=topn)
        # Distinct names for the unpacked pair: the previous comprehensions
        # reused ``word`` and shadowed the loop variable.
        data[word] = [neighbour for neighbour, _ in similars]
        data[word + ' sim'] = [round(score, 3) for _, score in similars]

    return pd.DataFrame(data)

In [27]:
print(create_similarity_df(['Amerika', 'Krieg', 'schnell', 'nehmen'], model).to_latex())

\begin{tabular}{llrlrlrlr}
\toprule
{} &       Amerika &  Amerika sim &              Krieg &  Krieg sim &    schnell &  schnell sim &     nehmen &  nehmen sim \\
\midrule
0 &   Nordamerika &        0.845 &             Kriege &      0.839 &      rasch &        0.938 &     nebmen &       0.929 \\
1 &    Australien &        0.779 &            Krieges &      0.740 &    langsam &        0.758 &     uehmen &       0.907 \\
2 &    Südamerika &        0.777 &       Bürgerkriege &      0.735 &  schneller &        0.749 &   genommen &       0.893 \\
3 &        Canada &        0.755 &        Bürgerkrieg &      0.721 &    rascher &        0.743 &     nähmen &       0.882 \\
4 &   Kalifornien &        0.743 &      Angriffskrieg &      0.710 &    alsbald &        0.658 &      nehme &       0.871 \\
5 &  Nord-Amerika &        0.731 &  Vernichtungskrieg &      0.675 &      eilig &        0.655 &  nehmenden &       0.852 \\
6 &       England &        0.730 &            Feldzug &      0.671 &   sogleich

In [29]:
# Read the weather-event vocabulary from the pipeline's JSON file (UTF-8, default read mode).
with open('../pipeline/weather_events.json', encoding='utf8') as events_file:
    events = json.load(events_file)

In [44]:
# Keep, per category, only the words the embedding model has an index for (updates `events` in place).
for cat in events:
    words = events[cat]
    events[cat] = list(filter(model.has_index_for, words))

In [53]:
len(list(itertools.chain.from_iterable(events.values())))

62

In [43]:
model.has_index_for('Kältewelle')

False

In [81]:
# One column per category: lay the word lists side by side and pad the shorter ones with ''.
ensemble = pd.DataFrame.from_dict(events, orient='index').transpose().fillna('')
# Select and order the categories for the table. 'temperature' used to be listed twice, which
# duplicated that column. NOTE(review): if a different sixth category was intended, add it here.
ensemble = ensemble[['precipitations', 'wind', 'flood', 'temperature', 'general']]

In [96]:
# Weather-event terms for the LaTeX table printed below. Order matters: the list holds 44 terms
# and is sliced positionally into four table columns of 11 entries each.
clean_events = [
    'Regen',
    'Regenfall',
    'Niederschlag',
    'Regenschauer',
    'Wolkenbruch',
    'Regenguss',
    'Schnee',
    'Schneefall',
    'Dürre',
    'Hagel',
    'Hagelschlag',
    'Gewitter',
    'Nebel',
    'Wind',
    'Sturm',
    'Sturmwind',
    'Tornado',
    'Orkan',
    'Wirbelwind',
    'Wirbelsturm',
    'Unwetter',
    'Taifun',
    'Südwind',
    'Südwestwind',
    'Westwind',
    'Nordwestwind',
    'Nordwind',
    'Nordostwind',
    'Ostwind',
    'Südostwind',
    'Flut',
    'Hochwasser',
    'Überschwemmung',
    'Thau',
    'Thauwetter',
    'Kälte',
    'Wärme',
    'Hitze',
    'Frost',
    'Wetter',
    'Witterung',
    'Klima',
    'Windstärke',
    'Luftdruck']

# Split the term list into four table columns: three slices of 11 entries, then the remainder.
clean_events_dict = {
    f'col{number}': clean_events[start:stop]
    for number, (start, stop) in enumerate([(0, 11), (11, 22), (22, 33), (33, None)], start=1)
}

# Columns side by side, short ones padded with '', printed as a LaTeX tabular without the index.
print(
    pd.DataFrame.from_dict(clean_events_dict, orient='index')
    .transpose()
    .fillna('')
    .to_latex(index=False)
)

\begin{tabular}{llll}
\toprule
        col1 &        col2 &           col3 &       col4 \\
\midrule
       Regen &    Gewitter &        Südwind &       Thau \\
   Regenfall &       Nebel &    Südwestwind & Thauwetter \\
Niederschlag &        Wind &       Westwind &      Kälte \\
Regenschauer &       Sturm &   Nordwestwind &      Wärme \\
 Wolkenbruch &   Sturmwind &       Nordwind &      Hitze \\
   Regenguss &     Tornado &    Nordostwind &      Frost \\
      Schnee &       Orkan &        Ostwind &     Wetter \\
  Schneefall &  Wirbelwind &     Südostwind &  Witterung \\
       Dürre & Wirbelsturm &           Flut &      Klima \\
       Hagel &    Unwetter &     Hochwasser & Windstärke \\
 Hagelschlag &      Taifun & Überschwemmung &  Luftdruck \\
\bottomrule
\end{tabular}



In [95]:
ensemble.to_latex()

44

In [6]:
# NOTE(review): `similars` is not assigned at module level in any visible cell (it only exists as a
# local inside create_similarity_df), so this cell fails on a fresh kernel. Judging by the 30-item
# output it was presumably a model.most_similar(..., topn=30) result — confirm and define it first.
[word for (word, num) in similars]

['Warme',
 'wärmer',
 'Temperatnr',
 'ausstrahlende',
 'Wirme',
 'Gefrierpunkt',
 'Wärm',
 'Kälte',
 'kälter',
 'Sonnenwärme',
 'Reaumur',
 'wärmere',
 'Kühle',
 'Ausstrahlung',
 'Temperatur',
 'intensiv',
 'erwärmte',
 'erwärmt',
 'wärme',
 'intensiver',
 'Fahrenheit',
 'kälteren',
 'Külte',
 'feuchter',
 'Thermometers',
 'erwärmende',
 'kühlerer',
 'Rêaumur',
 'eisiger',
 'Innigkeit']

In [82]:
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL

In [84]:
spancat = DEFAULT_SPANCAT_MODEL

In [86]:
# List the default spancat model config as (key, value) pairs. The dangling `spancat.` was a
# syntax error; the cell's recorded output is a dict_items view, i.e. the result of .items().
spancat.items()

dict_items([('@architectures', 'spacy.SpanCategorizer.v1'), ('scorer', {'@layers': 'spacy.LinearLogistic.v1'}), ('reducer', {'@layers': 'spacy.mean_max_reducer.v1', 'hidden_size': 128}), ('tok2vec', {'@architectures': 'spacy.Tok2Vec.v1', 'embed': {'@architectures': 'spacy.MultiHashEmbed.v1', 'width': 96, 'rows': [5000, 2000, 1000, 1000], 'attrs': ['ORTH', 'PREFIX', 'SUFFIX', 'SHAPE'], 'include_static_vectors': False}, 'encode': {'@architectures': 'spacy.MaxoutWindowEncoder.v1', 'width': 96, 'window_size': 1, 'maxout_pieces': 3, 'depth': 4}})])