In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# get single author's texts
with open("../txt/marx_engels/marx_engels_total.txt") as f:
    me_total = f.read()
with open("../txt/trotzki/trotzki_total.txt") as f:
    trotzki_total = f.read()
with open("../txt/stalin/stalin_total.txt") as f:
    stalin_total = f.read()

In [2]:
# get whole corpus
with open("../txt/all.txt") as f:
    all_total = f.read()

In [3]:
def prepare(text):
    sents = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sents]
    return words

In [6]:
# prepare single author's texts
me_prepared = prepare(me_total)
trotzki_prepared = prepare(trotzki_total)
stalin_prepared = prepare(stalin_total)

In [4]:
# prepare whole corpus
all_prepared = prepare(all_total)

In [13]:
# get reference corpus
with open("german-amazon-reviews.txt", encoding="iso-8859-15") as f:
    amazon = f.read()
amazon_prepared = prepare(amazon)

In [5]:
# build model for whole corpus
model_all = Word2Vec(all_prepared, size=300)

In [7]:
# get analogies
model_all.wv.most_similar(positive=["Junge", "Frau"], negative=["Mann"], topn=100)

[('Perrier', 0.6235110759735107),
 ('Stuart', 0.6121819019317627),
 ('Staatsprokurator', 0.6116944551467896),
 ('gläubige', 0.608957052230835),
 ('Krummacher', 0.6083165407180786),
 ('Thompson', 0.6060768365859985),
 ('Madame', 0.6056742072105408),
 ('Fould', 0.6023613810539246),
 ('Clément', 0.6016688942909241),
 ('Tochter', 0.597574770450592),
 ('Advokat', 0.5965981483459473),
 ('Weerth', 0.5941691398620605),
 ('Gagern', 0.5936335921287537),
 ('Aristokrat', 0.5928776264190674),
 ('Schimmelpfennig', 0.5920757055282593),
 ('Demokrat', 0.5877822041511536),
 ('Student', 0.5867450833320618),
 ('Cabet', 0.5863481163978577),
 ('Sasonow', 0.5862720608711243),
 ('Haller', 0.5827111005783081),
 ('älteste', 0.5825262069702148),
 ('Howell', 0.5817673206329346),
 ('stoische', 0.5815796256065369),
 ('Inspektor', 0.5808447599411011),
 ('Paléologue', 0.5802479386329651),
 ('ehrwürdige', 0.58016037940979),
 ('Thomas', 0.57985919713974),
 ('Odilon', 0.5797971487045288),
 ('Dudley', 0.5792602300643921)

In [41]:
# build models for single author's texts
print("me")
model_me = Word2Vec(me_prepared, size=300)
print("trotzki")
model_trotzki = Word2Vec(trotzki_prepared, size=300)
print("stalin")
model_stalin = Word2Vec(stalin_prepared, size=300)
print("amazon")
model_amazon = Word2Vec(amazon_prepared, size=300)

me
trotzki
stalin
amazon


In [57]:
# get lists of similar words for single author's texts
word = "Armee"
n = 10

sim_me = model_me.wv.similar_by_vector(word, topn=n)
print(sim_me)
sim_me = {elem[0]:i+1 for i, elem in enumerate(sim_me)}

sim_t = model_trotzki.wv.similar_by_vector(word, topn=n)
print(sim_t)
sim_t = {elem[0]:i+1 for i, elem in enumerate(sim_t)}

sim_s = model_stalin.wv.similar_by_vector(word, topn=n)
print(sim_s)
sim_s = {elem[0]:i+1 for i, elem in enumerate(sim_s)}

sim_a = model_amazon.wv.similar_by_vector(word, topn=n)
print(sim_a)
sim_a = {elem[0]:i+1 for i, elem in enumerate(sim_a)}

[('Flotte', 0.818668007850647), ('Truppen', 0.8157972097396851), ('Kavallerie', 0.7579588890075684), ('Artillerie', 0.7291944622993469), ('Position', 0.7245581150054932), ('Streitkräfte', 0.7153795957565308), ('Festung', 0.714713454246521), ('Garnison', 0.7123770117759705), ('Infanterie', 0.7063800096511841), ('Schlacht', 0.7007136940956116)]
[('Bürokratie', 0.9351111650466919), ('Sozialdemokratie', 0.9092366099357605), ('Bourgeoisie', 0.9002584218978882), ('Arbeiterklasse', 0.898350715637207), ('Intelligenz', 0.8885919451713562), ('Garnison', 0.8768906593322754), ('Bauernschaft', 0.8748764395713806), ('UdSSR', 0.8641899824142456), ('Bewegung', 0.8611583709716797), ('Wirtschaft', 0.8566991090774536)]
[('Heimat', 0.7084632515907288), ('Flotte', 0.6961424946784973), ('Truppen', 0.6792908906936646), ('Aufbauarbeit', 0.6564610600471497), ('Armeen', 0.6428441405296326), ('Front', 0.6264467239379883), ('Gewerkschaftsinternationale', 0.6186168193817139), ('Funktionäre', 0.6159325242042542), (

In [17]:
def compare_lists(l1, l2):
    d1 = {elem[0]: i+1 for i, elem in enumerate(l1)}
    d2 = {elem[0]: i+1 for i, elem in enumerate(l2)}
    s1 = set(d1)
    s2 = set(d2)

    res = 0
    for elem in s1.intersection(s2):
        res += (len(l1) - abs(d1[elem] - d2[elem])) / len(l1)**2
    return res

In [58]:
# print similarities between single authors
print("Trotzki:", compare_lists(sim_me, sim_t))
print("Stalin: ", compare_lists(sim_me, sim_s))
print("T-S   : ", compare_lists(sim_s, sim_t))
print("Amazon: ", compare_lists(sim_me, sim_a))

Trotzki: 0.26
Stalin:  0.44999999999999996
T-S   :  0.18
Amazon:  0.13
