In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# get single author's texts
with open("../txt/marx_engels/marx_engels_total.txt") as f:
    me_total = f.read()
with open("../txt/trotzki/trotzki_total.txt") as f:
    trotzki_total = f.read()
with open("../txt/stalin/stalin_total.txt") as f:
    stalin_total = f.read()

In [3]:
# get whole corpus
with open("../txt/all.txt") as f:
    all_total = f.read()

In [4]:
def prepare(text):
    sents = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sents]
    return words

In [6]:
# prepare single author's texts
me_prepared = prepare(me_total)
trotzki_prepared = prepare(trotzki_total)
stalin_prepared = prepare(stalin_total)

In [5]:
# prepare whole corpus
all_prepared = prepare(all_total)

In [13]:
# get reference corpus
with open("german-amazon-reviews.txt", encoding="iso-8859-15") as f:
    amazon = f.read()
amazon_prepared = prepare(amazon)

In [6]:
# build model for whole corpus
model_all = Word2Vec(all_prepared, size=300)

In [32]:
model_all.wv.most_similar(positive=["Liberalismus", "Proletariat"], negative=["Bourgeoisie"], topn=100)

[('Judentum', 0.5863721966743469),
 ('Triumph', 0.5830175876617432),
 ('Internationalismus', 0.5678070783615112),
 ('Evangelium', 0.5672754049301147),
 ('Terrorismus', 0.563199520111084),
 ('Auftreten', 0.5511847734451294),
 ('Fundament', 0.5495161414146423),
 ('Zektran', 0.5463460683822632),
 ('Kommunismus', 0.5422465205192566),
 ('Typ', 0.5408061146736145),
 ('Ausgang', 0.5359870195388794),
 ('Sowjetstaat', 0.5356222987174988),
 ('Demokratismus', 0.5353365540504456),
 ('Studium', 0.5315435528755188),
 ('Kräfteverhältnis', 0.5297054052352905),
 ('Pazifismus', 0.5295162200927734),
 ('Milieu', 0.5292493104934692),
 ('Umsturz', 0.5266759991645813),
 ('Trotzkismus', 0.5249322652816772),
 ('Wunsche', 0.5240627527236938),
 ('Chartismus', 0.5236687660217285),
 ('Bürokratismus', 0.5232359170913696),
 ('Parlamentarismus', 0.522665798664093),
 ('Rechtsopportunismus', 0.5224024653434753),
 ('Fortschreiten', 0.521385908126831),
 ('Pickelhäring', 0.5205150842666626),
 ('Egoismus', 0.52005732059478

In [41]:
# build models for single author's texts
print("me")
model_me = Word2Vec(me_prepared, size=300)
print("trotzki")
model_trotzki = Word2Vec(trotzki_prepared, size=300)
print("stalin")
model_stalin = Word2Vec(stalin_prepared, size=300)
print("amazon")
model_amazon = Word2Vec(amazon_prepared, size=300)

me
trotzki
stalin
amazon


In [57]:
# get lists of similar words for single author's texts
word = "Armee"
n = 10

sim_me = model_me.wv.similar_by_vector(word, topn=n)
print(sim_me)
sim_me = {elem[0]:i+1 for i, elem in enumerate(sim_me)}

sim_t = model_trotzki.wv.similar_by_vector(word, topn=n)
print(sim_t)
sim_t = {elem[0]:i+1 for i, elem in enumerate(sim_t)}

sim_s = model_stalin.wv.similar_by_vector(word, topn=n)
print(sim_s)
sim_s = {elem[0]:i+1 for i, elem in enumerate(sim_s)}

sim_a = model_amazon.wv.similar_by_vector(word, topn=n)
print(sim_a)
sim_a = {elem[0]:i+1 for i, elem in enumerate(sim_a)}

[('Flotte', 0.818668007850647), ('Truppen', 0.8157972097396851), ('Kavallerie', 0.7579588890075684), ('Artillerie', 0.7291944622993469), ('Position', 0.7245581150054932), ('Streitkräfte', 0.7153795957565308), ('Festung', 0.714713454246521), ('Garnison', 0.7123770117759705), ('Infanterie', 0.7063800096511841), ('Schlacht', 0.7007136940956116)]
[('Bürokratie', 0.9351111650466919), ('Sozialdemokratie', 0.9092366099357605), ('Bourgeoisie', 0.9002584218978882), ('Arbeiterklasse', 0.898350715637207), ('Intelligenz', 0.8885919451713562), ('Garnison', 0.8768906593322754), ('Bauernschaft', 0.8748764395713806), ('UdSSR', 0.8641899824142456), ('Bewegung', 0.8611583709716797), ('Wirtschaft', 0.8566991090774536)]
[('Heimat', 0.7084632515907288), ('Flotte', 0.6961424946784973), ('Truppen', 0.6792908906936646), ('Aufbauarbeit', 0.6564610600471497), ('Armeen', 0.6428441405296326), ('Front', 0.6264467239379883), ('Gewerkschaftsinternationale', 0.6186168193817139), ('Funktionäre', 0.6159325242042542), (

In [17]:
def compare_lists(l1, l2):
    d1 = {elem[0]: i+1 for i, elem in enumerate(l1)}
    d2 = {elem[0]: i+1 for i, elem in enumerate(l2)}
    s1 = set(d1)
    s2 = set(d2)

    res = 0
    for elem in s1.intersection(s2):
        res += (len(l1) - abs(d1[elem] - d2[elem])) / len(l1)**2
    return res

In [58]:
# print similarities between single authors
print("Trotzki:", compare_lists(sim_me, sim_t))
print("Stalin: ", compare_lists(sim_me, sim_s))
print("T-S   : ", compare_lists(sim_s, sim_t))
print("Amazon: ", compare_lists(sim_me, sim_a))

Trotzki: 0.26
Stalin:  0.44999999999999996
T-S   :  0.18
Amazon:  0.13
