In [1]:
from nltk import TreebankWordTokenizer
import gensim
from gensim.models import Word2Vec 
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Word2Vec expects an iterable of token lists, so two source documents are read
# here; each one is turned into a token list further down.

# The encoding is spelled out so decoding does not depend on the platform's
# locale default (assumes the saved pages are UTF-8 -- TODO confirm).
with open("/home/josh/Documents/a-library-nlp-project/theanarchistlibrary.org/library/zvonimir-kontrec-architecture-is-a-political-act.html", "r", encoding="utf-8") as file:
    html = file.read()
soup = BeautifulSoup(html, "html.parser")
with open("/home/josh/Documents/a-library-nlp-project/theanarchistlibrary.org/library/zundlumpen-will-o-the-wisps.html", "r", encoding="utf-8") as file:
    html2 = file.read()
soup2 = BeautifulSoup(html2, "html.parser")

In [3]:
# Strip the markup: keep only the text content of each parsed page.
first_small_doc, second_small_doc = soup.get_text(), soup2.get_text()

In [4]:
#first_small_doc
#print(first_small_doc)
#print(second_small_doc)

#evaluating the variable on its own shows the raw string repr, full of escaped line breaks and formatting junk
#print() renders those line breaks, so the text comes out nice and readable

In [5]:
# Collect the two plain-text documents into one corpus list.
docs = [first_small_doc, second_small_doc]
len(docs)

2

In [6]:
# Tokenize each document into one flat list of Treebank-style tokens.
# NOTE(review): NLTK documents TreebankWordTokenizer as assuming input that is
# already split into sentences, while whole documents are passed here -- confirm
# this is intended.
tokenizer = TreebankWordTokenizer()
docs_tokenized = [tokenizer.tokenize(text) for text in docs]


In [7]:
# Sanity check: there should be one token list per input document.
print(len(docs_tokenized))

2


In [8]:
#build skipgram model (sg=1)
# workers=1: gensim only documents fully repeatable training for a single worker
# thread, because multi-threaded runs reorder updates from run to run.
# seed=1 is the library default, spelled out here so the choice is visible.
# (Repeatability across interpreter launches also needs PYTHONHASHSEED fixed.)
model_sg = Word2Vec(sentences=docs_tokenized, window=5, min_count=3, workers=1, epochs=5, seed=1, sg=1)

In [9]:
# Nearest neighbours of a very frequent token in the skip-gram model.
# NOTE(review): in the recorded output every neighbour scores ~0.998, so the
# ranking carries little signal -- presumably the two-document corpus is too
# small for the vectors to separate; confirm on a larger corpus.
model_sg.wv.most_similar('the', topn=10)

[('for', 0.9980694651603699),
 ('this', 0.9979365468025208),
 ('In', 0.9979010820388794),
 ('system', 0.9978348016738892),
 ('way', 0.9978195428848267),
 ('in', 0.9977800846099854),
 ('their', 0.9977703094482422),
 ('towards', 0.9977662563323975),
 ('building', 0.9977622032165527),
 ('after', 0.9977437257766724)]

In [10]:
#build CBOW model (sg=0)
# workers=1: gensim only documents fully repeatable training for a single worker
# thread, because multi-threaded runs reorder updates from run to run.
# seed=1 is the library default, spelled out here so the choice is visible.
# (Repeatability across interpreter launches also needs PYTHONHASHSEED fixed.)
model_cbow = Word2Vec(sentences=docs_tokenized, window=5, min_count=3, workers=1, epochs=5, seed=1, sg=0)

In [11]:
# Nearest neighbours of 'media' in the CBOW model.
# NOTE(review): the recorded neighbours are function words and punctuation
# tokens, all scoring ~0.97 -- presumably another symptom of the very small
# training corpus; confirm.
model_cbow.wv.most_similar('media', topn=10)

[('through', 0.9726642370223999),
 ('into', 0.9719277024269104),
 ("n't", 0.9718241691589355),
 ('was', 0.9717423915863037),
 ('also', 0.9716292023658752),
 (')', 0.9716008305549622),
 ('it', 0.971388041973114),
 ('I', 0.9713448286056519),
 ('(', 0.9712851047515869),
 ('is', 0.9712598919868469)]

In [12]:
#ok, now let's do it for a few more docs and compare cosine similarity between the two models?

In [13]:

# Second mini-corpus: load two more pages, extract their text, tokenize them
# and train a separate skip-gram model on them.

# The encoding is spelled out so decoding does not depend on the platform's
# locale default (assumes the saved pages are UTF-8 -- TODO confirm).
with open("/home/josh/Documents/a-library-nlp-project/theanarchistlibrary.org/library/zundlumpen-radical-left-i-m-breaking-up-with-you.html", "r", encoding="utf-8") as file:
    html3 = file.read()
soup3 = BeautifulSoup(html3, "html.parser")
with open("/home/josh/Documents/a-library-nlp-project/theanarchistlibrary.org/library/zosia-brom-insert-topic-of-the-day-has-divided-anarchists.html", "r", encoding="utf-8") as file:
    html4 = file.read()
soup4 = BeautifulSoup(html4, "html.parser")

#convert html to text
third_small_doc = soup3.get_text()
fourth_small_doc = soup4.get_text()

#this could be decade folders when i build this out
# (the bare mid-cell len() call was dropped: only a cell's last expression is
# displayed, so it had no visible effect)
docs_next_corpus = [third_small_doc, fourth_small_doc]

# Reuses the `tokenizer` instance created in the earlier tokenize cell.
docs_next_tokenized = [tokenizer.tokenize(text) for text in docs_next_corpus]

# workers=1: gensim only documents fully repeatable training for a single worker
# thread; seed=1 is the library default, spelled out so the choice is visible.
model_sg_next = Word2Vec(sentences=docs_next_tokenized, window=5, min_count=3, workers=1, epochs=5, seed=1, sg=1)
model_sg_next.wv.most_similar('the', topn=10)

[('and', 0.9982710480690002),
 ('ideas', 0.9982483386993408),
 ('left', 0.9982380867004395),
 ('many', 0.9981945157051086),
 ('by', 0.9981881380081177),
 ('to', 0.9981729984283447),
 (',', 0.9981165528297424),
 ('as', 0.998098611831665),
 ('on', 0.9980598092079163),
 ('radical', 0.9980372786521912)]

In [14]:
# 'political' is looked up in both models in this notebook, so it serves as a
# shared probe word; this cell lists its top 10 neighbours in the second
# corpus' model (model_sg_next).
model_sg_next.wv.most_similar('political', topn=10)

[('the', 0.9978370070457458),
 ('many', 0.9977409243583679),
 ('a', 0.997643232345581),
 ('that', 0.9976242780685425),
 ('but', 0.9976202845573425),
 ('we', 0.997607409954071),
 ('or', 0.9975824356079102),
 ('by', 0.9975618124008179),
 ('communist', 0.9975566864013672),
 ('from', 0.9975111484527588)]

In [15]:
# Same 'political' query against the first corpus' skip-gram model (model_sg),
# for side-by-side reading with the model_sg_next result.
model_sg.wv.most_similar('political', topn=10)

[('backwards', 0.9973574280738831),
 ('anti-civilizational', 0.9973565936088562),
 ('thought', 0.9972497224807739),
 ('were', 0.9972323179244995),
 ('life', 0.9972214698791504),
 ('In', 0.9971413016319275),
 ('Instead', 0.997123122215271),
 ('within', 0.9971001148223877),
 ('so', 0.9970480799674988),
 ('it', 0.9970154166221619)]

In [16]:
# Pull the embedding of each probe word out of both skip-gram models
# (suffix _1 = model_sg, suffix _2 = model_sg_next).
vector_pol_1, vector_pol_2 = model_sg.wv['political'], model_sg_next.wv['political']

vector_the_1, vector_the_2 = model_sg.wv['the'], model_sg_next.wv['the']

In [17]:
#check structure of vector
#vector_pol_1

In [18]:
def reshape(vector):
    """Return ``vector`` laid out as a single row.

    Calls ``vector.reshape(1, -1)``: one row, with the number of columns
    inferred from the input's size. This is the 2-D layout that the
    ``cosine_similarity`` calls in this notebook are given.
    """
    return vector.reshape(1, -1)

In [19]:
# Turn both 'political' vectors into single-row 2-D arrays for cosine_similarity.
vector_pol_1, vector_pol_2 = reshape(vector_pol_1), reshape(vector_pol_2)

In [20]:
# NOTE(review): vector_pol_1 and vector_pol_2 come from two separately trained
# models (model_sg and model_sg_next). Independently trained Word2Vec spaces are
# not aligned with each other, so this score is probably not interpretable as a
# change in meaning unless the spaces are aligned first (e.g. orthogonal
# Procrustes) -- confirm before drawing conclusions.
cosine_similarity(vector_pol_1, vector_pol_2)

array([[0.7454266]], dtype=float32)

In [21]:
# Turn both 'the' vectors into single-row 2-D arrays for cosine_similarity.
vector_the_1, vector_the_2 = reshape(vector_the_1), reshape(vector_the_2)

In [22]:
# NOTE(review): cross-model comparison again (model_sg vs model_sg_next). The
# two embedding spaces were trained independently and are not aligned, so treat
# this number with caution -- confirm whether an alignment step is needed.
cosine_similarity(vector_the_1, vector_the_2)

array([[0.7461838]], dtype=float32)

In [23]:
# Same-model comparison: both vectors were taken from model_sg, so they live in
# one embedding space.
cosine_similarity(vector_the_1, vector_pol_1)

array([[0.9966311]], dtype=float32)

In [24]:
# Cross-check using gensim's own similarity call on the same model and word
# pair; the recorded value agrees with the cosine_similarity result for
# vector_the_1 / vector_pol_1 to about six decimal places.
model_sg.wv.similarity("the", "political")

0.99663097