In [1]:
import spacy
import pandas as pd
import gensim

In [2]:
with open('dataset.txt', 'r', errors='ignore', encoding= 'unicode_escape') as f:
    data = f.read()
    # with split remove whitespaces and multiple spaces
    text = ' '.join(data.split())

In [3]:
# load the installed model "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")

In [4]:
# processing text with the nlp object
doc = nlp(text)

In [5]:
# accessing token attributes
output = []
for token in doc:
    output.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_])

In [6]:
# represent attributes as dataframe
df = pd.DataFrame(output, columns=['Word', 'Lemma', 'POS', 'TAG', 'DEP'])

In [7]:
print(df.shape)

(139402, 5)


In [8]:
df[:10]

Unnamed: 0,Word,Lemma,POS,TAG,DEP
0,The,the,DET,DT,det
1,Fifth,Fifth,PROPN,NNP,compound
2,Elephant,Elephant,PROPN,NNP,compound
3,A,A,PROPN,NNP,compound
4,Discworld,Discworld,PROPN,NNP,compound
5,Novell,Novell,PROPN,NNP,ROOT
6,by,by,ADP,IN,prep
7,Terry,Terry,PROPN,NNP,compound
8,Pratchett,Pratchett,PROPN,NNP,pobj
9,They,they,PRON,PRP,nsubj


In [9]:
# drop duplicates and reindexing
df = df.drop_duplicates(ignore_index=True)

In [10]:
print(df.shape)

(20195, 5)


In [11]:
# store vocabulary
df.to_excel("output.xlsx")

In [12]:
# initial lemma list for word2vec
lemma_list = [token.lemma_ for token in doc]

In [13]:
# initialize and train model on lemma list
model = gensim.models.Word2Vec(
    [lemma_list],
    negative = 10, # negative sampling how many "noise words" should be drawn
    epochs = 100, # iter = 100,
    min_count = 1, # ignores all words with total frequency lower than this
    window = 7, # maximum distance between the current and predicted word
    vector_size = 40 # size = 40 # dimension of the word vector
    )

In [14]:
# show result of model work
print(model.wv['world'])

[ 1.2515882   0.8618007  -0.62189734  0.21660362  1.4357944  -0.9189028
  0.5014339   1.9640644   0.9241476   1.7720709   0.9949428  -2.8574426
 -0.41420013 -1.6200068   0.03121516  0.46067068  1.6498103  -1.5088545
  1.2730426   0.12057707  0.9383932  -0.8432872   2.2130914  -0.8925691
  1.8986632   1.070426   -0.5505121   0.9107701  -0.46397185  0.18015507
  1.8056365   0.54558706  2.3465574   0.41375023  0.09151997  1.8800701
  1.071501   -0.5528572  -0.5008159  -0.9496743 ]


In [15]:
print(model.wv.most_similar('world', topn=5))

[('build', 0.8675922751426697), ('gravitate', 0.8382787704467773), ('ever', 0.8053116202354431), ('gutte', 0.8022469282150269), ('folkway', 0.798398494720459)]


In [16]:
print(model.wv.similarity('world', 'road'))

0.7462981


In [17]:
print(model.wv.most_similar_cosmul(positive = ['world','road'], negative = ['sea']))

[('surround', 1.8114854097366333), ('stealthy', 1.7997974157333374), ('bank', 1.7719866037368774), ('gravitate', 1.7695605754852295), ('local', 1.7693874835968018), ('since', 1.73635995388031), ('ever', 1.7125179767608643), ('pass', 1.6898698806762695), ('scan', 1.6774638891220093), ('fortune', 1.6696643829345703)]
