## CLASSIFICATION: UNCLASSIFIED

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import random
from glob import glob
import logging
logging.getLogger().setLevel(logging.CRITICAL)

Using TensorFlow backend.


# Using doc2vec to embed your strings into a vector space

In [2]:
# Load unclassified data

fnames = glob('/mypath/Dictionary/*')

words = list()
for fname in fnames:
    f = open(fname, 'rb').readlines()
    for ln in f:
        ln = ln.strip()
        if len(ln) > 0:
            ln = ln.split()
            try:
                word = ln[0].lower().decode('utf8')
                words.append(word)
            except:
                pass

words = list(set(words))

In [3]:
# Sample 1000

words = random.sample(words, 1000)

In [4]:
words[0:10]

['circumgyration',
 'resourceless',
 'dynamism',
 'enbattled',
 'cross-tining',
 'conservant',
 'conglutinate',
 'individualizing',
 'tintinnabulary',
 'focuses']

In [5]:
# doc2vec requires docs to be a list of TaggedDocuments
# Our words will be single letter, and the document will be a word

# We also create a mapping between the word and ID

word2id = dict()
id2word = dict()
docs = list()

for i, word in enumerate(words):
    doc = [ch for ch in word]
    docs.append(TaggedDocument(doc, [i]))
    word2id[word] = i
    id2word[i] = word

In [6]:
# Looking at the docs

docs[0:10]

[TaggedDocument(words=['c', 'i', 'r', 'c', 'u', 'm', 'g', 'y', 'r', 'a', 't', 'i', 'o', 'n'], tags=[0]),
 TaggedDocument(words=['r', 'e', 's', 'o', 'u', 'r', 'c', 'e', 'l', 'e', 's', 's'], tags=[1]),
 TaggedDocument(words=['d', 'y', 'n', 'a', 'm', 'i', 's', 'm'], tags=[2]),
 TaggedDocument(words=['e', 'n', 'b', 'a', 't', 't', 'l', 'e', 'd'], tags=[3]),
 TaggedDocument(words=['c', 'r', 'o', 's', 's', '-', 't', 'i', 'n', 'i', 'n', 'g'], tags=[4]),
 TaggedDocument(words=['c', 'o', 'n', 's', 'e', 'r', 'v', 'a', 'n', 't'], tags=[5]),
 TaggedDocument(words=['c', 'o', 'n', 'g', 'l', 'u', 't', 'i', 'n', 'a', 't', 'e'], tags=[6]),
 TaggedDocument(words=['i', 'n', 'd', 'i', 'v', 'i', 'd', 'u', 'a', 'l', 'i', 'z', 'i', 'n', 'g'], tags=[7]),
 TaggedDocument(words=['t', 'i', 'n', 't', 'i', 'n', 'n', 'a', 'b', 'u', 'l', 'a', 'r', 'y'], tags=[8]),
 TaggedDocument(words=['f', 'o', 'c', 'u', 's', 'e', 's'], tags=[9])]

In [7]:
# Intialize the model

model=Doc2Vec(size=100, window=3, min_count=1, workers=1)  
model.build_vocab(docs) 

In [10]:
# Model vocabulary:

model.wv.vocab.keys()

dict_keys(['c', 'i', 'r', 'u', 'm', 'g', 'y', 'a', 't', 'o', 'n', 'e', 's', 'l', 'd', 'b', '-', 'v', 'z', 'f', 'k', 'h', 'x', 'w', 'j', 'p', "'", 'q', '/', '\\', '8', '9', '1'])

In [13]:
# Train

for epoch in range(5000):  
    if epoch%100==0:
        print(epoch, end=",")  
    random.shuffle(docs) # shuffling improves accuracy  
    model.train(docs,total_examples=model.corpus_count,epochs=model.iter)
# Save morel to disk:
model.save('my_doc2vec.model')

#Load model from disk:
loaded_model = Doc2vec.load('my_doc2vec.model')



0,100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,2200,2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,4800,4900,

NameError: name 'Doc2vec' is not defined

In [600]:
# Examine results

sampled_word = random.sample(word2id.keys(), 1)[0]
word_id = word2id[sampled_word]

print('->', sampled_word)
for i,dist in model.docvecs.most_similar(word_id):
    print(id2word[i], dist)

-> carnate
catenarian 0.8770938515663147
catenary 0.8148835897445679
cantrap 0.7034035325050354
errantia 0.679182767868042
aret 0.6783150434494019
recharter 0.6721259951591492
earthen 0.6616954803466797
ecrasement 0.6581054925918579
ramenta 0.6570524573326111
nictitate 0.6551100015640259


In [602]:
# Predict an unseen word

model.infer_vector([ch for ch in 'thishasnotbeenseen'])  

array([ 0.0252908 , -0.00184963,  0.01034822, -0.0725349 , -0.00036861,
       -0.05427065,  0.05543349,  0.04705929,  0.13685465, -0.05237833,
       -0.02433972, -0.1571064 , -0.04413479, -0.01799181,  0.05554555,
        0.02819999,  0.12372626,  0.00089496,  0.02137826,  0.02530239,
        0.04476873,  0.06453766,  0.13744074, -0.03069505, -0.01986267,
        0.01688043, -0.00165788, -0.03812186,  0.05609811,  0.01070709,
        0.0530517 , -0.05096143, -0.01210093, -0.02673788,  0.04528841,
       -0.10980565, -0.04488931,  0.02754822,  0.03489595,  0.01706621,
        0.00252509,  0.1012056 ,  0.00429634, -0.18010156, -0.0336314 ,
        0.05466712,  0.12280104,  0.11435528, -0.01919638, -0.00593638,
        0.07689813,  0.03652444, -0.0299679 , -0.08849475, -0.09305425,
       -0.07621914, -0.04179749,  0.11866371,  0.06045969, -0.0659438 ,
       -0.05988073,  0.05995956,  0.1161134 , -0.08249649, -0.01848164,
       -0.11880788, -0.02720426, -0.02820082,  0.02717019,  0.09

In [608]:
# Get the actual vector for a word:

model.docvecs.doctag_syn0[word2id['railing']]

array([  7.91290402e-02,  -2.15354100e-01,  -1.03427410e+00,
        -6.47459447e-01,   5.85198939e-01,  -6.40803218e-01,
         1.72562733e-01,   6.07906356e-02,   4.58596051e-01,
        -3.16858202e-01,  -5.66073740e-03,   6.82961822e-01,
        -5.71416795e-01,   1.23959756e+00,   2.74155408e-01,
         2.79298872e-01,  -8.95022631e-01,   2.47595951e-01,
         3.59667465e-02,  -1.85420945e-01,   1.50854373e+00,
        -4.36332017e-01,  -1.90419659e-01,  -4.27732825e-01,
         3.58692795e-01,   3.24834399e-02,  -3.35018784e-01,
         6.44691408e-01,  -6.48786962e-01,  -1.43860149e+00,
        -3.20353627e-01,   5.65001547e-01,  -9.71344292e-01,
        -1.03948092e+00,   4.56681848e-01,  -1.51573360e+00,
        -1.92424446e-01,  -2.48525694e-01,   2.82247037e-01,
        -4.21614408e-01,  -1.54514927e-02,  -7.26724982e-01,
        -1.30672359e+00,  -5.56172729e-01,  -5.76018989e-01,
         6.63460791e-02,  -1.08765316e+00,   7.51016364e-02,
        -6.02600873e-01,