In [None]:

from gensim.utils import simple_preprocess, simple_tokenize #text processing
from gensim.models import Word2Vec #prebuilt word to vec implementation
import glob #finds all pathnames matching a pattern, like regex
import codecs #unicode support when reading files
from multiprocessing import cpu_count #use to get number of cpus on host machine

In [3]:
# Collect every .txt file under data/GOT and put the paths in sorted order.
book_filenames = glob.glob("data/GOT/*.txt")
book_filenames.sort()
print("Found books:")
book_filenames

Found books:


['data/GOT\\data_got1.txt',
 'data/GOT\\data_got2.txt',
 'data/GOT\\data_got3.txt',
 'data/GOT\\data_got4.txt',
 'data/GOT\\data_got5.txt']

In [4]:
# Read each book as UTF-8 text and concatenate the contents, in the order
# of book_filenames, into one raw corpus string.
book_texts = []
for book_filename in book_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
corpus_raw = u"".join(book_texts)

print("Corpus is {0} characters long".format(len(corpus_raw)))

Corpus is 9719485 characters long


In [6]:
# Treat every non-empty line of the corpus as one sentence and run it
# through gensim's simple_preprocess, giving one token list per line.
sentences = [
    simple_preprocess(line)
    for line in corpus_raw.split('\n')
    if line  # skip empty strings left by blank lines
]

In [11]:
# Train a word2vec model on the tokenized sentences, using as many worker
# threads as cpu_count() reports for the host machine.
workers = cpu_count()
model = Word2Vec(
    sentences,
    size=300,       # length of each word vector
    window=5,
    min_count=5,
    workers=workers,
)

In [13]:
# Number of entries in the trained model's vocabulary.
len(model.wv.vocab) #size of vocab

11766

In [15]:
# model.wv.vectors is the word-embedding matrix; display its shape.
model.wv.vectors.shape  # how we can access the word embeddings matrix

(11766, 300)

In [22]:
'snow' in model.wv.vocab #check if 'snow' is in vocab

True

In [23]:
model.wv['snow'] #get word vector for 'snow'

array([ 3.71322662e-01,  5.87369502e-01,  5.76531708e-01, -2.94173092e-01,
        2.49198228e-01,  7.75276050e-02, -2.05042094e-01,  4.02523607e-01,
        1.24497890e-01, -1.21049654e+00, -7.41286814e-01,  1.81025475e-01,
        7.47166753e-01,  4.92221504e-01, -2.92573065e-01, -4.59385604e-01,
       -6.87930048e-01,  3.99636149e-01, -9.76110637e-01, -7.80443311e-01,
        1.69991106e-02,  7.95198977e-01, -7.58907944e-02, -1.21287286e-01,
       -3.20774466e-01,  5.59173711e-02,  1.41246212e+00,  4.20744032e-01,
       -1.48440465e-01, -1.55406147e-02, -6.03617728e-01, -1.22083855e+00,
        4.57578719e-01,  3.47050846e-01, -1.05461374e-01, -7.84770191e-01,
       -6.84181213e-01, -3.36129457e-01, -6.37376249e-01,  1.18244238e-01,
        2.32397825e-01, -3.89529198e-01,  3.19525987e-01,  1.38509357e+00,
        4.33366627e-01, -6.55730903e-01,  6.70348048e-01,  1.06394029e+00,
        2.76381016e-01,  5.19534834e-02,  5.61959326e-01,  1.03538513e+00,
        3.54425609e-02,  

In [27]:
# List the words most similar to 'mother', each with its similarity score.
model.wv.most_similar('mother')

[('sister', 0.8614943027496338),
 ('aunt', 0.7734062671661377),
 ('husband', 0.7714309692382812),
 ('wife', 0.7531055808067322),
 ('daughter', 0.7242030501365662),
 ('father', 0.7141742706298828),
 ('brother', 0.7111056447029114),
 ('daughters', 0.6845688223838806),
 ('sisters', 0.6797002553939819),
 ('bride', 0.6733927726745605)]

In [31]:
# Analogy query: words similar to 'king' and 'woman' but dissimilar to 'man'.
model.wv.most_similar(positive=['king','woman'], negative=['man']) 

[('queen', 0.6956346035003662),
 ('prince', 0.6224972009658813),
 ('princess', 0.5522409677505493),
 ('targaryen', 0.5088884830474854),
 ('imp', 0.5050137042999268),
 ('mother', 0.5020624399185181),
 ('daenerys', 0.4996476173400879),
 ('myrcella', 0.49528080224990845),
 ('stark', 0.49465084075927734),
 ('elia', 0.48885583877563477)]

In [32]:
from sklearn.manifold import TSNE #from dimensionality reduction
import pandas as pd 

In [34]:
# Reduce the first n rows of the embedding matrix to two dimensions with
# t-SNE so they can be drawn on a 2-D plot.
n = 3000
tsne = TSNE(
    n_components=2,
    perplexity=3,
    random_state=0,  # fixed seed
)
tsne_vectors = tsne.fit_transform(model.wv.vectors[:n])

In [36]:
# First n entries of the model's index-to-word list (the same n used to
# slice the vectors passed to t-SNE).
words = model.wv.index2word[:n]

In [20]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
# Configure Bokeh so that show() renders plots inline in notebook cells.
output_notebook()

# NOTE: the cells below this line should be copied and run again on the local host

In [None]:
#create a dataframe to plot with
df = pd.DataFrame(tsne_vectors,index=words,columns=['x_coord','y_coord'])
df.index.name = 'word'
df.head()

In [None]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(df)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

In [None]:
# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# plot!
show(tsne_plot);