In [1]:
!pip install gensim



You are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [3]:
# NOTE(review): simple_tokenize and punctuation are not used in the cells shown here.
from gensim.utils import simple_preprocess, simple_tokenize  # text cleaning / tokenising helpers
from gensim.models import Word2Vec  # prebuilt word2vec implementation
import glob  # finds all pathnames matching a shell-style wildcard pattern
import codecs  # Unicode-aware file reading
from multiprocessing import cpu_count  # number of CPUs on the host machine
from string import punctuation  # string containing all punctuation characters

In [3]:
# Collect every .txt file under data/ and sort the paths so the books are
# always processed in the same order.
book_filenames = sorted(glob.iglob("data/*.txt"))
print("Found books:")
book_filenames

Found books:


['data\\stephen_hawking_a_brief_history_of_time.txt']

In [5]:
# Read every book as UTF-8 and build the raw corpus in one join.
# A newline is placed between books so the last line of one book can never
# fuse with the first line of the next (the corpus is split on '\n' later).
# Joining once also avoids re-allocating a growing string on every iteration.
# With a single book the result is identical to plain concatenation.
book_texts = []
for book_filename in book_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
corpus_raw = u"\n".join(book_texts)

print("Corpus is {0} characters long".format(len(corpus_raw)))

Corpus is 387063 characters long


In [7]:
# Treat each non-empty line of the corpus as one training "sentence" and
# convert it to a list of tokens with gensim's simple_preprocess.
sentences = [
    simple_preprocess(line)
    for line in corpus_raw.split('\n')
    if line  # skip empty strings produced by blank lines
]

In [87]:
# Use one training worker per CPU reported by the host.
workers = cpu_count()
workers

4

In [72]:
# Fit the embedding model on the tokenised lines: 100-dimensional vectors,
# window=5, min_count=5, and the worker count computed from cpu_count().
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=workers,
)

In [73]:
# Number of distinct words the trained model kept in its vocabulary.
len(model.wv.vocab)

1387

In [74]:
# model.wv.vectors is the word-embedding matrix; its shape is
# (vocabulary size, vector size), matching the vocab length and size=100.
model.wv.vectors.shape

(1387, 100)

In [75]:
# Membership test: evaluates to True if 'space' is a key of the model's vocabulary.
'space'  in model.wv.vocab

True

In [76]:
# Look up the embedding for 'man'; the model was built with size=100, so this
# displays a 100-element array.
model.wv['man']

array([-0.01465496,  0.01855453,  0.0022719 , -0.06112096, -0.02817178,
        0.04477079,  0.04597754, -0.03425493, -0.0090222 , -0.04977283,
       -0.07500474,  0.0224941 , -0.02319601,  0.01396265, -0.01555628,
        0.01490307,  0.00677013,  0.06195471,  0.01085262, -0.0126013 ,
       -0.07233048,  0.09994438,  0.03814352, -0.04395657, -0.00201381,
        0.02499729, -0.06253047,  0.01077667, -0.13164003,  0.06797785,
       -0.02049701, -0.01650673, -0.01370123,  0.08436047,  0.05432375,
        0.07485294,  0.04716923, -0.02200927, -0.08238328, -0.08254569,
        0.02941178,  0.01398645,  0.0251173 ,  0.0441589 , -0.00354505,
       -0.03939682,  0.00875462,  0.03850073, -0.03537133,  0.05729111,
       -0.01879443,  0.01179119,  0.0062182 , -0.02885175, -0.02580679,
       -0.02727155,  0.01287902,  0.02287446, -0.04475835, -0.07670359,
       -0.07455707, -0.06913875,  0.01325182, -0.02264668,  0.08108921,
       -0.01915772, -0.07557271,  0.09637232, -0.00984656,  0.11

In [77]:
# Words ranked closest to 'relativity' in the embedding space, with their scores.
# NOTE(review): every score in the recorded output is ~0.999, which looks like
# an undertrained model on a small corpus — consider more training epochs; confirm.
model.wv.most_similar('relativity')

[('quantum', 0.9994014501571655),
 ('general', 0.999215841293335),
 ('mechanics', 0.9992020130157471),
 ('according', 0.9990145564079285),
 ('principle', 0.9990142583847046),
 ('when', 0.9989587068557739),
 ('laws', 0.9989577531814575),
 ('means', 0.998950183391571),
 ('light', 0.9989355206489563),
 ('boundary', 0.99889075756073)]

In [78]:
# Analogy-style query: rank words using 'space' and 'escape' as positive
# examples and 'me' as a negative example.
model.wv.most_similar(positive=['space','escape'], negative=['me'])

[('imaginary', 0.998525857925415),
 ('arrow', 0.9983617067337036),
 ('figure', 0.9967396855354309),
 ('didn', 0.9965739846229553),
 ('pulse', 0.9965206384658813),
 ('college', 0.9964756965637207),
 ('tube', 0.9964718222618103),
 ('creation', 0.9964587688446045),
 ('piccadilly', 0.9955571889877319),
 ('discussed', 0.9955341815948486)]

In [79]:
from sklearn.manifold import TSNE  # for dimensionality reduction
import pandas as pd  # tabular container for the projected coordinates

In [None]:

# Project the first n embedding rows down to two dimensions for plotting.
# random_state is fixed so the layout is repeatable.
n = 1000
tsne = TSNE(
    n_components=2,
    perplexity=3,
    random_state=0,
)
tsne_vectors = tsne.fit_transform(model.wv.vectors[:n])

In [None]:
# Labels for the projected points: the first n entries of index2word, the same
# slice that was taken from model.wv.vectors for t-SNE.
# assumes index2word is ordered like the rows of model.wv.vectors — TODO confirm
words = model.wv.index2word[:n]
    




In [92]:
from bokeh.plotting import figure, show, output_notebook  # figure factory and display helpers
from bokeh.models import HoverTool, ColumnDataSource, value  # hover tooltips, data wrapper, literal-value helper
# Called once before any show(); presumably directs Bokeh to render inline in
# the notebook — confirm against the Bokeh docs.
output_notebook()


In [93]:
# Assemble the plotting frame: one row per word, indexed by the word itself
# (index named 'word'), holding its two t-SNE coordinates.
df = pd.DataFrame(
    tsne_vectors,
    index=pd.Index(words, name='word'),
    columns=['x_coord', 'y_coord'],
)
df.head()

Unnamed: 0_level_0,x_coord,y_coord
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,-36.033222,-51.23185
of,-57.165684,-25.074642
to,-19.744776,-81.412506
in,-11.125856,-51.300083
that,-31.442764,-81.315987


In [94]:
# Wrap the DataFrame so Bokeh glyphs can refer to its columns by name.
plot_data = ColumnDataSource(df)

# Build an 800x800 figure with pan/zoom/select/reset tools; the mouse wheel
# is bound to zooming.
tsne_plot = figure(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=(u'pan, wheel_zoom, box_zoom,'
           u'box_select, reset'),
    active_scroll=u'wheel_zoom',
)

In [99]:
# NOTE(review): this cell mutates the figure created in the previous cell, so
# re-running it on its own appears to stack another hover tool and glyph on the
# same figure — re-run the figure cell first.

# Show the word under the cursor on roll-over.
tsne_plot.add_tools(HoverTool(tooltips=u'@word'))

# One translucent blue circle per word, outlined in black while hovered.
tsne_plot.circle(
    u'x_coord',
    u'y_coord',
    source=plot_data,
    color=u'blue',
    line_alpha=0.2,
    fill_alpha=0.1,
    size=10,
    hover_line_color=u'black',
)

# Strip the chart chrome (grid, outline, axes) and enlarge the title.
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.title.text_font_size = value(u'16pt')

# Render the figure.
show(tsne_plot);

