In [1]:
# Standard-library imports first, then third-party packages.
import logging
import os

import gensim
import nltk
from nltk.corpus import brown, movie_reviews, treebank

# Log at INFO level with a timestamp so gensim's loading/training progress
# messages show up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Sentences of the Brown corpus; used below as the Word2Vec training input.
corpus = brown.sents()

In [2]:
# Path of the on-disk cache for the trained Brown-corpus model.
fname = 'brown_skipgram.model'
if os.path.exists(fname):
    # Load the file if it has already been trained, to save repeating the slow
    # training step below.
    # NOTE: a cache file written before `sg=1` was added below holds CBOW
    # vectors; delete it to force a skip-gram retrain.
    model = gensim.models.Word2Vec.load(fname)
else:
    # Can take a few minutes, grab a cuppa.
    # sg=1 selects the skip-gram architecture, matching the file name; gensim's
    # default (sg=0) would silently train a CBOW model instead.
    model = gensim.models.Word2Vec(
        corpus,
        size=100,      # dimensionality of the word vectors
        min_count=5,   # ignore words seen fewer than 5 times
        workers=2,     # training threads
        iter=50,       # passes over the corpus
        sg=1,
    )
    model.save(fname)

2018-06-21 10:14:06,729 : INFO : loading Word2Vec object from brown_skipgram.model
2018-06-21 10:14:06,876 : INFO : loading wv recursively from brown_skipgram.model.wv.* with mmap=None
2018-06-21 10:14:06,877 : INFO : setting ignored attribute vectors_norm to None
2018-06-21 10:14:06,878 : INFO : loading vocabulary recursively from brown_skipgram.model.vocabulary.* with mmap=None
2018-06-21 10:14:06,878 : INFO : loading trainables recursively from brown_skipgram.model.trainables.* with mmap=None
2018-06-21 10:14:06,879 : INFO : setting ignored attribute cum_table to None
2018-06-21 10:14:06,880 : INFO : loaded brown_skipgram.model


In [15]:
# Probe vocabulary: print the similarity score for every ordered pair of words
# (self-pairs included, so each word is also compared with itself).
words = [
    "horse", "barn", "church", "fence", "tree", "building", "background",
    "hotel", "in", "is", "color", "grey", "behind",
]
for w1, w2 in ((first, second) for first in words for second in words):
    print(w1, w2, model.wv.similarity(w1, w2))

horse horse 1.0
horse barn 0.32499087867130183
horse church 0.19455762502706284
horse fence 0.3485812494105681
horse tree 0.25745871781519
horse building 0.05488356081858706
horse background -0.0021641229785151553
horse hotel 0.22747970321724675
horse in -0.05262146685371784
horse is -0.25475306300475514
horse color 0.029466336324977034
horse grey 0.07331464688233767
horse behind 0.22414768959252807
barn horse 0.32499087867130183
barn barn 0.9999999999999999
barn church 0.12043223081598908
barn fence 0.5768948773238054
barn tree 0.388676357787483
barn building 0.2882358892220665
barn background -0.03608052423298633
barn hotel 0.30350682032633913
barn in -0.12016775020099527
barn is -0.17665657087635822
barn color 0.05147154772942686
barn grey 0.25764990408288957
barn behind 0.14193280911250136
church horse 0.19455762502706284
church barn 0.12043223081598908
church church 1.0
church fence 0.03428438688864162
church tree 0.05256823662859336
church building 0.27783003584400784
church back

In [16]:
# Spot-check a single word pair; a score very close to 0 means the model sees
# the two words as essentially unrelated.
model.wv.similarity("is", "color")

7.71031881237165e-05

## Another example: training a Word2Vec model

In [5]:
import nltk
from gensim.models import Word2Vec
# from gensim.corpora import WikiCorpus

# Make sure the NLTK resources used by this cell are available locally.
for resource in ('abc', 'punkt'):
    nltk.download(resource)
from nltk.corpus import abc

# Train a model on the ABC corpus sentences using gensim's default settings.
b = Word2Vec(sentences=abc.sents())
# Alternative corpora that could be trained the same way:
# mr = Word2Vec(movie_reviews.sents())
# t = Word2Vec(treebank.sents())

2018-06-21 10:14:07,110 : INFO : collecting all words and their counts


[nltk_data] Downloading package abc to /Users/huilyu/nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package punkt to /Users/huilyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2018-06-21 10:14:07,121 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-21 10:14:07,834 : INFO : PROGRESS: at sentence #10000, processed 266283 words, keeping 15327 word types
2018-06-21 10:14:08,489 : INFO : PROGRESS: at sentence #20000, processed 530624 words, keeping 25441 word types
2018-06-21 10:14:09,066 : INFO : collected 31885 word types from a corpus of 766811 raw words and 29059 sentences
2018-06-21 10:14:09,067 : INFO : Loading a fresh vocabulary
2018-06-21 10:14:09,097 : INFO : min_count=5 retains 10363 unique words (32% of original 31885, drops 21522)
2018-06-21 10:14:09,098 : INFO : min_count=5 leaves 730000 word corpus (95% of original 766811, drops 36811)
2018-06-21 10:14:09,128 : INFO : deleting the raw counts dictionary of 31885 items
2018-06-21 10:14:09,130 : INFO : sample=0.001 downsamples 43 most-common words
2018-06-21 10:14:09,131 : INFO : downsampling leaves estimated 540590 word corpus (74.1% of prior 730000)
2018-06-21 10:14:

In [6]:
# Five nearest neighbours of 'money' in the ABC-corpus model.
# Query through `.wv` (the KeyedVectors): calling most_similar on the model
# object itself is deprecated and triggers a DeprecationWarning.
b.wv.most_similar('money', topn=5)

  """Entry point for launching an IPython kernel.
2018-06-21 10:14:19,587 : INFO : precomputing L2-norms of word weight vectors


[('information', 0.9287192821502686),
 ('put', 0.9198907613754272),
 ('difficult', 0.9064333438873291),
 ('us', 0.8973759412765503),
 ('themselves', 0.892667293548584)]

In [7]:
# Twenty nearest neighbours of 'apple' in the ABC-corpus model.
# Query through `.wv` (the KeyedVectors): calling most_similar on the model
# object itself is deprecated and triggers a DeprecationWarning.
b.wv.most_similar('apple', topn=20)

  """Entry point for launching an IPython kernel.


[('Cattle', 0.9784817099571228),
 ('pear', 0.9743441343307495),
 ('chicken', 0.9670129418373108),
 ('fishermen', 0.962110161781311),
 ('Kangaroo', 0.9618431925773621),
 ('Table', 0.9606196284294128),
 ('Quarantine', 0.9605086445808411),
 ('banana', 0.9605034589767456),
 ('Desert', 0.9591423869132996),
 ('Eastern', 0.9587815999984741),
 ('Eight', 0.9577393531799316),
 ('Sheep', 0.9569904208183289),
 ('cherry', 0.9564056396484375),
 ('Grain', 0.9560607075691223),
 ('beekeepers', 0.9554308652877808),
 ('representatives', 0.9554122686386108),
 ('lamb', 0.9552083015441895),
 ('Fruit', 0.9542449712753296),
 ('NT', 0.9540547132492065),
 ('City', 0.953415036201477)]

In [8]:
# Twenty nearest neighbours of 'strawberry' in the ABC-corpus model.
# Query through `.wv` (the KeyedVectors): calling most_similar on the model
# object itself is deprecated and triggers a DeprecationWarning.
b.wv.most_similar('strawberry', topn=20)

  """Entry point for launching an IPython kernel.


[('ionosphere', 0.9625688195228577),
 ('Tokyo', 0.9618353247642517),
 ('Shakespeare', 0.9605059623718262),
 ('1983', 0.9519456624984741),
 ('heating', 0.9518120288848877),
 ('youth', 0.9511633515357971),
 ('rodeo', 0.9509605169296265),
 ('core', 0.9501490592956543),
 ('Greek', 0.9499202966690063),
 ('copper', 0.9494541883468628),
 ('bulls', 0.9493855237960815),
 ('Medal', 0.9492664337158203),
 ('Brunini', 0.9466891884803772),
 ('September', 0.9463322758674622),
 ('voters', 0.9454492926597595),
 ('mineral', 0.9454271793365479),
 ('workplace', 0.945389986038208),
 ('dried', 0.9452536702156067),
 ('feedlot', 0.9450489282608032),
 ('political', 0.9440691471099854)]