In [None]:
!pip install gensim --user

In [1]:
from gensim.test.utils import common_texts
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model
import numpy as np

## Build a custom text model

In [2]:
# Number of sentences in gensim's bundled sample corpus `common_texts`.
print(len(common_texts))

9


In [3]:
# Show the corpus itself: a list of sentences, each a list of word tokens.
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [4]:
# Three-step workflow: create an untrained model, scan the corpus to build its
# vocabulary, then fit the vectors on the same corpus.
model = FastText(min_count=1, window=3, size=4)
model.build_vocab(sentences=common_texts)
model.train(
    sentences=common_texts,
    epochs=10,
    total_examples=len(common_texts),
)

In [5]:
# Membership test against the trained vocabulary: 'computer' occurs in common_texts.
'computer' in model.wv.vocab

True

In [6]:
# Look up the learned vector for 'computer' (4 components, matching size=4 above).
model.wv['computer']

array([-0.01146877,  0.05369632, -0.04157733, -0.0052736 ], dtype=float32)

## Build a model from a corpus file

In [7]:
# Train a second model straight from a corpus file on disk rather than an in-memory list.
corpus_file = datapath('lee_background.cor')  # absolute path to the corpus file

# min_count=1 keeps every word: only words with a lower total frequency are ignored.
model3 = FastText(min_count=1, window=3, size=4)

# Scan the corpus file once to build the vocabulary.
model3.build_vocab(corpus_file=corpus_file)

# Number of words in the corpus, read from the model after the vocabulary scan.
total_words = model3.corpus_total_words

model3.train(
    corpus_file=corpus_file,
    epochs=10,
    total_words=total_words,
)

In [8]:
# Word count of the corpus, as read from model3.corpus_total_words in the previous cell.
total_words

59890

In [9]:
# Check the corpus-file model (model3) trained in this section; the original cell
# tested `model`, the earlier common_texts model, which says nothing about model3.
'computer' in model3.wv.vocab

True

In [10]:
# Vector learned for 'computer' by the corpus-file model.
model3.wv['computer']

array([-4.582946  ,  0.75927514,  0.455737  ,  2.340686  ], dtype=float32)

In [11]:
# 'computation' is not a key of model3's vocabulary.
'computation' in model3.wv.vocab

False

In [12]:
# Indexing wv still returns a vector for the out-of-vocabulary word 'computation'
# (per the gensim FastText docs, such vectors are composed from character n-grams).
model3.wv['computation']

array([-4.622708 ,  0.9891189,  0.6320012,  2.1230586], dtype=float32)

In [13]:
# A second out-of-vocabulary example: 'dlink' is not a vocabulary key either.
'dlink' in model3.wv.vocab

False

In [14]:
# ...and, as with 'computation', a vector is still returned for it.
model3.wv['dlink']

array([-1.2702152 ,  0.15909258,  0.15788607,  0.7609659 ], dtype=float32)

## Persist the model

In [None]:
# Ask gensim's get_tmpfile helper for a file path for the model (presumably inside a
# temporary directory — confirm against the gensim docs) and display it.
# NOTE(review): this cell has no execution count and `fname` is not referenced by the
# cells shown below (the model is saved to `fname_local` instead) — confirm whether
# this cell is still needed.
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("fasttext-lee_background.model")
fname

In [15]:
# Save the corpus-file model (model3) to a path relative to the current working directory.
fname_local = './fasttext-lee_background.model'
model3.save(fname_local)

## Load Model

In [16]:
# Reload the model that was just saved.
# NOTE: this rebinds `model`, which until now referred to the common_texts model;
# from here on `model` is the reloaded copy of model3.
model = FastText.load(fname_local)

In [17]:
# Round-trip check: the reloaded model must give the same 'computer' vector as the
# model it was saved from. (numpy is already imported as `np` in the top import cell,
# so the duplicate import that used to live here was dropped.)
np.allclose(model.wv['computer'], model3.wv['computer'])

True

## Add new words and continue training the loaded model

In [18]:
# In the reloaded model, 'computation' is not a vocabulary key.
'computation' in model.wv.vocab

False

In [19]:
# The reloaded model still returns a vector for the out-of-vocabulary word.
model.wv['computation']

array([-4.622708 ,  0.9891189,  0.6320012,  2.1230586], dtype=float32)

In [20]:
# Take a copy of the current vector so it can be compared with the vector obtained
# after the additional training below.
old_vector = np.copy(model.wv['computation'])  # Grab the existing vector

In [21]:
# Extra training sentences; each phrase is split on whitespace into a list of word tokens.
new_sentences = [
    phrase.split()
    for phrase in (
        'computer aided design',
        'computer science',
        'computational complexity',
        'military supercomputer',
        'central processing unit',
        'onboard car computer',
    )
]

In [22]:
# Continue training the reloaded model on new_sentences: first extend the existing
# vocabulary with their words (update=True), then train for the number of epochs
# the model is configured with (model.epochs).
model.build_vocab(new_sentences, update=True)  # Update the vocabulary
model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)

In [23]:
# Look up 'computation' again, now that the model has been trained further.
model.wv['computation']

array([-4.6225486 ,  0.98880535,  0.63142854,  2.1224759 ], dtype=float32)

In [24]:
# Keep the post-training vector for comparison with old_vector.
new_vector = model.wv['computation']

In [25]:
# False here means the two vectors differ by more than the tolerance (atol=1e-4),
# i.e. the additional training changed the vector for 'computation'.
np.allclose(old_vector, new_vector, atol=1e-4)

False

In [26]:
# new_sentences contains 'computational' but not 'computation', so the word is
# still not a vocabulary key after the update.
'computation' in model.wv.vocab

False

## Load Facebook's pre-trained fastText model

In [27]:
# Path to the pre-trained fastText binary that ships with gensim's test data.
# (`datapath` is already imported in the top import cell, so the duplicate import
# that used to live here was dropped.)
cap_path = datapath("crime-and-punishment.bin")
cap_path

'/home/ec2-user/.local/lib/python3.6/site-packages/gensim/test/test_data/crime-and-punishment.bin'

In [28]:
# Load the Facebook-format binary as a full model that can be trained further below.
# (`load_facebook_model` is already imported in the top import cell, so the duplicate
# import that used to live here was dropped.)
fb_model = load_facebook_model(cap_path)

In [29]:
# 'computer' is not a vocabulary key of the pre-trained model as loaded.
'computer' in fb_model.wv.vocab

False

In [30]:
# Copy the vector the pre-trained model currently returns for 'computer', then extend
# the model's vocabulary with new_sentences and train on them for 10 epochs.
old_computer = np.copy(fb_model.wv['computer'])
fb_model.build_vocab(new_sentences, update=True)
fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=10)

In [31]:
# 'computer' occurs in new_sentences, so after the vocabulary update it is a key.
'computer' in fb_model.wv.vocab

True

In [32]:
# Compare the 'computer' vector from before and after the extra training;
# False means it changed by more than the tolerance (atol=1e-4).
new_computer = fb_model.wv['computer']
np.allclose(old_computer, new_computer, atol=1e-4)

False

In [33]:
# Vocabulary size of the pre-trained Facebook model (after the update with new_sentences).
len(fb_model.wv.vocab)

304

In [34]:
# Vocabulary size of model3, which was built from lee_background.cor.
len(model3.wv.vocab)

10781

In [35]:
# Vocabulary size of the reloaded model after it was retrained with the new sentences added.
len(model.wv.vocab)

10786

## Out-of-vocabulary (OOV) word vectors

In [36]:
# A word that is a key of the model's vocabulary.
existent_word = "computer"
existent_word in model.wv.vocab

True

In [37]:
# Vector for the in-vocabulary word.
computer_vec = model.wv[existent_word]

In [38]:
# A made-up token that is not a key of the model's vocabulary.
oov_word = "graph-out-of-vocab"
oov_word in model.wv.vocab

False

In [39]:
# Indexing wv with the out-of-vocabulary token still yields a vector.
oov_vec = model.wv[oov_word]  # numpy vector for OOV word

## Most similar words

In [40]:
# Similarity query in which 'computer' and 'human' contribute positively and
# 'interface' negatively; the result is a ranked list of (word, score) pairs.
# Keep the full list in `similarities` and display only the top-ranked pair.
similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
most_similar = similarities[0]
most_similar

('tissue.', 0.9999960660934448)

In [41]:
# Full ranked list of (word, score) pairs returned by the most_similar query.
similarities

[('tissue.', 0.9999960660934448),
 ('one,', 0.9999943971633911),
 ('unclear,', 0.9999920725822449),
 ('recklessly.', 0.9999920725822449),
 ('Presse', 0.9999920129776001),
 ('Rehman', 0.9999886751174927),
 ('power,', 0.9999876618385315),
 ('Policy', 0.9999870657920837),
 ('"Lunchtime', 0.9999870657920837),
 ('Police', 0.9999861717224121)]

In [42]:
# Same positive/negative word query, this time through most_similar_cosmul.
# NOTE: this overwrites `similarities` and `most_similar` from the previous query.
similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface'])
most_similar = similarities[0]
most_similar

('sailing', 1.2896205186843872)

In [43]:
# Full ranked list of (word, score) pairs returned by the most_similar_cosmul query.
similarities

[('sailing', 1.2896205186843872),
 ('awards.', 1.2896173000335693),
 ('Waugh', 1.289616346359253),
 ('Arthurs,', 1.289613962173462),
 ('offer.', 1.289613127708435),
 ('issue.', 1.2896121740341187),
 ('traffic', 1.289609670639038),
 ('true...oh', 1.2896082401275635),
 ('evening,"', 1.289608120918274),
 ('ambushes,', 1.2896075248718262)]

In [45]:
# Split the phrase into four words and ask the model which one does not match the others.
not_matching = model.wv.doesnt_match("human computer interface tree".split())
not_matching

'interface'

In [46]:
# Pairwise similarity score between the vectors of 'computer' and 'human'.
sim_score = model.wv.similarity('computer', 'human')
sim_score

0.99938756

In [49]:
# Evaluate the model against the wordsim353 word-pair file bundled with gensim.
# The result is a 3-tuple; per the gensim docs it holds the Pearson result, the
# Spearman result and the percentage of pairs containing out-of-vocabulary words.
# NOTE: this reuses the name `similarities`, replacing the list from the earlier queries.
similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
similarities

((-0.0896179647302275, 0.2706179449895375),
 SpearmanrResult(correlation=0.01189933689403796, pvalue=0.8839334003571628),
 56.657223796033996)

In [50]:
# Evaluate the model on the questions-words analogy file bundled with gensim.
# The result is a tuple: an overall score followed by a list of per-section dicts
# with 'section', 'correct' and 'incorrect' entries.
# NOTE(review): displaying the whole result prints every analogy tuple; consider
# showing only analogies_result[0] to keep the output short.
analogies_result = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
analogies_result

(0.0006565988181221273,
 [{'section': 'capital-common-countries',
   'correct': [],
   'incorrect': [('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'),
    ('BEIJING', 'CHINA', 'CANBERRA', 'AUSTRALIA'),
    ('BEIJING', 'CHINA', 'ISLAMABAD', 'PAKISTAN'),
    ('BEIJING', 'CHINA', 'KABUL', 'AFGHANISTAN'),
    ('BEIJING', 'CHINA', 'LONDON', 'ENGLAND'),
    ('BEIJING', 'CHINA', 'PARIS', 'FRANCE'),
    ('BERLIN', 'GERMANY', 'CANBERRA', 'AUSTRALIA'),
    ('BERLIN', 'GERMANY', 'ISLAMABAD', 'PAKISTAN'),
    ('BERLIN', 'GERMANY', 'KABUL', 'AFGHANISTAN'),
    ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'),
    ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'),
    ('BERLIN', 'GERMANY', 'BEIJING', 'CHINA'),
    ('CANBERRA', 'AUSTRALIA', 'ISLAMABAD', 'PAKISTAN'),
    ('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('CANBERRA', 'AUSTRALIA', 'LONDON', 'ENGLAND'),
    ('CANBERRA', 'AUSTRALIA', 'PARIS', 'FRANCE'),
    ('CANBERRA', 'AUSTRALIA', 'BEIJING', 'CHINA'),
    ('CANBERRA', 'AUSTRALIA', 'BERLIN', 'GERMANY'),
