In [1]:
import gensim
from gensim import corpora
from pprint import pprint

In [2]:
# Build a gensim Dictionary (token -> integer id) from a list of sentences.
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

# Second batch of sentences; it is not used in this cell.
documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

# Plain whitespace tokenization: case and punctuation are kept, so tokens such
# as "The" and "wrong," end up in the vocabulary exactly as written.
texts = [sentence.split() for sentence in documents]

# Assign an integer id to every distinct token.
dictionary = corpora.Dictionary(texts)

# Inspect the token -> id mapping.
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [3]:
# Tokenize the second batch by whitespace and extend the existing dictionary
# in place with its tokens.
texts_2 = [sentence.split() for sentence in documents_2]
dictionary.add_documents(texts_2)

# Convert a new phrase to bag-of-words: a list of (token_id, count) pairs.
# Lookup is case-sensitive and tokens missing from the dictionary are skipped,
# so "Preparing" (capital P) does not match the stored "preparing" and only
# "report" shows up in the result.
query_tokens = 'Preparing report'.split()
dictionary.doc2bow(query_tokens)

[(6, 1)]

In [4]:
from gensim.utils import simple_preprocess

In [5]:
my_docs = ["Who let  dogs out?",
           "Who? Who? Who? Who?"]

# Tokenize each raw document with gensim's simple_preprocess.
tokenized_list = list(map(simple_preprocess, my_docs))

# Turn every token list into a bag-of-words vector. allow_update=True makes
# doc2bow add tokens it has not seen before to `dictionary` as a side effect,
# so the shared dictionary grows when this cell runs.
mycorpus = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in tokenized_list
]
pprint(mycorpus)

[[(42, 1), (60, 1), (61, 1), (62, 1)], [(62, 4)]]


In [6]:
from gensim import models

In [7]:
# Fit a TF-IDF model on the bag-of-words corpus built above. `smartirs` picks
# the SMART weighting scheme; see the gensim TfidfModel docs for the meaning
# of each letter in 'ntc'.
tfidf = models.TfidfModel(corpus=mycorpus, smartirs='ntc')

In [8]:
import gensim.downloader as api

In [None]:
# Fetch the text8 corpus through gensim's downloader and materialize it as a
# list so it can be indexed and iterated more than once.
data = api.load("text8")
dataset = list(data)

# Vocabulary and bag-of-words corpus for the full dataset.
dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(tokens) for tokens in dataset]

# Build the bigram model. min_count and threshold control which word pairs
# are merged into a single phrase token (see the gensim Phrases docs).
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Apply the bigram model to the first document and show the result.
first_doc_bigrams = bigram[dataset[0]]
print(first_doc_bigrams)

In [None]:
# Build the trigram model by running phrase detection a second time, now on
# the bigram-transformed corpus.
bigram_corpus = bigram[dataset]
trigram = gensim.models.phrases.Phrases(bigram_corpus, threshold=10)

# Construct trigrams for the first document: apply the bigram model first,
# then the trigram model on top of its output.
first_doc_trigrams = trigram[bigram[dataset[0]]]
print(first_doc_trigrams)

In [None]:
!pip install git+git://github.com/pattern3/pattern.git

In [20]:
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
import re
import logging
# Log format: timestamp, level, message. The root logger is set to INFO, which
# is why the training cells below emit progress lines.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.getLogger().setLevel(logging.INFO)

# NLTK's English stopword list, extended with a few extra words to filter out.
extra_stop_words = ['com', 'also','edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']
stop_words = stopwords.words('english') + extra_stop_words

In [21]:
# Load text8 and build a stopword-free, lemmatized token list for the first
# 50 documents.
data = api.load("text8")
dataset = list(data)

# Loop-invariant helpers, built once instead of once per word:
# a set gives constant-time stopword membership tests, and the POS-tag filter
# regex does not need to be rebuilt on every iteration.
stop_word_set = set(stop_words)
allowed_pos_tags = re.compile('(NN|JJ|RB)')

data_processed = []
for doc in dataset[:50]:
    doc_out = []
    for wd in doc:
        if wd in stop_word_set:  # remove stopwords
            continue
        lemmatized_word = lemmatize(wd, allowed_tags=allowed_pos_tags)  # lemmatize
        if lemmatized_word:
            # The first entry is a bytes value whose part before b'/' is kept
            # as the lemma (the remainder is presumably the POS tag); decode it
            # to str. append() avoids re-copying the list for every word.
            doc_out.append(lemmatized_word[0].split(b'/')[0].decode('utf-8'))
    data_processed.append(doc_out)

# Print a small sample
print(data_processed[0][:5])

['anarchism', 'originated', 'term', 'abuse', 'first']


In [22]:
# Vocabulary over the lemmatized documents, then one bag-of-words vector
# (list of (token_id, count) pairs) per document.
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(tokens) for tokens in data_processed]

2021-02-10 12:33:49,437 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-10 12:33:49,682 : INFO : built Dictionary(24520 unique tokens: ['ability', 'able', 'abnormal', 'abolition', 'absence']...) from 50 documents (total 215171 corpus positions)


In [23]:
# Training configuration for the LDA topic model, gathered in one place so the
# settings can be read and tweaked without touching the constructor call.
lda_params = dict(
    corpus=corpus,
    id2word=dct,
    num_topics=7,
    random_state=100,      # fixed seed
    passes=10,
    iterations=100,
    chunksize=1000,
    batch=False,
    alpha='asymmetric',
    eta=None,              # the training log reports a symmetric eta for None
    decay=0.5,
    offset=64,
    eval_every=0,
    gamma_threshold=0.001,
    per_word_topics=True,  # needed by the per-word topic inspection further down
)
lda_model = LdaMulticore(**lda_params)

# save the model
lda_model.save('lda_model.model')

# See the topics (-1 requests all of them)
lda_model.print_topics(-1)

2021-02-10 12:33:52,427 : INFO : using asymmetric alpha [0.26219156, 0.19027454, 0.14931786, 0.12287004, 0.104381524, 0.090729296, 0.080235206]
2021-02-10 12:33:52,428 : INFO : using symmetric eta at 0.14285714285714285
2021-02-10 12:33:52,434 : INFO : using serial LDA version on this node
2021-02-10 12:33:52,454 : INFO : running online LDA training, 7 topics, 10 passes over the supplied corpus of 50 documents, updating every 11000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2021-02-10 12:33:52,455 : INFO : training LDA model using 11 processes
2021-02-10 12:33:52,675 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #50/50, outstanding queue size 1
2021-02-10 12:33:54,560 : INFO : topic #6 (0.080): 0.001*"first" + 0.001*"many" + 0.001*"god" + 0.001*"state" + 0.000*"world" + 0.000*"apollo" + 0.000*"person" + 0.000*"time" + 0.000*"year" + 0.000*"new"
2021-02-10 12:33:54,562 : INFO : topic #5 (0.091): 0.001*"many" + 0.0

2021-02-10 12:33:55,989 : INFO : topic #6 (0.080): 0.006*"atheism" + 0.005*"apollo" + 0.004*"god" + 0.004*"atheist" + 0.003*"agassi" + 0.003*"open" + 0.002*"moon" + 0.002*"crew" + 0.002*"final" + 0.002*"lunar"
2021-02-10 12:33:55,990 : INFO : topic #5 (0.091): 0.001*"many" + 0.001*"state" + 0.001*"first" + 0.001*"acid" + 0.000*"time" + 0.000*"person" + 0.000*"year" + 0.000*"anchorage" + 0.000*"american" + 0.000*"alaska"
2021-02-10 12:33:55,991 : INFO : topic #2 (0.149): 0.003*"atom" + 0.002*"electron" + 0.002*"arsenic" + 0.002*"element" + 0.002*"antimony" + 0.001*"atomic" + 0.001*"compound" + 0.001*"isotope" + 0.001*"neutron" + 0.001*"argon"
2021-02-10 12:33:55,993 : INFO : topic #1 (0.190): 0.005*"football" + 0.005*"player" + 0.004*"ball" + 0.004*"american" + 0.004*"play" + 0.003*"line" + 0.003*"team" + 0.003*"british" + 0.002*"war" + 0.002*"offensive"
2021-02-10 12:33:55,994 : INFO : topic #0 (0.262): 0.000*"state" + 0.000*"world" + 0.000*"first" + 0.000*"many" + 0.000*"person" + 0.0

[(0,
  '0.000*"state" + 0.000*"world" + 0.000*"first" + 0.000*"many" + 0.000*"person" + 0.000*"war" + 0.000*"year" + 0.000*"atheism" + 0.000*"american" + 0.000*"time"'),
 (1,
  '0.007*"football" + 0.006*"player" + 0.006*"ball" + 0.006*"play" + 0.006*"american" + 0.005*"line" + 0.004*"team" + 0.004*"british" + 0.003*"war" + 0.003*"back"'),
 (2,
  '0.006*"atom" + 0.003*"electron" + 0.003*"arsenic" + 0.003*"element" + 0.002*"antimony" + 0.002*"atomic" + 0.002*"compound" + 0.001*"isotope" + 0.001*"neutron" + 0.001*"nucleus"'),
 (3,
  '0.005*"state" + 0.004*"many" + 0.004*"first" + 0.003*"person" + 0.003*"time" + 0.003*"world" + 0.003*"year" + 0.003*"new" + 0.003*"war" + 0.003*"agave"'),
 (4,
  '0.008*"audi" + 0.005*"car" + 0.005*"engine" + 0.003*"vehicle" + 0.002*"automobile" + 0.002*"aircraft" + 0.002*"wheel" + 0.001*"first" + 0.001*"drive" + 0.001*"quattro"'),
 (5,
  '0.001*"many" + 0.001*"state" + 0.000*"first" + 0.000*"acid" + 0.000*"time" + 0.000*"person" + 0.000*"year" + 0.000*"ancho

In [24]:
# Inspect documents 5..7. Each result has three parts, indexed 0/1/2 in the
# original and unpacked by name here:
#   doc_topics  - [(topic, contribution)]
#   word_topics - [(word id, [topics])]
#   phi_values  - [(word id, [(topic, phi value)])]
for doc_topics, word_topics, phi_values in lda_model[corpus[5:8]]:
    # Same data as word_topics / phi_values, with ids mapped back to words via `dct`.
    named_word_topics = [(dct[word_id], topics) for word_id, topics in word_topics[:2]]
    named_phi_values = [(dct[word_id], phis) for word_id, phis in phi_values[:2]]

    print("Document Topics      : ", doc_topics)
    print("Word id, Topics      : ", word_topics[:3])
    print("Phi Values (word id) : ", phi_values[:2])
    print("Word, Topics         : ", named_word_topics)
    print("Phi Values (word)    : ", named_phi_values)
    print("------------------------------------------------------\n")

Document Topics      :  [(3, 0.9997827)]
Word id, Topics      :  [(0, [3]), (7, [3]), (10, [3])]
Phi Values (word id) :  [(0, [(3, 2.9987354)]), (7, [(3, 0.9869535)])]
Word, Topics         :  [('ability', [3]), ('absurdity', [3])]
Phi Values (word)    :  [('ability', [(3, 2.9987354)]), ('absurdity', [(3, 0.9869535)])]
------------------------------------------------------

Document Topics      :  [(3, 0.99978393)]
Word id, Topics      :  [(0, [3]), (10, [3]), (16, [3])]
Phi Values (word id) :  [(0, [(3, 5.997471)]), (10, [(3, 2.9968653)])]
Word, Topics         :  [('ability', [3]), ('academic', [3])]
Phi Values (word)    :  [('ability', [(3, 5.997471)]), ('academic', [(3, 2.9968653)])]
------------------------------------------------------

Document Topics      :  [(3, 0.9998095)]
Word id, Topics      :  [(1, [3]), (10, [3]), (15, [3])]
Phi Values (word id) :  [(1, [(3, 0.9996602)]), (10, [(3, 5.9937315)])]
Word, Topics         :  [('able', [3]), ('academic', [3])]
Phi Values (word)   

In [25]:
from gensim.models import LsiModel

# Fit an LSI model on the same corpus and vocabulary used for LDA, with the
# same number of topics (7).
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dct,
    num_topics=7,
    decay=0.5,
)

# Show all topics (-1 requests every topic).
lsi_topics = lsi_model.print_topics(-1)
pprint(lsi_topics)

2021-02-10 12:35:53,581 : INFO : using serial LSI version on this node
2021-02-10 12:35:53,582 : INFO : updating model with new documents
2021-02-10 12:35:53,583 : INFO : preparing a new chunk of documents
2021-02-10 12:35:53,603 : INFO : using 100 extra samples and 2 power iterations
2021-02-10 12:35:53,604 : INFO : 1st phase: constructing (24520, 107) action matrix
2021-02-10 12:35:53,628 : INFO : orthonormalizing (24520, 107) action matrix
2021-02-10 12:35:54,211 : INFO : 2nd phase: running dense svd on (107, 50) matrix
2021-02-10 12:35:54,228 : INFO : computing the final decomposition
2021-02-10 12:35:54,230 : INFO : keeping 7 factors (discarding 55.737% of energy spectrum)
2021-02-10 12:35:54,242 : INFO : processed documents up to #50
2021-02-10 12:35:54,245 : INFO : topic #0(713.655): 0.289*"agave" + 0.219*"state" + 0.159*"many" + 0.154*"first" + 0.136*"person" + 0.135*"time" + 0.132*"year" + 0.131*"war" + 0.128*"world" + 0.124*"american"
2021-02-10 12:35:54,247 : INFO : topic #1

[(0,
  '0.289*"agave" + 0.219*"state" + 0.159*"many" + 0.154*"first" + '
  '0.136*"person" + 0.135*"time" + 0.132*"year" + 0.131*"war" + 0.128*"world" '
  '+ 0.124*"american"'),
 (1,
  '-0.898*"agave" + -0.152*"asia" + -0.096*"aruba" + 0.069*"first" + '
  '0.062*"time" + 0.061*"many" + 0.060*"person" + -0.054*"plant" + '
  '-0.051*"var" + 0.050*"war"'),
 (2,
  '0.296*"football" + 0.252*"player" + 0.251*"american" + 0.228*"ball" + '
  '0.220*"war" + 0.209*"play" + 0.177*"line" + 0.172*"team" + 0.162*"lincoln" '
  '+ -0.160*"atheism"'),
 (3,
  '0.546*"lincoln" + 0.194*"state" + -0.187*"football" + 0.181*"aristotle" + '
  '-0.181*"player" + 0.177*"union" + -0.140*"ball" + 0.140*"war" + '
  '-0.119*"play" + 0.115*"achille"'),
 (4,
  '-0.534*"atheism" + -0.373*"god" + -0.316*"atheist" + -0.176*"lincoln" + '
  '-0.154*"belief" + 0.127*"africa" + -0.122*"existence" + -0.117*"religion" + '
  '-0.102*"deity" + 0.102*"hiv"'),
 (5,
  '0.286*"lincoln" + -0.222*"aluminium" + -0.206*"island" + 0.194

In [26]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Download dataset and materialize it as a list so it can be sliced.
dataset = api.load("text8")
data = list(dataset)

# Split the data into 2 parts. Part 1 (the first 1000 documents) trains the
# initial model; part 2 (the rest) is used later to update the model.
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. The default vector size is 100 (the training log
# reports "100 dimensions"). min_count=0 keeps every word, however rare.
model = Word2Vec(data_part1, min_count=0, workers=cpu_count())

# Word vectors live on the model's KeyedVectors (`model.wv`). Indexing or
# calling most_similar on the model object itself is a deprecated pass-through
# that later gensim releases removed, so go through `.wv` explicitly.
# The results are bound to names because a bare expression in the middle of a
# cell is evaluated and then silently thrown away.
topic_vector = model.wv['topic']
#> array([ 0.0512,  0.2555,  0.9393, ... ,-0.5669,  0.6737], dtype=float32)

topic_neighbours = model.wv.most_similar('topic')

# Save and Load Model
model.save('newmodel')
model = Word2Vec.load('newmodel')

2021-02-10 12:37:16,942 : INFO : collecting all words and their counts
2021-02-10 12:37:16,943 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-10 12:37:18,713 : INFO : collected 189074 word types from a corpus of 10000000 raw words and 1000 sentences
2021-02-10 12:37:18,714 : INFO : Loading a fresh vocabulary
2021-02-10 12:37:20,889 : INFO : effective_min_count=0 retains 189074 unique words (100% of original 189074, drops 0)
2021-02-10 12:37:20,890 : INFO : effective_min_count=0 leaves 10000000 word corpus (100% of original 10000000, drops 0)
2021-02-10 12:37:21,300 : INFO : deleting the raw counts dictionary of 189074 items
2021-02-10 12:37:21,305 : INFO : sample=0.001 downsamples 38 most-common words
2021-02-10 12:37:21,306 : INFO : downsampling leaves estimated 7563517 word corpus (75.6% of prior 10000000)
2021-02-10 12:37:21,743 : INFO : estimated required memory for 189074 words and 100 dimensions: 245796200 bytes
2021-02-10 12:37:21,743 : INFO :

2021-02-10 12:38:14,135 : INFO : EPOCH 5 - PROGRESS: at 87.60% examples, 1629222 words/s, in_qsize 18, out_qsize 5
2021-02-10 12:38:14,659 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-02-10 12:38:14,660 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-10 12:38:14,662 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-10 12:38:14,664 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-10 12:38:14,667 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-10 12:38:14,669 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-10 12:38:14,681 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-10 12:38:14,682 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-10 12:38:14,684 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-10 12:38:14,685 : INFO : worker thread finished; awaitin

In [27]:
# Update the model with new data.
# update=True extends the existing vocabulary with the words of data_part2
# instead of rebuilding it from scratch.
model.build_vocab(data_part2, update=True)

# Continue training on the new documents only.
# - total_examples: model.corpus_count is presumably refreshed by the
#   build_vocab call above to the size of data_part2 (the log reports 701
#   sentences) - TODO confirm against the gensim docs.
# - epochs: `model.epochs` replaces the deprecated `model.iter` alias.
model.train(data_part2, total_examples=model.corpus_count, epochs=model.epochs)

# Vector for 'topic' after the update, read through the KeyedVectors
# (`model[...]` on the model object is deprecated).
model.wv['topic']

2021-02-10 12:51:34,879 : INFO : collecting all words and their counts
2021-02-10 12:51:34,880 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-10 12:51:36,117 : INFO : collected 153347 word types from a corpus of 7005207 raw words and 701 sentences
2021-02-10 12:51:36,118 : INFO : Updating model with new vocabulary
2021-02-10 12:51:36,339 : INFO : New added 153347 unique words (50% of original 306694) and increased the count of 153347 pre-existing words (50% of original 306694)
2021-02-10 12:51:37,082 : INFO : deleting the raw counts dictionary of 153347 items
2021-02-10 12:51:37,085 : INFO : sample=0.001 downsamples 72 most-common words
2021-02-10 12:51:37,086 : INFO : downsampling leaves estimated 10509051 word corpus (150.0% of prior 7005207)
2021-02-10 12:51:37,446 : INFO : estimated required memory for 306694 words and 100 dimensions: 398702200 bytes
2021-02-10 12:51:37,447 : INFO : updating layer weights
  model.train(data_part2, total_examples=

2021-02-10 12:52:03,724 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-10 12:52:03,726 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-10 12:52:03,729 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-10 12:52:03,731 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-10 12:52:03,732 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-10 12:52:03,733 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-10 12:52:03,739 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-10 12:52:03,744 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-10 12:52:03,745 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-10 12:52:03,747 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-10 12:52:03,748 : INFO : EPOCH - 5 : training on 7005207 raw words (5253850 effect

array([ 0.10153081,  0.1926781 ,  0.32744864, -0.01678169, -0.24205594,
       -1.7215012 ,  1.5836352 , -0.13802871, -1.0151654 , -0.67926466,
        0.99595296,  0.1391699 ,  0.33435762,  3.0574293 , -0.1753422 ,
        1.6304696 ,  0.7603253 ,  0.3986856 , -1.5905765 ,  2.3999097 ,
       -0.26273668,  1.4957714 , -0.3528074 ,  1.1807108 ,  0.99626297,
       -1.2842723 , -1.7719195 , -0.43413588, -0.34358102, -1.4486936 ,
       -1.2305747 ,  0.42596608,  0.27028072,  1.1097169 , -0.6121068 ,
        0.2724385 ,  0.41296592,  0.74607074, -0.97312564, -1.4279096 ,
       -0.40815052,  0.28389895,  0.23084855,  1.4811028 , -0.08197882,
        1.3253828 ,  0.6860957 ,  0.5596175 ,  0.645516  , -0.04176502,
        0.0863863 , -1.2047492 , -0.37585747,  0.52356535, -0.13275096,
       -1.0186473 ,  0.03767383,  1.7454988 , -0.79560745,  0.21469817,
        0.8225354 , -0.46741393,  0.70313364,  0.42461437, -0.22032373,
        0.06962129, -0.33938822,  0.3546789 ,  0.16525197,  0.43

In [28]:
import gensim.downloader as api

# Download (on first use) and load three pre-trained embedding sets. They are
# loaded in the order listed; the recorded log shows each load taking on the
# order of a minute.
fasttext_model300, word2vec_model300, glove_model300 = (
    api.load(model_name)
    for model_name in (
        'fasttext-wiki-news-subwords-300',
        'word2vec-google-news-300',
        'glove-wiki-gigaword-300',
    )
)

# Nearest neighbours of "support" in the Google News word2vec space.
word2vec_model300.most_similar(positive='support')



2021-02-10 12:54:25,308 : INFO : fasttext-wiki-news-subwords-300 downloaded
2021-02-10 12:54:25,315 : INFO : loading projection weights from /Users/imayak/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz
2021-02-10 12:57:06,471 : INFO : loaded (999999, 300) matrix from /Users/imayak/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz
2021-02-10 12:57:06,536 : INFO : loading projection weights from /Users/imayak/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2021-02-10 12:57:55,235 : INFO : loaded (3000000, 300) matrix from /Users/imayak/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz




2021-02-10 12:58:49,839 : INFO : glove-wiki-gigaword-300 downloaded
2021-02-10 12:58:49,843 : INFO : loading projection weights from /Users/imayak/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2021-02-10 12:59:54,888 : INFO : loaded (400000, 300) matrix from /Users/imayak/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2021-02-10 12:59:54,893 : INFO : precomputing L2-norms of word weight vectors


[('supporting', 0.6251285076141357),
 ('suport', 0.6071150302886963),
 ('suppport', 0.6053199768066406),
 ('Support', 0.6044273376464844),
 ('supported', 0.6009396910667419),
 ('backing', 0.6007589101791382),
 ('supports', 0.5269277095794678),
 ('assistance', 0.5207138061523438),
 ('sup_port', 0.5192490220069885),
 ('supportive', 0.5110025405883789)]

In [32]:
# Words closest to the combination of "wireless" and "services"; both are
# passed as positive query terms.
word2vec_model300.most_similar(positive=['wireless', 'services'])

[('wireless_broadband', 0.6713308095932007),
 ('telecommunications', 0.6429787874221802),
 ('NetInformer_leading', 0.6375865936279297),
 ('GigaBeam_revolutionary', 0.6346628665924072),
 ('Airband_fixed', 0.6342990398406982),
 ('ISDN_ADSL', 0.6333105564117432),
 ('servicesand', 0.629921555519104),
 ('Apprion_delivers', 0.628883957862854),
 ('Aruba_wireless_LAN', 0.6278460025787354),
 ('IP_Roamer', 0.6219089031219482)]

In [35]:
from gensim.matutils import softcossim
from gensim import corpora

# Three short sentences to compare, tokenized by whitespace.
sent_1 = 'It always snows during winter and some are worse'.split()
sent_2 = 'When it snows during spring, it surprises many'.split()
sent_3 = 'During sports season, snow and bad weather does not bother '.split()

# Prepare a dictionary and a corpus.
documents = [sent_1, sent_2, sent_3]
dictionary = corpora.Dictionary(documents)

# Prepare the similarity matrix.
# This has to happen AFTER the dictionary above is built: the matrix is
# constructed from `dictionary`, so it must be the same dictionary that
# produces the bag-of-words vectors below. Building it first would pick up
# whatever `dictionary` an earlier cell left behind, and only appear to work
# when the cell had already been run once.
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

# Convert the sentences into bag-of-words vectors.
# (The sent_* names are rebound from token lists to bag-of-words vectors.)
sent_1 = dictionary.doc2bow(sent_1)
sent_2 = dictionary.doc2bow(sent_2)
sent_3 = dictionary.doc2bow(sent_3)

# Compute soft cosine similarity for each pair of sentences.
print(softcossim(sent_1, sent_2, similarity_matrix))

print(softcossim(sent_1, sent_3, similarity_matrix))

print(softcossim(sent_2, sent_3, similarity_matrix))


  similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)
2021-02-10 15:37:51,293 : INFO : constructing a sparse term similarity matrix using <gensim.models.keyedvectors.WordEmbeddingSimilarityIndex object at 0x7ff2e7c782e0>
2021-02-10 15:37:51,294 : INFO : iterating over columns in dictionary order
2021-02-10 15:37:51,300 : INFO : PROGRESS: at 4.35% columns (1 / 23, 4.347826% density, 4.347826% projected density)
2021-02-10 15:37:52,865 : INFO : constructed a sparse term similarity matrix with 9.640832% density
2021-02-10 15:37:52,867 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-10 15:37:52,867 : INFO : built Dictionary(23 unique tokens: ['It', 'always', 'and', 'are', 'during']...) from 3 documents (total 27 corpus positions)


0.3679225482343939
0.3408726649549052
0.12996676575362087


  print(softcossim(sent_1, sent_2, similarity_matrix))
  print(softcossim(sent_1, sent_3, similarity_matrix))
  print(softcossim(sent_2, sent_3, similarity_matrix))


In [36]:
# Which word from the given list doesn't go with the others?
candidate_words = ['india', 'australia', 'pakistan', 'china', 'beetroot']
print(fasttext_model300.doesnt_match(candidate_words))

# Cosine distance between two words.
print(fasttext_model300.distance('king', 'queen'))

# Cosine distances from one word to each word in a list.
print(fasttext_model300.distances('king', ['queen', 'man', 'woman']))

# Cosine similarities between the "king" vector and several other vectors;
# the last one is the element-wise sum of the "queen" and "man" vectors.
king_vec = fasttext_model300['king']
queen_vec = fasttext_model300['queen']
man_vec = fasttext_model300['man']
woman_vec = fasttext_model300['woman']
comparison_vectors = (queen_vec, man_vec, woman_vec, queen_vec + man_vec)
print(fasttext_model300.cosine_similarities(king_vec, vectors_all=comparison_vectors))

# Get the words closer to w1 than w2
print(glove_model300.words_closer_than(w1='king', w2='kingdom'))

# Find the top-N most similar words. Arguments the original passed explicitly
# as None are left at that same default.
print(fasttext_model300.most_similar(positive='king', topn=5))

# Find the top-N most similar words, using the multiplicative combination objective.
print(glove_model300.most_similar_cosmul(positive='king', topn=5))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


beetroot
0.22957539558410645
[0.22957546 0.465837   0.547001  ]
[0.77042454 0.534163   0.45299897 0.76572543]


2021-02-10 15:41:11,155 : INFO : precomputing L2-norms of word weight vectors


['prince', 'queen', 'monarch']
[('king-', 0.7838029265403748), ('boy-king', 0.7704817652702332), ('queen', 0.7704246044158936), ('prince', 0.7700966596603394), ('kings', 0.7668929696083069)]
[('queen', 0.8168227076530457), ('prince', 0.809830367565155), ('monarch', 0.7949802875518799), ('kingdom', 0.7895625829696655), ('throne', 0.7803236842155457)]
