## Dictionary

In [None]:
import gensim
from gensim import corpora
from pprint import pprint

In [None]:
# Two toy corpora: `doc` seeds the dictionary, `doc2` is tokenised alongside it
# and added to the dictionary in a later cell.
doc = [
    'Forgiveness means letting go of the hope for a better past.',
    'Good things come to those who wait; better things come to those who work for it.',
    "At the end of the day, it's all about the person you want to own a dog with.",
    'If you are irritated by every rub, how will your mirror be polished?',
]

doc2 = [
    'Modern science explicitly and emphatically rejects teleology.',
    'Too bad all the people who know how to run the country are busy driving taxi cabs and cutting hair.',
    'Language is a system of conventional signs that can be voluntarily produced at any time.',
    'History is nothing whatever but a record of what living persons have done in the past.',
]

In [None]:
# Whitespace-tokenise every sentence; str.split() already returns a list of tokens.
texts = [sentence.split() for sentence in doc]
texts2 = [sentence.split() for sentence in doc2]

# Build the dictionary from the first corpus only.
dictionary = corpora.Dictionary(texts)

In [None]:
# Show the printable summary of the dictionary built from `texts`.
print(dictionary)

In [None]:
# Show the token -> id mapping held by the dictionary.
print(dictionary.token2id)

In [None]:
# Extend the existing dictionary in place with the tokens of the second corpus.
dictionary.add_documents(texts2)

In [None]:
# Summary again, now that `texts2` has been added.
print(dictionary)

In [None]:
# Token -> id mapping again, now that `texts2` has been added.
print(dictionary.token2id)

## Bag of Words

In [None]:
from gensim.utils import simple_preprocess

doc = [
    'Rose is a rose is a rose',
    'How much wood would a woodchuck chuck if a woodchuck could chuck wood?',
    'She sells seashells by the seashore',
    'You know New York, you need New York, you know you need unique New York',
]

# Tokenise each sentence with gensim's simple_preprocess.
tokens = [simple_preprocess(sentence) for sentence in doc]

# Start from an empty dictionary; allow_update=True lets doc2bow add unseen
# tokens to it while each token list is converted to its bag-of-words form.
mydict = corpora.Dictionary()
mycorpus = [
    mydict.doc2bow(sentence_tokens, allow_update=True)
    for sentence_tokens in tokens
]

In [None]:
# Show the bag-of-words corpus produced by doc2bow (one entry per sentence).
print(mycorpus)

In [None]:
# Replace every token id with its token so the counts are human-readable.
word_counts = [
    [(mydict[token_id], count) for token_id, count in bow]
    for bow in mycorpus
]

In [None]:
# Pretty-print the (token, count) pairs, one inner list per sentence.
pprint(word_counts)

In [None]:
# Persist the dictionary to 'mydict.dict' in the current working directory.
mydict.save('mydict.dict')

In [None]:
# Reload the dictionary from the file written by the save cell.
mydict = corpora.Dictionary.load('mydict.dict')

## TF-IDF

In [None]:
from gensim import models
import numpy as np
doc = [
    'Forgiveness means letting go of the hope for a better past.',
    'Good things come to those who wait; better things come to those who work for it.',
    "At the end of the day, it's all about the person you want to own a dog with.",
    'If you are irritated by every rub, how will your mirror be polished?',
]

doc2 = [
    'Modern science explicitly and emphatically rejects teleology.',
    'Too bad all the people who know how to run the country are busy driving taxi cabs and cutting hair.',
    'Language is a system of conventional signs that can be voluntarily produced at any time.',
    'History is nothing whatever but a record of what living persons have done in the past.',
]

# Join each list of sentences into one string, giving a two-document corpus.
document = [' '.join(doc), ' '.join(doc2)]

# Tokenise once and reuse the tokens for both the dictionary and the BoW corpus.
document_tokens = [simple_preprocess(line) for line in document]
mydict = corpora.Dictionary(document_tokens)
corpus = [mydict.doc2bow(line_tokens) for line_tokens in document_tokens]

In [None]:
# Print every document as [token, raw count] pairs.
# The loop variable is `bow` rather than `doc` so the sentence list `doc`
# defined in the previous cell is not overwritten, and `token_id` avoids
# shadowing the builtin `id`.
for bow in corpus:
    print([[mydict[token_id], freq] for token_id, freq in bow])

In [None]:
# Fit a TF-IDF model on the BoW corpus.
# smartirs='ntc' is SMART notation: n = raw term frequency, t = idf document
# weighting, c = cosine normalisation of each document vector.
tfidf = models.TfidfModel(corpus, smartirs = 'ntc')

In [None]:
# Print every document as [token, tf-idf weight] pairs, rounded to 2 decimals.
# `weighted_doc` is used instead of `doc` so the sentence list `doc` is not
# overwritten, and `token_id` avoids shadowing the builtin `id`.
for weighted_doc in tfidf[corpus]:
    print([[mydict[token_id], np.around(weight, decimals=2)]
           for token_id, weight in weighted_doc])

## Topic Models

In [None]:
import gensim.downloader as api
from gensim.models import LdaModel, LdaMulticore
from gensim.utils import simple_preprocess, lemmatize
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import logging
# Route log records through the root logger at INFO level with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(logging.INFO)

# NLTK's English stop words plus a few extra tokens to drop.
stop_words = stopwords.words('english') + [
    'com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could',
]

# Materialise the text8 dataset so it can be sliced and iterated repeatedly.
dataset = api.load("text8")
data = list(dataset)

In [None]:
# Keep only lemmas tagged NN / JJ / RB from the first 100 documents, skipping stop words.
# The regex is compiled once and stop words are held in a set, so neither cost is
# paid again for every word.
allowed_tags = re.compile('(NN|JJ|RB)')
stop_word_set = set(stop_words)

data_processed = []
for raw_doc in data[:100]:
    doc_out = []
    for word in raw_doc:
        if word in stop_word_set:
            continue
        lemma = lemmatize(word, allowed_tags=allowed_tags)
        if lemma:
            # Each lemma is a bytes value; keep the part before the first b'/'
            # (presumably the word itself, followed by its tag — confirm against gensim docs).
            doc_out.append(lemma[0].split(b'/')[0].decode('utf-8'))
    data_processed.append(doc_out)

In [None]:
# Peek at the first ten processed tokens of the first document.
data_processed[0][:10]

In [None]:
# Map the processed tokens to ids, then convert every document to bag-of-words.
dictionary = corpora.Dictionary(data_processed)
corpus = [
    dictionary.doc2bow(doc_tokens)
    for doc_tokens in data_processed
]

In [None]:
#Latent Dirichlet Allocation
# All training settings are collected in one place and passed to the model in a single call.
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    random_state=100,       # fixed seed
    num_topics=7,
    passes=10,
    chunksize=100,
    batch=False,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True,   # the per-document inspection cell unpacks the per-word output
)
lda_model = LdaMulticore(**lda_params)

#lda_model.save('lda_model.model')

In [None]:
# Show the top words of every topic (-1 requests all topics).
lda_model.print_topics(-1)

In [None]:
# The model was built with per_word_topics=True, so each transformed document
# is a triple: document topics, per-word topics, per-word phi values.
for doc_topics, word_topics, phi_values in lda_model[corpus[10:13]]:
    print("Document Topics      : ", doc_topics)      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", word_topics[:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", phi_values[:2])  # [(Word id, [(Topic, Phi Value)])]

    # Same information with word ids translated back to words.
    words_with_topics = [(dictionary[word_id], topics) for word_id, topics in word_topics[:2]]
    words_with_phis = [(dictionary[word_id], phis) for word_id, phis in phi_values[:2]]
    print("Word, Topics         : ", words_with_topics)   # [(Word, [Topics])]
    print("Phi Values (word)    : ", words_with_phis)  # [(Word, [(Topic, Phi Value)])]
    print()

In [None]:
#Latent Semantic Analysis
from gensim.models import LsiModel

# Same corpus, dictionary and topic count as the LDA model.
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=7,
    decay=0.5,
)

In [None]:
# Print every LSI topic (-1 requests all topics).
print(lsi_model.print_topics(-1))

In [None]:
# Project document 10 of the BoW corpus into the LSI topic space.
lsi_model[corpus[10]]

## Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Load text8 and split it: the first 1000 documents train the initial model,
# the rest is kept for the incremental update in a later cell.
dataset = api.load('text8')
data = list(dataset)
data_pt1, data_pt2 = data[:1000], data[1000:]

# min_count=0 keeps every word; one worker thread per CPU core.
model_word2vec = Word2Vec(data_pt1, min_count=0, workers=cpu_count())

#model_word2vec.save('model_word2vec')
#model_word2vec = Word2Vec.load('model_word2vec')

In [None]:
# Vector for 'science' (Word2Vec's default vector size is 100).
# Accessed through `.wv`: indexing the model object directly is deprecated in gensim 3.x.
model_word2vec.wv['science']

In [None]:
# Nearest neighbours of 'science'.
# Called on `.wv`: `most_similar` on the model object itself is deprecated in gensim 3.x.
model_word2vec.wv.most_similar('science')

In [None]:
# Update an existing Word2Vec model: add the new documents' words to the
# vocabulary, then continue training on those documents.
model_word2vec.build_vocab(data_pt2, update=True)
# The example count and epoch count are read from `model_word2vec` itself;
# the name `model` is not defined until the Doc2Vec section.
model_word2vec.train(
    data_pt2,
    total_examples=model_word2vec.corpus_count,
    epochs=model_word2vec.epochs,
)

In [None]:
# Vector for 'science' after the incremental training step.
# Accessed through `.wv`: indexing the model object directly is deprecated in gensim 3.x.
model_word2vec.wv['science']

In [None]:
# Nearest neighbours of 'science' after the incremental training step.
# Called on `.wv`: `most_similar` on the model object itself is deprecated in gensim 3.x.
model_word2vec.wv.most_similar('science')

In [None]:
##Pre-trained models
# The following cells query `fasttext`, so it has to be loaded here for the
# notebook to run from a fresh kernel.
# NOTE: the first call downloads the vectors through gensim-data and may take a while.
fasttext = api.load('fasttext-wiki-news-subwords-300')
# Optional alternatives, not used below:
#word2vec = api.load('word2vec-google-news-300')
#glove = api.load('glove-wiki-gigaword-300')

In [None]:
# Word that fits least with the others in the list.
print(fasttext.doesnt_match(['india', 'germany', 'china', 'beer']))
# Cosine distance between two words.
print(fasttext.distance('king','queen'))
# Cosine distances from 'king' to each of the listed words.
print(fasttext.distances('king', ['queen', 'man', 'woman']))

In [None]:
# Cosine similarity of the 'king' vector against several vectors; the last
# candidate is the sum of the 'queen' and 'man' vectors.
# The vectors are looked up in `fasttext`: the name `model` is not defined
# until the Doc2Vec section.
print(fasttext.cosine_similarities(
    fasttext['king'],
    vectors_all=(fasttext['queen'], fasttext['man'], fasttext['woman'],
                 fasttext['queen'] + fasttext['man'])))

In [None]:
print(fasttext.words_closer_than('king', 'queen')) # all words closer to 'king' than 'queen' is

In [None]:
# Top-5 nearest neighbours of 'king'; the remaining arguments are passed as None.
# (The closing parenthesis of print() was missing, which made this cell a SyntaxError.)
print(fasttext.most_similar(positive='king', negative=None, topn=5,
                            restrict_vocab=None, indexer=None))

In [None]:
# Nearest neighbours ranked with the multiplicative combination objective
# instead of the additive one used by most_similar.
print(fasttext.most_similar_cosmul(positive = 'king', negative = None, topn = 5))

## Doc2Vec

In [None]:
import gensim
import gensim.downloader as api
# Load text8 again and materialise it as a list of documents.
dataset = api.load('text8')
data = list(dataset)

In [None]:
def tag_document(list_of_list_of_words):
    """Yield one ``TaggedDocument`` per token list, tagged with its position.

    Written as a generator so the tagged documents are produced one at a time
    while iterating, rather than being built up as a full list inside the function.

    Args:
        list_of_list_of_words: iterable of token lists, one per document.

    Yields:
        gensim.models.doc2vec.TaggedDocument built from the token list and
        ``[position]``, where position is the 0-based index in the input.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    for position, words in enumerate(list_of_list_of_words):
        yield make_tagged(words, [position])
# Materialise the generator into a list so the tagged documents can be indexed
# and iterated more than once (vocabulary building and training both read them).
train_data = list(tag_document(data)) 

In [None]:
# Inspect the last tagged document.
train_data[-1]

In [None]:
# Configure the model first, then build its vocabulary and train it on the
# tagged documents, reusing the counts stored on the model itself.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)
model.build_vocab(train_data)
model.train(
    train_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Pass a sentence as a list of words.
# NOTE(review): the sentence keeps its capitalisation; if the training corpus is
# lowercase-only, tokens such as 'German' or 'Summit' may not be in the
# vocabulary — confirm, and consider lower-casing before splitting.
model.infer_vector('German chancellor to appear in the G20 Summit'.split())

In [None]:
from gensim.matutils import softcossim
from gensim import corpora

sent1 = 'The chancellor of Germany is a chemist.'.split()
sent2 = 'Angela Merkel is a chemist.'.split()
sent3 = 'She is a chemist.'.split()
documents = [sent1, sent2, sent3]
dictionary = corpora.Dictionary(documents)

# Term-similarity matrix used by softcossim in the next cell. It is built from
# the pre-trained `fasttext` vectors (see the "Pre-trained models" cell, which
# must have loaded them) and has to come after `dictionary`, which it takes as
# its first argument.
similarity_matrix = fasttext.similarity_matrix(dictionary, tfidf=None, threshold=0.0,
                                               exponent=2.0, nonzero_limit=100)

# Re-bind each sentence to its bag-of-words form, which is what softcossim is given.
sent1 = dictionary.doc2bow(sent1)
sent2 = dictionary.doc2bow(sent2)
sent3 = dictionary.doc2bow(sent3)

In [None]:
# Soft cosine similarity for each pair of sentences: (1, 2), (1, 3), (2, 3).
sentence_pairs = ((sent1, sent2), (sent1, sent3), (sent2, sent3))
for left_bow, right_bow in sentence_pairs:
    print(softcossim(left_bow, right_bow, similarity_matrix))

## Summarization

In [None]:
from gensim.summarization import summarize, keywords

text = 'German is a West Germanic language that is mainly spoken in Central Europe. It is the most widely spoken and official or co-official language in Germany, Austria, Switzerland, South Tyrol in Italy, the German-speaking Community of Belgium and Liechtenstein. It is one of the three official languages of Luxembourg and a co-official language in the Opole Voivodeship in Poland. The languages that are most similar to German are the other members of the West Germanic language branch, including Afrikaans, Dutch, English, the Frisian languages, Low German/Low Saxon, Luxembourgish, and Yiddish. There are strong similarities in vocabulary with Danish, Norwegian and Swedish, although those belong to the North Germanic group. German is the second most widely spoken Germanic language, after English. One of the major languages of the world, German is a native language to almost 100 million people worldwide and the most widely spoken native language in the European Union. German is the third most commonly spoken foreign language in the EU after English and French, making it the second biggest language in the EU in terms of overall speakers. German is also the second most widely taught foreign language in the EU after English at primary school level (but third after English and French at lower secondary level), the fourth most widely taught non-English language in the US (after Spanish, French and American Sign Language), and the second most commonly used scientific language as well as the third most widely used language on websites after English and Russian. The German-speaking countries are ranked fifth in terms of annual publication of new books, with one tenth of all books (including e-books) in the world being published in the German language. In the United Kingdom, German and French are the most sought-after foreign languages for businesses (with 49% and 50% of businesses identifying these two languages as the most useful, respectively).'

print(text + '\n')

# split=True returns the summary as a list of sentences instead of one string.
# word_count caps the length of the summary; ratio defaults to 20% and is
# ignored by gensim when word_count is also given.
summary_sentences = summarize(text, split=True, word_count=50, ratio=0.4)
print(summary_sentences)

print(keywords(text))