## Venturing into Doc2Vec

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

In [None]:
model = Doc2Vec(documents, vector_size=5, min_count=1, workers=4, epochs = 40)
model.train(documents, total_examples=model.corpus_count,
epochs=model.epochs)



In [None]:
model.vector_size

5

In [None]:
len(model.dv)

9

In [None]:
len(model.wv.key_to_index)

12

In [None]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [None]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.0800762   0.07085846 -0.09053659 -0.00673051 -0.05693997]


## Changing vector size and min_count


In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=3, epochs=40)
model.train(documents, total_examples=model.corpus_count,
epochs=model.epochs)



In [None]:
len(model.wv.key_to_index)

4

In [None]:
model.wv.key_to_index

{'system': 0, 'graph': 1, 'trees': 2, 'user': 3}

In [None]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.00777615  0.00735332 -0.00844175 -0.00016482 -0.00579738  0.00063699
 -0.00990451 -0.00872912  0.00460942  0.00236628  0.00145006 -0.00761346
 -0.00413414 -0.00469716 -0.00615075 -0.00696181 -0.00203604  0.0048501
  0.00550309 -0.00276752 -0.00643037  0.00471716 -0.00354644  0.00987764
  0.00881902 -0.00479796 -0.00381807 -0.00202981 -0.00374818 -0.00037171
  0.00795252  0.00617521 -0.00598866 -0.00631139 -0.00640636  0.00809166
  0.00255355 -0.0072995  -0.00345151  0.00498347  0.00860604  0.00216268
  0.00207193  0.00902919  0.00072415 -0.00626899  0.00144841  0.00293647
 -0.00972053  0.00955734]


## The dm parameter for switching between modeling approaches

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



## The dm_concat parameter

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05, dm_concat=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



## The dm_mean parameter

In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=0, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)




## Exploring fastText

## Building a fastText model

In [None]:
from gensim.models import FastText
from gensim.test.utils import common_texts

In [None]:
model = FastText(vector_size=5, window=3, min_count=1)
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts), epochs=10)

(36, 290)

In [None]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [None]:
model.wv['human']

array([-0.03166137,  0.02326731,  0.01241683,  0.00036033,  0.02841445],
      dtype=float32)

In [None]:
model.wv.most_similar(positive=['computer', 'interface'], negative=['human'])

[('user', 0.7968785762786865),
 ('system', 0.17462188005447388),
 ('response', 0.104334257543087),
 ('survey', 0.009604760445654392),
 ('trees', -0.07640466839075089),
 ('time', -0.1330047994852066),
 ('minors', -0.13927175104618073),
 ('eps', -0.24093686044216156),
 ('graph', -0.291752427816391)]

In [None]:
model = FastText(vector_size=5, window=3, min_count=1, min_n=1, max_n=5)
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts),epochs=10)

(36, 290)

In [None]:
model.wv['rubber']

array([ 0.01833104, -0.02146881,  0.00600105, -0.03445042, -0.0165866 ],
      dtype=float32)

In [None]:
model.wv.most_similar(positive=['computer', 'human'], negative=['rubber'])

[('trees', 0.795038104057312),
 ('eps', 0.7793108820915222),
 ('minors', 0.2440604716539383),
 ('time', 0.1623203009366989),
 ('user', -0.04820726439356804),
 ('graph', -0.15672056376934052),
 ('survey', -0.20417772233486176),
 ('interface', -0.3921482563018799),
 ('response', -0.6897355914115906),
 ('system', -0.8435077667236328)]

In [None]:
sentences_to_be_added = [["I", "am", "learning", "Natural", "Language","Processing"],["Natural", "Language", "Processing", "is", "cool"]]
model.build_vocab(sentences_to_be_added, update=True)
model.train(common_texts,
total_examples=len(sentences_to_be_added), epochs=10)



(43, 290)

In [None]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11,
 'I': 12,
 'am': 13,
 'learning': 14,
 'Natural': 15,
 'Language': 16,
 'Processing': 17,
 'is': 18,
 'cool': 19}

## Building a spelling corrector/word suggestion module using fastText

In [None]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
words = []
data = []
with io.open('comments.txt', 'r', encoding='utf8') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())

In [None]:
unique_words = []
unique_words = collections.Counter(words)
unique_words.most_common(10)

[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

In [None]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)

    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process

    Output : Returns the cleaned text corpus

    '''
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus.append(' '.join(qs))
    return cleaned_corpus

In [None]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [None]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [None]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [None]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus

    '''

    if cleaning == True:
        corpus = text_clean(corpus)

    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        corpus = lemmatize(corpus)


    if stemming == True:
        corpus = stem(corpus, stem_type)

    corpus = [' '.join(x) for x in corpus]

    return corpus

In [None]:
data = preprocess(data)

preprocessed_data = []
for line in data:
    if line != "":
        preprocessed_data.append(line.split())

In [None]:
model = FastText(vector_size=50, window=3, min_count=1, min_n=1, max_n=5)

In [None]:
model.build_vocab(preprocessed_data)

In [None]:
len(model.wv.key_to_index)

182228

In [None]:
model.train(preprocessed_data, total_examples=len(preprocessed_data), epochs=10)

(56734609, 58682700)

In [None]:
model.wv.most_similar('eplain', topn=5)

[('reexplain', 0.945229709148407),
 ('eexplain', 0.9441782832145691),
 ('chaplain', 0.9346960783004761),
 ('xplain', 0.9302868247032166),
 ('exlain', 0.9288626909255981)]

In [None]:
model.wv.most_similar('reminder', topn=5)

[('rejoinder', 0.9489542841911316),
 ('remainder', 0.9388765692710876),
 ('reminde', 0.9272560477256775),
 ('bitdefender', 0.9260987043380737),
 ('remaininder', 0.9241107702255249)]

In [None]:
model.wv.most_similar('relevnt', topn=5)

[('relent', 0.9666609168052673),
 ('releveant', 0.9616137742996216),
 ('relevanmt', 0.9603961110115051),
 ('relevent', 0.9521021246910095),
 ('releve', 0.9520982503890991)]

In [None]:
model.wv.most_similar('purse', topn=5)

[('purpse', 0.9334849119186401),
 ('cpurse', 0.9329171180725098),
 ('pure', 0.9308406114578247),
 ('pursuit', 0.9299253821372986),
 ('pulse', 0.9291112422943115)]

## fastText and document distances

In [None]:
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

In [1]:
!pip install POT


Collecting POT
  Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: POT
Successfully installed POT-0.9.3


In [None]:
word_mover_distance = model.wv.wmdistance(sentence_1, sentence_2)
word_mover_distance

0.4417727772904298

In [None]:
word_mover_distance = model.wv.wmdistance(sentence_2, sentence_3)
word_mover_distance

0.5520941682408338