# Doc2Vec

In [1]:
import gensim
import os
import collections
import smart_open
import random

In [2]:
# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'

In [3]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        print(f)
        for i, line in enumerate(f):
            print(gensim.utils.simple_preprocess(line))
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [4]:
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

<_io.TextIOWrapper name='/home/megan/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/lee_background.cor' mode='r' encoding='iso-8859-1'>
['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearb

['australia', 'has', 'escaped', 'with', 'draw', 'after', 'dramatic', 'final', 'day', 'of', 'the', 'third', 'test', 'against', 'new', 'zealand', 'in', 'perth', 'set', 'to', 'win', 'australia', 'finished', 'the', 'match', 'at', 'for', 'with', 'adam', 'gilchrist', 'not', 'out', 'australia', 'retains', 'the', 'trans', 'tasman', 'trophy', 'after', 'the', 'rain', 'affected', 'series', 'ended', 'during', 'the', 'final', 'day', 'zimbabwean', 'umpire', 'ian', 'robinson', 'made', 'two', 'controversial', 'decisions', 'in', 'favour', 'of', 'australia', 'captain', 'steve', 'waugh', 'and', 'jason', 'gillespie', 'both', 'were', 'given', 'not', 'out', 'despite', 'television', 'replays', 'showing', 'they', 'were', 'caught', 'behind', 'by', 'wicket', 'keeper', 'adam', 'parore']
['israeli', 'forces', 'have', 'launched', 'attacks', 'on', 'some', 'of', 'the', 'key', 'palestinian', 'symbols', 'of', 'autonomy', 'including', 'gaza', 'international', 'airport', 'the', 'strikes', 'come', 'as', 'israeli', 'autho

In [5]:
train_corpus

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

## Preprocesss

In [6]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

import numpy as np

In [7]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    text = re.sub('_',' ',text)
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [8]:
from sklearn.datasets import fetch_20newsgroups

In [9]:
corpus = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes'))

In [10]:
def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

In [11]:
def remove_emails(text):
    return re.sub('\S*@\S*\s?', '', text)

In [12]:
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

In [13]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

# def replace_numbers(words):
#     """Replace all interger occurrences in list of tokenized words with textual representation"""
#     p = inflect.engine()
#     new_words = []
#     for word in words:
#         if word.isdigit():
#             new_word = p.number_to_words(word)
#             new_words.append(new_word)
#         else:
#             new_words.append(word)
#     return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    #words = replace_numbers(words)
    words = remove_stopwords(words)
    words = [word for word in words if len(word)>2]
    return words

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/megan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def preprocessor(doc):
    return normalize(nltk.word_tokenize(remove_numbers(remove_emails(replace_contractions(denoise_text(doc))))))

In [16]:
corpus.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [19]:
tagged_documents = []
for i in range(len(corpus.data)):
    tagged_documents.append(gensim.models.doc2vec.TaggedDocument(preprocessor(corpus.data[i]), [i]))

  ' Beautiful Soup.' % markup)


In [18]:
tagged_documents

[TaggedDocument(words=['sure', 'bashers', 'pens', 'fans', 'pretty', 'confused', 'lack', 'kind', 'posts', 'recent', 'pens', 'massacre', 'devils', 'actually', 'bit', 'puzzled', 'bit', 'relieved', 'however', 'going', 'put', 'end', 'nonpittsburghers', 'relief', 'bit', 'praise', 'pens', 'man', 'killing', 'devils', 'worse', 'thought', 'jagr', 'showed', 'much', 'better', 'regular', 'season', 'stats', 'also', 'lot', 'fun', 'watch', 'playoffs', 'bowman', 'let', 'jagr', 'lot', 'fun', 'next', 'couple', 'games', 'since', 'pens', 'going', 'beat', 'pulp', 'jersey', 'anyway', 'disappointed', 'see', 'islanders', 'lose', 'final', 'regular', 'season', 'game', 'pens', 'rule'], tags=[0]),
 TaggedDocument(words=['brother', 'market', 'highperformance', 'video', 'card', 'supports', 'vesa', 'local', 'bus', 'ram', 'anyone', 'suggestionsideas', 'diamond', 'stealth', 'pro', 'local', 'bus', 'orchid', 'farenheit', 'ati', 'graphics', 'ultra', 'pro', 'highperformance', 'vlb', 'card', 'please', 'post', 'email', 'than

In [20]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, epochs=40)

In [21]:
model.build_vocab(tagged_documents)

In [24]:
model.corpus_count

18846

In [25]:
%time model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 3min 59s, sys: 4.36 s, total: 4min 4s
Wall time: 2min 4s


In [26]:
ranks = []
second_ranks = []
for doc_id in range(len(tagged_documents)):
    inferred_vector = model.infer_vector(tagged_documents[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])


In [27]:
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

Counter({0: 18067,
         2282: 1,
         17829: 1,
         18835: 1,
         16001: 1,
         7710: 1,
         1: 122,
         18536: 1,
         18102: 1,
         18202: 1,
         17338: 1,
         1903: 1,
         16464: 1,
         17287: 1,
         2738: 1,
         2: 20,
         14116: 1,
         16685: 1,
         18748: 1,
         10665: 1,
         3: 14,
         15737: 1,
         9185: 1,
         9900: 1,
         4375: 1,
         12411: 1,
         12173: 1,
         17091: 1,
         10633: 1,
         16774: 1,
         16723: 1,
         16059: 1,
         17613: 1,
         18445: 1,
         17904: 1,
         18749: 1,
         12174: 1,
         15520: 1,
         2860: 1,
         14621: 1,
         14093: 1,
         17513: 1,
         8489: 1,
         10932: 1,
         618: 1,
         1495: 1,
         5572: 1,
         16406: 1,
         4: 8,
         12844: 1,
         17122: 1,
         18771: 1,
         18480: 1,
         15482: 1,

# Visualizations

- t-SNE
- PCA 

