In [1]:
import pandas as pd
import numpy as np
import logging
import smart_open
import requests as req
import re
import os
import tarfile
import gensim

import gensim
import gensim.downloader as api
from gensim.utils import save_as_line_sentence
from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import Word2Vec

print(gensim.models.word2vec.CORPUSFILE_VERSION)  # must be >= 0, i.e. optimized compiled version

1


This tutorial will explore some of the basic aspects of the very popular doc2vec technique. The tutorial can be divided on the following sections:

1. Explanation of what doc2vec is
2. discussion of the different implementations of doc2vec in gensim
3. use of the imdb databsase
4. clusters different versions
5. addendum and exploration of other wordvectors.

In [2]:
# get current directory to save it to a string with info about our local directory
path=os.getcwd()
path

'/media/jlealtru/data_files/github/nlp_experiments'

In [3]:
os.path.isdir(path+'/aclImdb')

True

In [4]:
# We will check if the data already exist in our local enviroment. If not we will download it from the Standford
# webpage

if os.path.isdir(path+'/aclImdb'):
    print('Files available for work')
else:
    print("Downloading IMDB archive...")
    url = u'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    r = req.get(url)
    with smart_open('standford', 'wb') as f:
        f.write(r.content)
    print('file download complete')
    # if error here, try `tar xfz aclImdb_v1.tar.gz` outside notebook, then re-run this cell
    tar = tarfile.open('standford', mode='r')
    tar.extractall()
    tar.close()
    print('extraction complete, files available for work')


Files available for work


In [5]:
# We now need to parse the files we just downloaded into our machine. We define a function that
# looks into a folder and parses the contents into a list. We also need to define the sentiment for
# the classification.
def get_review_content(path_to_folder, sentiment):
    contents=[]
    for filename in os.listdir(path_to_folder+sentiment):
        with open(path_to_folder+sentiment+'/'+filename, 'rb') as openfile:
            content=openfile.read()
            contents.append(content)
    return contents


In [6]:
# start with the negative freviews
contents_negative_train=get_review_content(path+'/aclImdb/train/', 'neg')
contents_negative_test=get_review_content(path+'/aclImdb/test/', 'neg')
len(contents_negative_test+contents_negative_train)

25000

In [7]:
# we repeat with the positive reviews
contents_positive_train=get_review_content(path+'/aclImdb/train/', 'pos')
contents_positive_test=get_review_content(path+'/aclImdb/test/', 'pos')
len(contents_positive_test+contents_positive_train)

25000

In [8]:
# we now parse the text with undefined 
contents_undefined=get_review_content(path+'/aclImdb/train/', 'unsup')
len(contents_undefined)

50000

In [9]:
# we can do this whole thing across a number of processors 
# initialize pool of processes
import multiprocessing
p = multiprocessing.cpu_count()

print(p)

16


In [14]:
# we are going to develop a pipeline with Spacy to clean and process the data. First we need to load the large
# english model.
import spacy
nlp = spacy.load('en_core_web_lg')

The main characteristic of SpaCy is the use of the Doc class to hold the documents we will analyse.

In [29]:
nlp(contents_positive_train[11])

TypeError: Argument 'string' has incorrect type (expected str, got bytes)

In [36]:
contents_positive_train[11]

b"I am not a parent, neither am I a male. But I was able to identify with every character's heartaches and pains.<br /><br />This is a movie teenagers should watch. Maybe that way they will start appreciating the value of family again. I'm sorry for those that don't understand the value of love, family and friendship.<br /><br />It was very interesting to watch Patrick Duffy in a different role than that of Bobby Ewing. And it is great to see a 19 year old Ben Affleck giving his best in a moving and sincere performance. He showed at an early age, that he is capable of heartfelt drama. He should be offered more serious roles. Note Hollywoodland... his first serious role in years and he went out and won Best Actor at the Venice Festival in 2006.<br /><br />This movie can be appreciated by people of all ages. Maybe shouldn't be watched by children under 10 because they might get scared that the same may happen to their families, but I recommend it to the entire family.<br /><br />I bought

In [35]:
test_=[token.lemma_ for token in nlp(contents_positive_train[11].decode("utf-8"))]
test_

['-PRON-',
 'be',
 'not',
 'a',
 'parent',
 ',',
 'neither',
 'be',
 '-PRON-',
 'a',
 'male',
 '.',
 'but',
 '-PRON-',
 'be',
 'able',
 'to',
 'identify',
 'with',
 'every',
 'character',
 "'s",
 'heartache',
 'and',
 'pains.<br',
 '/><br',
 '/>this',
 'be',
 'a',
 'movie',
 'teenager',
 'should',
 'watch',
 '.',
 'maybe',
 'that',
 'way',
 '-PRON-',
 'will',
 'start',
 'appreciate',
 'the',
 'value',
 'of',
 'family',
 'again',
 '.',
 '-PRON-',
 'be',
 'sorry',
 'for',
 'those',
 'that',
 'do',
 'not',
 'understand',
 'the',
 'value',
 'of',
 'love',
 ',',
 'family',
 'and',
 'friendship.<br',
 '/><br',
 '/>it',
 'be',
 'very',
 'interesting',
 'to',
 'watch',
 'patrick',
 'duffy',
 'in',
 'a',
 'different',
 'role',
 'than',
 'that',
 'of',
 'bobby',
 'ewing',
 '.',
 'and',
 '-PRON-',
 'be',
 'great',
 'to',
 'see',
 'a',
 '19',
 'year',
 'old',
 'ben',
 'affleck',
 'give',
 '-PRON-',
 'good',
 'in',
 'a',
 'move',
 'and',
 'sincere',
 'performance',
 '.',
 '-PRON-',
 'show',
 'at',


In [25]:
contents_positive_train[11]

b"I am not a parent, neither am I a male. But I was able to identify with every character's heartaches and pains.<br /><br />This is a movie teenagers should watch. Maybe that way they will start appreciating the value of family again. I'm sorry for those that don't understand the value of love, family and friendship.<br /><br />It was very interesting to watch Patrick Duffy in a different role than that of Bobby Ewing. And it is great to see a 19 year old Ben Affleck giving his best in a moving and sincere performance. He showed at an early age, that he is capable of heartfelt drama. He should be offered more serious roles. Note Hollywoodland... his first serious role in years and he went out and won Best Actor at the Venice Festival in 2006.<br /><br />This movie can be appreciated by people of all ages. Maybe shouldn't be watched by children under 10 because they might get scared that the same may happen to their families, but I recommend it to the entire family.<br /><br />I bought

In [16]:
text = (u"When Sebastian Thrun started working on self-driving cars at "
        u"Google in 2007, few people outside of the company took him "
        u"seriously. “I can tell you very senior CEOs of major American "
        u"car companies would shake my hand and turn away because I wasn’t "
        u"worth talking to,” said Thrun, now the co-founder and CEO of "
        u"online higher education startup Udacity, in an interview with "
        u"Recode earlier this week.")
doc = nlp(text)

In [21]:
for entity in doc.ents:
    if entity.label_=='PERSON':
        print(entity.text, entity.label_)

Sebastian Thrun PERSON
Thrun PERSON
Udacity PERSON
Recode PERSON


In [None]:
nlp.add_pipe()

In this case we have 16 processorss we can use in our model.

In [10]:
# We will clean the reviews to strip text from stopwords, remove non alphanumeric characters, stemming etc. 
# We also  make use of the multiprocessing library to parallelize the workload.

from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_non_alphanum


# define the function to clen the text.
def clean_text(text):
    # filter ords greater than 120
    #wl = filter(lambda x: ord(x) < 128, text)

    # pass through pre-processing filter
    wl = preprocess_string(text, filters = [stem_text, strip_numeric, 
                                          strip_punctuation, remove_stopwords, 
                                          strip_short, strip_non_alphanum])
    return wl
                    
    #clean_text = p.map(clean_text, labeled['review'].values.tolist())

In [11]:
# we now check if we are using the fast version of Gensim that optimizes training times
assert gensim.models.doc2vec.FAST_VERSION> -1

In [12]:
# we now clean the text using the predefined function
with multiprocessing.Pool(processes=8) as pool:
    undefined_clean=pool.map(clean_text, contents_undefined)
    train_positive_clean=pool.map(clean_text, contents_positive_train)
    train_negative_clean=pool.map(clean_text, contents_negative_train)

Discussion of the new interface of gensim and how if saves file in special format to speed up 
calculations of word vectors specially after using more than 8 cores.

In [88]:
#undefined_clean[0]

In [13]:
# serialize the preprocessed corpus into a single file on disk, using memory-efficient streaming
# we do this for unlabeled data, train and test.
from gensim.utils import save_as_line_sentence
#from smart_open import save_as_line_sentence
save_as_line_sentence(undefined_clean, path+'/undefined_clean.txt')
save_as_line_sentence(train_positive_clean, path+'/train_positive_clean.txt')
save_as_line_sentence(train_negative_clean, path+'/tran_negative_clean.txt')


In [None]:

#we now create a dataframe with the reviews and the sentiment.
imdb_train=pd.DataFrame({'reviews':contents_positive_train+contents_undefined})
len(imdb_train)
#imdb_train.head()


In [55]:
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
import time

start_time = time.time()
model_sent = Doc2Vec(corpus_file=path+'/undefined_clean.txt', epochs=25, vector_size=200, 
                     workers=16,dm=1, dm_mean=1, alpha=0.01, seed=27)
sent_time = time.time() - start_time

In [22]:
def vec_for_learning(model, tagged_docs): 
    sents = tagged_docs.values 
    targets, regressors = zip(*[(doc.tags[0],
                                 model.infer_vector(doc.words, steps=20)) for doc in sents])     return targets, regressors

28.746666193008423

In [84]:
#model_sent.docvecs.most_similar(4)
#contents_undefined[4]
#contents_undefined[38715]
#from scipy.spatial.distance import cosine

a=model_sent.infer_vector(doc_words=b,epochs=30, steps=50,alpha=0.0025 )
model_sent.docvecs.most_similar([a])

[(0, 0.760078489780426),
 (27582, 0.6653116345405579),
 (47462, 0.6626892685890198),
 (24728, 0.6495351791381836),
 (28954, 0.6429033875465393),
 (43891, 0.6380195617675781),
 (29857, 0.6364273428916931),
 (11530, 0.6309794783592224),
 (1871, 0.6225343942642212),
 (11240, 0.6160120964050293)]

In [87]:
gensim.models.keyedvectors.BaseKeyedVectors.distance(a, model_sent.docvecs[0])

TypeError: distance() missing 1 required positional argument: 'entity2'

In [78]:
infile = open(path+'/undefined_clean.txt', 'r')
firstLine = infile.readline()
b=firstLine.split()
#==undefined_clean[0]

In [None]:
# we now use then new interface of gensim to save the data into disk and then feed that information to the model
import itertools
from gensim.parsing.preprocessing import preprocess_documents

# defined a function to stream the clean text 
def processed_corpus(text_to_save):
    for index,article in enumerate(text_to_save):
        # concatenate all section titles and texts of each Wikipedia article into a single "sentence"
        print (index)
        doc = '\n'.join(itertools.chain.from_iterable(zip('undefined'+str(index), article)))
        yield (doc)
        print(doc)