# Vectorization with Doc2Vec

In [3]:
import pandas as pd
import numpy as np

import pickle

from sklearn import utils
from sklearn.metrics import classification_report

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import OrderedDict
import multiprocessing

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

In [7]:
with open('../Capstone_Final/Capstone_Public/Data/tweets2_df', 'rb') as f:
    tweets = pickle.load(f)

In [8]:
tweets.head()

Unnamed: 0,user,date,text
0,rocknlau19,2018-10-22 10:37:23,"School may be out this week, but the cast of #..."
1,abdullahgrms,2018-10-22 10:35:54,A new scene between Newt and Leta in #Fantasti...
2,Baxx_94,2018-10-22 10:31:07,".@jk_rowling stopped by the @TODAYshow in NY, ..."
3,MoviesSwords,2018-10-22 10:31:07,The cast of #FantasticBeasts: The Crimes of Gr...
4,MoviesDundrum,2018-10-22 10:31:02,The cast of #FantasticBeasts: The Crimes of Gr...


*You will want to preprocess the text before vectorization normally*

**Step 1: Tag Documents**

In [9]:
tagged_documents = []
for indx, doc in enumerate(tweets["text"].values):
    tagged_documents.append(TaggedDocument([x for x in doc.split()], [indx]))

**Step 2: Define Models**

In [10]:
cores = multiprocessing.cpu_count()
vec_size = 300

model_dbow = Doc2Vec(dm=0, dbow_words=1, vector_size=vec_size, negative=5, hs=0, min_count=2, sample=0, 
             workers=cores)

model_dm_mean = Doc2Vec(dm=1, dm_mean=1, vector_size=vec_size, window=10, negative=5, hs=0, min_count=2, sample=0, 
                workers=cores, alpha=0.05, comment='alpha=0.05')

model_dm_concat = Doc2Vec(dm=1, dm_concat=1, vector_size=vec_size, window=5, negative=5, hs=0, min_count=2, sample=0, 
                  workers=cores)

**Step 3: Build Vocabulary**

In [11]:
models = [(model_dbow, 'model_dbow'), (model_dm_mean, 'model_dm_mean'), (model_dm_concat, 'model_dm_concat')]

for model in models:
    model[0].build_vocab(tagged_documents)
    print("%s vocabulary scanned & state initialized" % model[0])
    
models_by_name = OrderedDict((str(model[1]), model[0]) for model in models)

Doc2Vec(dbow+w,d300,n5,w5,mc2,t4) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4) vocabulary scanned & state initialized
Doc2Vec(dm/c,d300,n5,w5,mc2,t4) vocabulary scanned & state initialized


**Step 4: Train the Models**

In [12]:
for model in models:
    for epoch in range(5):
        print('Epoch: {0}'.format(epoch), 'Model: %s' % (model[0]))
        model[0].train(utils.shuffle(tagged_documents), total_examples=len(tagged_documents), epochs=1)
        model[0].alpha -= 0.002
        model[0].min_alpha = model[0].alpha

Epoch: 0 Model: Doc2Vec(dbow+w,d300,n5,w5,mc2,t4)
Epoch: 1 Model: Doc2Vec(dbow+w,d300,n5,w5,mc2,t4)
Epoch: 2 Model: Doc2Vec(dbow+w,d300,n5,w5,mc2,t4)
Epoch: 3 Model: Doc2Vec(dbow+w,d300,n5,w5,mc2,t4)
Epoch: 4 Model: Doc2Vec(dbow+w,d300,n5,w5,mc2,t4)
Epoch: 0 Model: Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4)
Epoch: 1 Model: Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4)
Epoch: 2 Model: Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4)
Epoch: 3 Model: Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4)
Epoch: 4 Model: Doc2Vec("alpha=0.05",dm/m,d300,n5,w10,mc2,t4)
Epoch: 0 Model: Doc2Vec(dm/c,d300,n5,w5,mc2,t4)
Epoch: 1 Model: Doc2Vec(dm/c,d300,n5,w5,mc2,t4)
Epoch: 2 Model: Doc2Vec(dm/c,d300,n5,w5,mc2,t4)
Epoch: 3 Model: Doc2Vec(dm/c,d300,n5,w5,mc2,t4)
Epoch: 4 Model: Doc2Vec(dm/c,d300,n5,w5,mc2,t4)
