# Doc2Vec Embedding

Text Cleaning Step:
1.	Delete unreliable articles
2.	Delete stopwords (using NLTK package)
3.	Remove non-word characters: only keep letters and numbers
4.	Lower all letters
5.  Concat Title and Text

In [111]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

In [81]:
file = open('2018_07_19_04_59_08/articles.txt', 'r', encoding = 'utf8')

In [82]:
#articles.txt is a file containing article records, one record per row, column definition is:
columns = ["pubId", "is_hourly", "seqId", "on_homepage", "canonicalUrl",
                   "firstScrape", "lang_iso", "lang_reliability", "title", "text"]
articles_dt = file.read().split('\n')[:-1]

In [83]:
pubId, canonicalUrl,firstScrape,title,text,lang_reliability = [],[],[],[],[],[]
for article in articles_dt:    
    row = article.split('\t')
    pubId.append(row[0])
    canonicalUrl.append(row[4])
    firstScrape.append(row[5])
    lang_reliability.append(row[7])
    title.append(row[8])
    text.append(row[9])
articles_df = pd.DataFrame()

In [84]:
articles_df['pubId'], articles_df['canonicalUrl'], articles_df['firstScrape'], articles_df['title'], articles_df['text'], articles_df['lang_reliability']= pubId, canonicalUrl,firstScrape,title,text,lang_reliability

In [85]:
articles_df.shape

(213605, 6)

In [86]:
#exclude the lang_reliability = 0 which the detected language is not reliable
articles_df = articles_df[articles_df['lang_reliability'] == '1']
articles_df.shape

(211577, 6)

In [88]:
articles_df['title_text'] = articles_df['title'] + ' '+ articles_df['text']

In [91]:
## text normzalization
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a paragrapy
        
        return: modified initial string
    """
    text = re.sub(r"[^\w\s]", '', text) ## Remove all non-word characters (everything except numbers and letters)
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
articles_df['text'] = articles_df['text'].apply(clean_text)
articles_df['title_text'] = articles_df['title_text'].apply(clean_text)

In [92]:
title_text_df = articles_df[['title_text']]

In [10]:
text_df = articles_df[['text']]

In [19]:
text_df.to_csv('text_df.csv')

In [99]:
title_text_df.to_csv('title_text_df.csv')

## Doc2Vec

In [14]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [56]:
mini = text_df.head()

In [58]:
tagged_data = [Tagge dDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(text_df.text)]

In [61]:
len(tagged_data)

211577

In [65]:
max_epochs = 20
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
Model Saved


In [75]:
print(model.docvecs[3])

[-0.04742454 -3.9067266   0.6358228   0.75249374  1.7423975   0.9739114
  0.7258888  -0.41517594  3.0489283  -2.9445236  -0.28707018 -0.79701144
  4.063946    1.6476473  -0.5928256   1.7204026  -4.7911644   1.914418
  1.8326075   0.27115056 -0.75462824 -2.2386963   1.6526892  -1.9546459
  0.7244203   1.4253081  -0.5431972  -1.7073781  -0.61894894 -3.1310725
  2.6362584   2.584425    1.6833615  -0.25219202  0.9889768   1.0735435
  1.9781995   2.0766425   2.4369762  -1.143024   -0.46532598 -0.02445841
 -0.50257665  4.0439854   2.68998     1.2311356  -1.826763   -0.9050933
  0.93818873 -0.7991395   0.54384965  0.55795205 -1.343361   -1.3555621
  1.5750258   3.9228294   0.5858833  -2.3397343   1.7796873   2.248494
 -0.36234215  0.8671693  -2.360166    0.05404849 -3.21477    -2.087069
  2.2954795   0.81484246 -1.8066285   1.2576919   0.9409692   1.4169419
  5.494096   -1.8293006   1.9247261   0.23652166  3.796841    2.6306183
 -0.6738159  -2.817919    1.8070489   1.5071226   1.7859106   0.0

In [77]:
import pandas as pd
list_of_lists = []
for i in range(text_df.shape[0]):
    list_of_lists.append(model.docvecs[i])

d2v_vec20 = pd.DataFrame(list_of_lists)

In [79]:
d2v_vec20.to_csv('./result/d2v_vec100.csv')

**vec_size=100**

In [72]:
max_epochs = 20
vec_size = 100
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
Model Saved


In [None]:
import pandas as pd
list_of_lists = []
for i in range(text_df.shape[0]):
    list_of_lists.append(model.docvecs[i])

d2v_vec100 = pd.DataFrame(list_of_lists)

In [None]:
d2v_vec100.to_csv('./result/d2v_vec100.csv')

**Concat title and text**

In [101]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(title_text_df.title_text)]

In [105]:
def doc2vec(max_epochs, vec_size, alpha, tagged_data):
    model = Doc2Vec(size=vec_size,
                    alpha=alpha, 
                    min_alpha=0.00025,
                    min_count=1,
                    dm =1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    model.save("d2v.model")
    print("Model Saved")
    return model

In [106]:
model_titletext = doc2vec(20, 100, 0.025, tagged_data)

iteration 0


  


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
Model Saved


In [107]:
print(model.docvecs[3])

[-0.04742454 -3.9067266   0.6358228   0.75249374  1.7423975   0.9739114
  0.7258888  -0.41517594  3.0489283  -2.9445236  -0.28707018 -0.79701144
  4.063946    1.6476473  -0.5928256   1.7204026  -4.7911644   1.914418
  1.8326075   0.27115056 -0.75462824 -2.2386963   1.6526892  -1.9546459
  0.7244203   1.4253081  -0.5431972  -1.7073781  -0.61894894 -3.1310725
  2.6362584   2.584425    1.6833615  -0.25219202  0.9889768   1.0735435
  1.9781995   2.0766425   2.4369762  -1.143024   -0.46532598 -0.02445841
 -0.50257665  4.0439854   2.68998     1.2311356  -1.826763   -0.9050933
  0.93818873 -0.7991395   0.54384965  0.55795205 -1.343361   -1.3555621
  1.5750258   3.9228294   0.5858833  -2.3397343   1.7796873   2.248494
 -0.36234215  0.8671693  -2.360166    0.05404849 -3.21477    -2.087069
  2.2954795   0.81484246 -1.8066285   1.2576919   0.9409692   1.4169419
  5.494096   -1.8293006   1.9247261   0.23652166  3.796841    2.6306183
 -0.6738159  -2.817919    1.8070489   1.5071226   1.7859106   0.0

In [108]:
import pandas as pd
list_of_lists = []
for i in range(text_df.shape[0]):
    list_of_lists.append(model.docvecs[i])

d2v_vec100_titletext = pd.DataFrame(list_of_lists)

In [109]:
d2v_vec100_titletext.shape

(211577, 100)

In [110]:
d2v_vec100.to_csv('./result/d2v_vec100_titletext.csv')