In [None]:
# Read preprocessed articles and tags
import pandas as pd
import re

# The two files are read as parallel, line-per-article lists: each line of the
# articles file becomes one 'content' entry and each line of the tags file is
# parsed into the matching 'tags' entry.
tag_counts = {}       # tag -> number of occurrences over all articles
total_tag_count = 0   # total number of tag occurrences (denominator of the tag score)
processed_articles = {'content': [], 'tags': []}

# `with` guarantees both handles are closed even if reading or parsing raises.
with open('new_preprocessed_articles.txt', 'r', encoding='utf8') as fa, \
        open('new_preprocessed_articles_tags.txt', 'r', encoding='utf8') as ft:
    processed_articles['content'] = [re.sub('\\n', '', str(article)) for article in fa]
    for tag in ft:
        # A tags line looks like "[a, b, c]\n": strip the brackets/newline and split on ", ".
        corresponding_tags = re.split(', ', str(tag).replace('[', '').replace(']', '').replace('\n', ''))
        processed_articles['tags'].append(corresponding_tags)
        # Count occurrences while reading, so no full list of tags has to be
        # kept and re-scanned once per distinct tag.
        total_tag_count += len(corresponding_tags)
        for tag_name in corresponding_tags:
            tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1

df_articles = pd.DataFrame(data=processed_articles)

# Calculate each tag's score from its frequency distribution:
# score = occurrences of the tag / total number of article tags.
tag_scores = {tag_name: count / total_tag_count for tag_name, count in tag_counts.items()}

# Calculate each article's score as the sum of the scores of its tags.
score = [sum([tag_scores[tag_name] for tag_name in article_tags]) for article_tags in df_articles['tags']]
df_articles['score'] = score

# Display the articles ordered by score. The result is intentionally not
# assigned back, so df_articles keeps its original row order for later cells.
df_articles.sort_values(by='score', ascending=False)

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import doc2vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from tqdm import tqdm
import numpy as np

def label_sentences(corpus, label_type):
    """
    Wrap every document of `corpus` in a gensim TaggedDocument.

    Doc2Vec needs each document to carry a tag. Every document is split on
    whitespace and tagged with the single label "<label_type>_<i>", where i is
    the document's position in `corpus` (e.g. "Train_0", "Test_3").

    Returns the tagged documents as a list, in corpus order.
    """
    return [
        doc2vec.TaggedDocument(text.split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]

In [None]:
# Encode each article's tag list as a binary indicator row.
mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(df_articles['tags'])

# Hold out 30% of the articles for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_articles['content'],
    binarized_labels,
    test_size=0.3,
    random_state=0,
)

# Tag each split with its own prefix so the document vectors can later be
# looked up as "Train_<i>" / "Test_<i>".
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')

# Doc2Vec is trained on the tagged documents of both splits.
all_data = X_train + X_test

## Train the model

We'll first instantiate a Doc2Vec Distributed Bag of Words (DBOW) model. In the Word2Vec architecture, the two algorithm names are “continuous bag of words” (cbow) and “skip-gram” (sg); in the Doc2Vec architecture, the corresponding algorithms are “distributed bag of words” (dbow) and “distributed memory” (dm).

## DBOW

DBOW is the Doc2Vec model analogous to Skip-gram model in Word2Vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.

Training a Doc2Vec model is rather straightforward in Gensim: we initialize the model and train it for 30 epochs.

`dm=0` selects “distributed bag of words” (DBOW); `min_count=2` ignores all words with a total frequency lower than 2; `vector_size=300` is the dimensionality of the generated feature vectors; `alpha=0.065` is the initial learning rate. Gensim normally lets the learning rate drop linearly to `min_alpha` as training progresses; here `min_alpha` is set equal to `alpha` and the rate is instead lowered manually after every epoch in the training loop. We then build the vocabulary.

In [None]:
# DBOW model (dm=0). The keyword is `epochs`; the previous spelling `epoches`
# is not a Doc2Vec parameter. alpha and min_alpha are deliberately equal:
# the learning rate is decayed by hand in the training loop.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=2, epochs=30, alpha=0.065, min_alpha=0.065)
# Scan every tagged document (train and test) once to build the vocabulary.
model_dbow.build_vocab([x for x in tqdm(all_data)])

In [None]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [None]:
# Persist the trained DBOW model to disk so it can be reloaded later without retraining.
model_dbow.save('d2v_dbow_model.doc2vec')

## Distributed Memory

Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document.

We again instantiate a Doc2Vec model, this time in DM mode, with a vector size of 300 dimensions, and iterate over the training corpus 30 times.

In [None]:
# DM model (dm=1) that averages the context vectors (dm_mean=1). The keyword
# is `epochs`; the previous spelling `epoches` is not a Doc2Vec parameter.
# alpha and min_alpha are deliberately equal: the learning rate is decayed by
# hand in the training loop.
model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=2, epochs=30, workers=5, alpha=0.065, min_alpha=0.065)
# Scan every tagged document (train and test) once to build the vocabulary.
model_dm.build_vocab([x for x in tqdm(all_data)])

In [None]:
%%time
for epoch in range(30):
    model_dm.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dm.alpha -= 0.002
    model_dm.min_alpha = model_dm.alpha

In [None]:
# Persist the trained DM model. This must save model_dm: saving model_dbow
# here would write the DBOW weights under the DM file name.
model_dm.save('d2v_dm_model.doc2vec')

### Get document vectors from doc2vec model

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect the learned document vectors of one split into a 2-D array.

    Looks up `model.docvecs["<vectors_type>_<i>"]` for i in 0..corpus_size-1
    and stores each vector as row i of a (corpus_size, vectors_size) array
    created with np.zeros, which is returned.
    """
    vectors = np.zeros((corpus_size, vectors_size))
    doc_tags = (vectors_type + '_' + str(index) for index in range(corpus_size))
    for row, doc_tag in enumerate(doc_tags):
        vectors[row] = model.docvecs[doc_tag]
    return vectors

In [None]:
# Uncomment to reload previously saved models instead of retraining:
# model_dbow = Doc2Vec.load('d2v_dbow_model.doc2vec')
# model_dm = Doc2Vec.load('d2v_dm_model.doc2vec')

n_train_docs, n_test_docs = len(X_train), len(X_test)

# 300-dimensional document vectors learned by the DBOW model.
train_vectors_dbow = get_vectors(model_dbow, n_train_docs, 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, n_test_docs, 300, 'Test')

# 300-dimensional document vectors learned by the DM model.
train_vectors_dm = get_vectors(model_dm, n_train_docs, 300, 'Train')
test_vectors_dm = get_vectors(model_dm, n_test_docs, 300, 'Test')

## Model pairing

Combining the paragraph vectors from Distributed Bag of Words (DBOW) and Distributed Memory (DM) improves performance.

In [None]:
# Free memory: drop training-only state from both models while keeping the
# document-tag vectors and the inference weights.
for trained_model in (model_dbow, model_dm):
    trained_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [None]:
def get_concat_vectors(model1, model2, corpus_size, vectors_size, vectors_type):
    """
    Build a 2-D array whose row i is model1's vector for document
    "<vectors_type>_<i>" followed by model2's vector for the same document.

    `vectors_size` is the width of the returned (corpus_size, vectors_size)
    array, i.e. the combined length of the two concatenated vectors.
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for row in range(corpus_size):
        doc_tag = vectors_type + '_' + str(row)
        first_part = np.ravel(model1.docvecs[doc_tag])
        second_part = np.ravel(model2.docvecs[doc_tag])
        vectors[row] = np.concatenate((first_part, second_part))
    return vectors

In [None]:
# 600 = 300 (DBOW) + 300 (DM): each row is the two models' vectors side by side.
train_vecs_dbow_dm = get_concat_vectors(
    model_dbow, model_dm,
    corpus_size=len(X_train), vectors_size=600, vectors_type='Train',
)
test_vecs_dbow_dm = get_concat_vectors(
    model_dbow, model_dm,
    corpus_size=len(X_test), vectors_size=600, vectors_type='Test',
)

### Train the classifier. Here we try to use SGDClassifier for large-scale corpus datasets

In [None]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import TruncatedSVD

classifier = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lsa", TruncatedSVD(n_components=1000)),
    ('clf', OneVsRestClassifier(SGDClassifier(), n_jobs=-1))
])

classifier.fit(train_vecs_dbow_dm, y_train)

In [None]:
# Predict the binarized tag matrix for the held-out test documents.
predicted = classifier.predict(test_vecs_dbow_dm)

In [None]:
from sklearn.metrics import f1_score, accuracy_score


print "Accuracy score: %.5f" % accuracy_score(y_test, predicted)
print "F1 score: %.5f" % f1_score(y_test, predicted, average='micro')