# Vector Space Modelling for text : tf-idf and doc2vec

In [252]:
#import all modules
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from gensim.models import doc2vec
from collections import namedtuple

# TF-IDF LOGISTIC REGRESSION

In [34]:
def get_tag_and_training_data(filename):
    '''takes the input file and returns  tokenized sentences and document tags as separate lists'''
    tags=[]
    documents=[]
    line_counter=1
    with open(filename) as f:
        for line in f:
            #skip first line
            if line_counter==1:
                line_counter=line_counter+1
                continue
            #Initialize the token list for line
            tags.append(int(line[:1]))
            documents.append(line[2:])
    return tags,documents
        

In [35]:
Y,X=get_tag_and_training_data('Data/trainingdata.txt')

In [86]:
#75:25 training test split
Y_train,Y_test=Y[:4120],Y[4120:]
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X)
freq_term_matrix = count_vectorizer.transform(X)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.transform(freq_term_matrix)

In [96]:
#train logistic regression model
X_train,X_test=tf_idf_matrix[:4120],tf_idf_matrix[4120:]
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train,Y_train)
pred=logreg.predict(X_test)
accuracy_score(Y_test, pred)

0.95457875457875463

In [101]:
# 2nd Category is BUSINESS NEWS,so lets test out a news peice on TESLA
logreg.predict(tfidf.transform((count_vectorizer.transform(["Tesla Inc. said on Friday it had raised about $1.2-billion (U.S.), roughly 20 per cent more than it had planned, by selling common shares and convertible debt, ahead of the launch of the crucial Model 3 sedan."]))))

array([2])

# TF-IDF NAIVE BAYES

In [261]:
#initialize the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train,Y_train)
nb_pred=clf.predict(X_test)
accuracy_score(Y_test, nb_pred)

0.73626373626373631

# Doc2Vec Logisitc Regression

In [244]:
# data already loaded as lists of sentences in X and Y

docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(X):
    words = text.lower().split()
    tags = [i]
    docs.append(analyzedDocument(words, tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, size = 160, window = 10, min_count = 7, workers = 4)

In [245]:
#making training and test sets
wb_Y_train,wb_Y_test=Y_train,Y_test
wb_X=[]
for i in range(len(X)):
    wb_X.append(model.docvecs[i])
wb_X_train=wb_X[:4120]
wb_X_test=wb_X[4120:]
        

# Word Embeddings Logistic Regression 

In [249]:
wb_logreg = linear_model.LogisticRegression(C=1e4)
wb_logreg.fit(wb_X_train,wb_Y_train)
wb_pred=wb_logreg.predict(wb_X_test)
accuracy_score(wb_Y_test, wb_pred)

0.76190476190476186

# Word Embeddings Naive Bayes

In [262]:
wb_clf = GaussianNB()
wb_clf.fit(wb_X_train,wb_Y_train)
wb_nb_pred=wb_clf.predict(wb_X_test)
accuracy_score(wb_Y_test, wb_nb_pred)

0.58388278388278392