<a href="https://colab.research.google.com/github/krystaldowling/MAST30034_Final_Project-/blob/machine_learning_doc2vec/final_classifier_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
import numpy as np
import scipy

# import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from gensim.test.utils import get_tmpfile

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score


In [4]:
news_data = pd.read_csv("/content/drive/My Drive/Data/final_preproccessed_data.csv")

news_data.index.name = 'index'


In [5]:
# split into training and testing for doc2vec
X_train, X_test, y_train, y_test = train_test_split(news_data['text'], news_data['label'], test_size=0.25, 
                                                        random_state=88)

In [6]:
def tokenise_data(doc):
  # Tokenization of each document and remove punctuation
  tokenized_doc = []
  for d in doc:
    if d not in string.punctuation:
      tokenized_doc.append(word_tokenize(d.lower()))

  return tokenized_doc

In [9]:
train_tokenized_doc = tokenise_data(X_train)

In [10]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(train_tokenized_doc)]

## Train doc2vec model
doc2vec_model = Doc2Vec(tagged_data, vector_size=100, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
doc2vec_model.save("/content/drive/My Drive/Data/trained_doc2vec.model")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
## Load saved doc2vec model
doc2vec_model= Doc2Vec.load("/content/drive/My Drive/Data/trained_doc2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [23]:
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

In [19]:
train_vectors = np.load("/content/drive/My Drive/Data/trained_doc2vec.model.wv.vectors.npy")

In [25]:
# infer text with trained doc2vec model
X_train_df['doc2vec_text'] = X_train_df['text'].map(lambda x:doc2vec_model.infer_vector(x))
X_test_df['doc2vec_text'] = X_test_df['text'].map(lambda x:doc2vec_model.infer_vector(x))

In [26]:
X_train_df.to_csv('/content/drive/My Drive/Data/doc2vec_train.csv')
X_test_df.to_csv('/content/drive/My Drive/Data/doc2vec_test.csv')

In [27]:
# this is only the text data which has been converted to a data frame.
# each row is one article and each column is an element of the vector. 

train_vec_data = pd.DataFrame(X_train_df['doc2vec_text'].array, columns = range(100))
test_vec_data = pd.DataFrame(X_test_df['doc2vec_text'].array, columns = range(100))

In [30]:
all_vec_data = pd.concat([train_vec_data, test_vec_data])
label_concat = pd.concat([y_train ,y_test])

In [36]:
# grid search for best Log Reg parameters
gs = GridSearchCV(LogisticRegression(), 
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1], 
                         'solver':['newton-cg', 'liblinear', 'saga']}, 
                          scoring="accuracy", cv=4)
gs = gs.fit(all_vec_data, label_concat)
print(gs.best_params_)
print('best score: {:3f}'.format(gs.best_score_))



{'C': 1, 'solver': 'saga'}
best score: 0.758452




In [37]:
lr_best = LogisticRegression(C = 1, solver='saga')
lr_best.fit(train_vec_data, y_train)
y_pred = lr_best.predict(test_vec_data)
acc = lr_best.score(test_vec_data, y_test)
print('accuracy score',  acc)
print('F1 score: {:3f}'.format(f1_score(y_test, y_pred)))

accuracy score 0.7522039875220399
F1 score: 0.708194




In [34]:
# grid search for best SVM kernel
gs = GridSearchCV(SVC(C=1), 
             param_grid={'kernel':['linear', 'poly', 'rbf', 'sigmoid']}, 
                          scoring="accuracy", cv=4)
gs = gs.fit(all_vec_data, label_concat)
print(gs.best_params_)
print('best score: {:3f}'.format(gs.best_score_))

{'kernel': 'rbf'}
best score: 0.781275


In [38]:
svm_best = SVC(C = 1, kernel='rbf')
svm_best.fit(train_vec_data, y_train)
y_pred = svm_best.predict(test_vec_data)
acc = svm_best.score(test_vec_data, y_test)
print('accuracy score',  acc)
print('F1 score: {:3f}'.format(f1_score(y_test, y_pred)))

accuracy score 0.7796012477960125
F1 score: 0.747553
