# CS4248 Project Group 23

In [1]:
# If you wish to run this on Google Colab, mount the Google drive by running this cell or click the `files` icon on the left navbar
# and click mount Google Drive (it takes some time to load)
# from google.colab import drive
# drive.mount('/content/drive')

# %cd "/content/drive/My Drive/<The path to this notebook in your Google Drive>"
# !cd "/content/drive/My Drive/<The path to this notebook in your Google Drive>"

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score


In [3]:
# Unzip raw_data.zip locally
import zipfile
with zipfile.ZipFile('raw_data.zip', 'r') as zip_ref:
    zip_ref.extractall()

Feature Engineering: Capture various features of the text (e.g. punctuation, stopwords, statement length). 
Test out different tokenizers to capture their performance.


In [4]:
import re
import string
from nltk.tokenize import word_tokenize

# todo parallelize this in future


from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize
    words = word_tokenize(text)
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    # Convert list of words back to string
    text = ' '.join(words)
    return text

In [5]:
train = pd.read_csv("./raw_data/fulltrain.csv", names=['Verdict', 'Text'])
test = pd.read_csv("./raw_data/balancedtest.csv", names=['Verdict', 'Text'])

# Preprocess all documents in the corpus
preprocessed_corpus = [preprocess_text(text) for text in train['Text']]

# Fit the TF-IDF vectorizer on the preprocessed corpus
X_train = vectorizer.fit_transform(preprocessed_corpus)
X_test = vectorizer.transform([preprocess_text(text) for text in test['Text']])

y_train = train['Verdict'].apply(lambda x: 1 if x == 4 else 0) # convert to binary- label 4 = trusted
y_test = test['Verdict'].apply(lambda x: 1 if x == 4 else 0) # convert to binary- label 4 = trusted

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - '/home/t1dus/nltk_data'
    - '/home/t1dus/anaconda3/nltk_data'
    - '/home/t1dus/anaconda3/share/nltk_data'
    - '/home/t1dus/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
y_train.unique()

array([0, 1], dtype=int64)

Test out different kinds of models and find the most effective architectures.|

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = MultinomialNB()


Perform hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# or perform hyperparameter tuning
# Create the hyperparameters grid

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False],  # Whether to learn class prior probabilities or not
    'class_prior': [None, [0.9, 0.1], [0.8, 0.2], [0.7, 0.3], [0.6, 0.4], [0.5, 0.5], [0.4, 0.6], [0.3, 0.7], [0.2, 0.8], [0.1, 0.9]]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Train
grid_search.fit(X_train, y_train)


hyperparams = str(grid_search.best_params_)
print (hyperparams)

# Use best model
model = grid_search.best_estimator_

{'alpha': 0.1, 'class_prior': [0.7, 0.3], 'fit_prior': True}


In [None]:
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Training F1 score is:", f1_score(y_train, y_pred_train))
print("Test F1 score is:", f1_score(y_test, y_pred_test))

Training F1 score is: 0.875887119307069
Test F1 score is: 0.6834782608695652


In [None]:
# save new model
# best model now is {alpha 0.1, class_prior [0.7, 0.3], fit_prior: True}
import joblib
joblib.dump(model, './sklearn_models/Naive Bayes TFIDF ' + str(f1_score(y_test, y_pred_test)) + 'tfidf.pkl')

['./sklearn_models/Naive Bayes TFIDF 0.6834782608695652tfidf.pkl']