# Naive Bayes Baseline Model

In [16]:
import pandas as pd
import numpy as np

#Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

#Seed for reproducibility
import random

# Single seed shared by every stochastic step in the notebook.
seed_value = 42

# Seed both the standard-library and the NumPy global generators.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed_value)

## Read data 

In [20]:
# Load the processed dataset; lines pandas cannot parse are skipped.
raw_df = pd.read_csv("../data/processed/cleaned_data_1.csv", on_bad_lines='skip')

# Per-column count of missing values, taken before incomplete rows are removed.
num_nan = raw_df.isna().sum()

# Keep only the rows that have no missing value in any column.
df = raw_df.dropna()

In [23]:
# Distinct author IDs, in order of first appearance in the data.
possible_labels = df['AuthorID'].unique()

# Map every author ID to an integer class index (0 .. number of authors - 1).
label_dict = {author_id: index for index, author_id in enumerate(possible_labels)}

# Series.map does a direct one-to-one lookup. It replaces Series.replace(dict),
# which is much slower and relies on a dtype downcast that pandas has deprecated.
df['label'] = df['AuthorID'].map(label_dict)

# Integer class indices, in the same order as possible_labels.
labels = list(label_dict.values())

# Largest value of the 'text_len' column.
max_len = df['text_len'].max()

## Data split

In [24]:
# Features are the cleaned tweet texts, targets are the integer author labels.
X = df['text_clean']
y = df['label']

# Step 1: hold out 20% of the data as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed_value,
)

# Step 2: take 10% of the remaining training data as the validation set,
# again stratified by label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=seed_value,
)

In [8]:
# Number of training samples per class, printed as rows of [label, count].
unique, counts = np.unique(y_train, return_counts=True)
print(np.column_stack((unique, counts)))

[[   0 4412]
 [   1 5806]
 [   2 6532]
 [   3 3688]
 [   4 1009]
 [   5 4457]
 [   6  652]
 [   7 4394]
 [   8 1370]]


The counts above show that the classes are imbalanced. We therefore randomly oversample the training set so that every class has as many tweets as the largest one (the most active player on Twitter).

In [25]:
# Re-sample data: randomly duplicate minority-class rows of the training set
# until every class has as many samples as the largest class.

# Pass the seed explicitly so the resampled set does not depend on the state of
# the global NumPy generator (i.e. on which cells ran before this one).
ros = RandomOverSampler(random_state=seed_value)

# The sampler needs a 2-D feature matrix, so the texts become a single column.
# The targets stay 1-D.
X_resampled, y_resampled = ros.fit_resample(
    np.asarray(X_train).reshape(-1, 1),
    np.asarray(y_train),
)

# Oversampled training set as a frame: one text column and one label column.
train_os = pd.DataFrame({'text_clean': X_resampled[:, 0], 'label': y_resampled})

X_train = train_os['text_clean'].values
y_train = train_os['label'].values

In [26]:
# Per-class sample counts after oversampling, printed as rows of [label, count].
unique, counts = np.unique(y_train, return_counts=True)
print(np.stack((unique, counts), axis=1))

[[   0 6532]
 [   1 6532]
 [   2 6532]
 [   3 6532]
 [   4 6532]
 [   5 6532]
 [   6 6532]
 [   7 6532]
 [   8 6532]]


## Naive Bayes Model

In [28]:
# CountVectorizer to create bag of words.
# Note: despite its name, `clf` is the vectorizer, not a classifier.

clf = CountVectorizer()
# The vectorizer is fitted on the training texts only ...
X_train_cv =  clf.fit_transform(X_train)
# ... and the already-fitted vectorizer is then applied to the test texts.
X_test_cv = clf.transform(X_test)

In [29]:
# Re-weight the raw token counts with TF-IDF.

tf_transformer = TfidfTransformer(use_idf=True)

# Fit the IDF weights on the training counts and transform them in one step.
X_train_tf = tf_transformer.fit_transform(X_train_cv)

# Apply the weights learned from the training set to the test counts.
X_test_tf = tf_transformer.transform(X_test_cv)

### Train

In [30]:
# Build the multinomial Naive Bayes classifier and fit it on the TF-IDF
# features of the (oversampled) training set.
nb_clf = MultinomialNB().fit(X_train_tf, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
nb_clf

In [31]:
# Get model predictions for test data
# (X_test_tf holds the TF-IDF features computed from X_test.)
nb_pred = nb_clf.predict(X_test_tf)

In [32]:
# Row names for the report: the integer class indices rendered as strings.
labels_string = [str(label) for label in labels]

# Pass `labels` together with `target_names` so each name is tied to its class
# index even if some class is missing from y_test or from the predictions.
print(
    'Classification Report for Naive Bayes:\n',
    classification_report(
        y_test,
        nb_pred,
        labels=labels,
        target_names=labels_string,
    ),
)

Classification Report for Naive Bayes:
               precision    recall  f1-score   support

           0       0.50      0.59      0.54      1225
           1       0.68      0.54      0.60      1613
           2       0.61      0.43      0.51      1815
           3       0.44      0.54      0.49      1024
           4       0.85      0.75      0.80       280
           5       0.58      0.53      0.55      1238
           6       0.14      0.45      0.22       181
           7       0.57      0.54      0.55      1221
           8       0.33      0.53      0.41       381

    accuracy                           0.53      8978
   macro avg       0.52      0.54      0.52      8978
weighted avg       0.56      0.53      0.54      8978



In [34]:
# Save model

import pickle

# Serialize the fitted classifier to disk. The context manager closes the file
# even if pickling fails part-way through.
with open('../models/naive_bayes_model.pickle', 'wb') as f:
    pickle.dump(nb_clf, f)