# Naive Bayes Classifier

### Import relevant libraries

In [None]:
# import relevant libraries

# for data manipulation
import numpy as np
import pandas as pd

# split data into training, validation and test set
from sklearn.model_selection import train_test_split

# for counter
from tqdm import tqdm

# for classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# for oversampling
from imblearn.over_sampling import SMOTE

### Import data after pre-processing

In [None]:
# load the pre-processed review data from disk
data_path = 'df_tokenized.csv'
df = pd.read_csv(data_path)

In [None]:
# check shape: (number of rows, number of columns)
df.shape

(660645, 2)

### Split into training, validation and test set

##### Check dataframe

In [None]:
# preview the first rows of the dataframe
df.head()

Unnamed: 0,tokens,sentiment
0,"['great', 'locat', 'close', 'main', 'public', ...",positive
1,"['famili', 'four', 'thi', 'flat', 'can', 'acco...",positive
2,"['place', 'wonder', 'plenti', 'room', 'us', 'h...",positive
3,"['great', 'locat', 'truli', 'onli', 'coupl', '...",positive
4,"['great', 'place', 'perfect', 'weekend', 'not'...",positive


In [None]:
# count the documents per sentiment label to see the class balance
df['sentiment'].value_counts()

sentiment
positive    654007
negative      6638
Name: count, dtype: int64

In [None]:
# encode the sentiment as a binary target: 1 for 'positive', 0 for everything else
is_positive = df['sentiment'] == 'positive'
y = np.where(is_positive, 1, 0)

##### Split 60:20:20

In [None]:
# hold out 20% of the documents as the test set, stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    df['tokens'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# create training and validation set
# 25% of the remaining 80% yields the 60:20:20 split overall; stratify on the
# labels (as the test split does) so the rare negative class keeps the same
# share in the training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=42,
)

In [None]:
# share of positive labels in the training, validation and test sets
for labels in (y_train, y_val, y_test):
    print(labels.sum() / labels.shape[0])

0.9900753556499078
0.9895859349575037
0.9899492162961954


The classes are strongly imbalanced (about 99% of the documents are positive), so techniques to counter the imbalance have to be applied.

In [None]:
# the smoothing parameter alpha is tuned below with a 5-fold cross-validated
# grid search, so a separate validation set is not needed: fold it back into
# the training data
X_train = pd.concat([X_train, X_val])
y_train = np.concatenate((y_train, y_val))

### Create feature space (DTM)

A computer cannot work with raw text directly, so the texts have to be converted into a numeric feature space (a document-term matrix).

In [None]:
# initiate CountVectorizer with its default settings
vectorizer = CountVectorizer()

In [None]:
# learn the vocabulary from the training set only (the test set is not used here)
vectorizer.fit(X_train)

In [None]:
# create the document-term matrix for the training set (replaces the raw texts in X_train)
X_train = vectorizer.transform(X_train)

In [None]:
# transform the test set with the vocabulary fitted on the training set
X_test = vectorizer.transform(X_test)

In [None]:
# inspect a sample of ten features (positions 1000-1009) of the document-term matrix
vectorizer.get_feature_names_out()[1000:1010]

array(['ac_help', 'ac_host', 'ac_hot', 'ac_howev', 'ac_huge', 'ac_kept',
       'ac_live', 'ac_live_room', 'ac_make', 'ac_much'], dtype=object)

In [None]:
# check the number of unique terms in the vocabulary
vectorizer.get_feature_names_out().shape

(302023,)

In [None]:
# number of negative (label 0) documents in the training set: total minus positives
# NOTE(review): `difference` is not referenced again in the visible code
difference = y_train.shape[0]-y_train.sum()

In [None]:
# number of training labels (training and validation sets combined)
y_train.shape

(528516,)

### Apply oversampling to tackle class imbalance 

##### Oversampling

Oversample the minority class so that it has more observations: create enough additional negative documents that half of the training documents have negative sentiment.

In [None]:
# oversample the minority class with SMOTE until both classes are equally frequent
# NOTE(review): the resampling runs on the whole training matrix before the
# later GridSearchCV call, so the cross-validation folds there are drawn from
# already oversampled data and the CV scores may be optimistic compared with
# the test set. Consider resampling inside each fold instead (e.g. with an
# imblearn Pipeline) -- TODO confirm.
oversampler = SMOTE(sampling_strategy='minority', random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# check new ratio of sentiment labels (share of positive documents after oversampling)
print(y_train.sum()/y_train.shape[0])

0.5


### Build NB Classifier

First, build the Naive Bayes classifier with Laplace smoothing (the default `alpha=1`).

In [None]:
# build Multinomial NB classifier; fit_prior=True estimates the class priors
# from the training labels rather than fixing a uniform prior (after the
# 50/50 oversampling the estimated priors come out equal anyway)
nb_classifier = MultinomialNB(fit_prior=True)

# train the classifier on the oversampled document-term matrix
nb_classifier.fit(X_train, y_train)

In [None]:
# predict on the test set (which was not oversampled)
y_pred = nb_classifier.predict(X_test)

In [None]:
# report accuracy, then the macro-averaged precision, recall and F1
print("Accuracy: ", accuracy_score(y_test, y_pred))
macro_metrics = (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1: ", f1_score),
)
for title, metric in macro_metrics:
    print(title, metric(y_test, y_pred, average='macro'))

Accuracy:  0.9907665993082518
Precision:  0.7623896407287578
Recall:  0.895457269633269
F1:  0.8150273701977078


In [None]:
# look at the classification report (per-class precision, recall, F1 and support)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.80      0.63      1328
           1       1.00      0.99      1.00    130801

    accuracy                           0.99    132129
   macro avg       0.76      0.90      0.82    132129
weighted avg       0.99      0.99      0.99    132129



In [None]:
# look at the confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y_test, y_pred)

array([[  1060,    268],
       [   952, 129849]])

In [None]:
# list, for each label, the 20 terms with the highest conditional
# log-probability (the terms most probable within that class, not necessarily
# the most discriminative ones); within each list the most probable term is last
# the vocabulary array is loop-invariant, so fetch it once before the loop
feature_names = vectorizer.get_feature_names_out()
for i, label in enumerate(nb_classifier.classes_):
    # argsort is ascending, so the last 20 indices are the largest log-probabilities
    top = np.argsort(nb_classifier.feature_log_prob_[i])[-20:]
    print(f"Label {label}: {', '.join(feature_names[top])}")
    print()

Label 0: night, good, bed, check, airbnb, get, would, us, clean, locat, host, place, no, room, stay, veri, but, apart, thi, not

Label 1: good, perfect, everyth, us, well, but, walk, love, would, thi, nice, recommend, clean, host, place, apart, locat, great, stay, veri



##### Grid-search to optimize alpha value

In [None]:
# build a fresh Naive Bayes classifier to serve as the estimator for the grid search
# (this rebinds nb_classifier, replacing the model fitted above)
nb_classifier = MultinomialNB()

In [None]:
# candidate hyperparameters for the search
param_grid = {
    'alpha': [1, 5, 10, 15],  # smoothing parameter for Naive Bayes
    'fit_prior': [True],
}

# 5-fold cross-validated grid search scored by macro-averaged F1;
# the best candidate is refit on the full training data
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    refit=True,
    return_train_score=True,
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END alpha=1, fit_prior=True;, score=(train=0.977, test=0.968) total time=   0.9s
[CV 2/5] END alpha=1, fit_prior=True;, score=(train=0.935, test=0.936) total time=   0.9s
[CV 3/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.975) total time=   0.9s
[CV 4/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.974) total time=   0.9s
[CV 5/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.975) total time=   0.9s
[CV 1/5] END alpha=5, fit_prior=True;, score=(train=0.973, test=0.968) total time=   0.9s
[CV 2/5] END alpha=5, fit_prior=True;, score=(train=0.930, test=0.931) total time=   0.9s
[CV 3/5] END alpha=5, fit_prior=True;, score=(train=0.969, test=0.969) total time=   0.9s
[CV 4/5] END alpha=5, fit_prior=True;, score=(train=0.969, test=0.969) total time=   0.9s
[CV 5/5] END alpha=5, fit_prior=True;, score=(train=0.969, test=0.970) total time=   0.9s
[CV 1/5] END alpha=10, fit_prior=True;, 

In [None]:
# tabulate each candidate's parameters next to its mean training and validation F1
cv_results = grid_search.cv_results_
params_df = pd.DataFrame(cv_results["params"])
train_f1_df = pd.DataFrame(cv_results["mean_train_score"], columns=["Training Mean F1 Score"])
val_f1_df = pd.DataFrame(cv_results["mean_test_score"], columns=["Validation Mean F1 Score"])
pd.concat([params_df, train_f1_df, val_f1_df], axis=1)

Unnamed: 0,alpha,fit_prior,Training Mean F1 Score,Validation Mean F1 Score
0,1,True,0.967009,0.965484
1,5,True,0.962138,0.961342
2,10,True,0.953947,0.953316
3,15,True,0.948425,0.94784


In [None]:
# print the best hyperparameters and the corresponding mean cross-validated macro F1
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Hyperparameters:  {'alpha': 1, 'fit_prior': True}
Best F1 Score:  0.9654838887731548


In [None]:
# predict on the test set with the best estimator, refit by the grid search (refit=True)
y_pred = grid_search.predict(X_test)

In [None]:
# look at the classification report, printed with four decimal places
report = classification_report(y_test, y_pred, digits=4)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0     0.5268    0.7982    0.6347      1328
           1     0.9979    0.9927    0.9953    130801

    accuracy                         0.9908    132129
   macro avg     0.7624    0.8955    0.8150    132129
weighted avg     0.9932    0.9908    0.9917    132129



In [None]:
# look at the confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y_test, y_pred)

array([[  1060,    268],
       [   952, 129849]])