In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### Which Naive Bayes to apply- Bernoulli, Multinomial or Gaussian?
If we only care whether a word is present in a document or not, we use Bernoulli NB. If the frequency of a word in a document matters, and not just its presence, we use Multinomial NB. Gaussian NB is used when the features are continuous and real-valued (they can take any real value).

For this problem each feature is the number of times a word occurs in a review, i.e. a non-negative integer count from 0 upwards, therefore we should use Multinomial NB.

In [None]:
from sklearn.naive_bayes import MultinomialNB

### Train-test splitter for dataset sorted wrt time

In [None]:
### X and y, before being passed to this function must be converted to numpy array or must be sparse matrices\
### for consistency throughout the program.

### y will be a numpy vector because y-values originally are stored in a column of the original dataframe (ie, it\
### will be of type pd.Series. When converted into np-array, it will become a numpy 1D array, i.e a column vector)

def train_test_splitter(X, y, test_size = 0.2):
    """Split time-ordered data into a leading train part and a trailing test part.

    No shuffling is done. The first ``round((1 - test_size) * n_rows)`` rows
    become the training set and all remaining rows become the test set, so
    every row lands in exactly one of the two sets.

    Parameters
    ----------
    X : numpy array or sparse matrix
        Features, one sample per row. A 1-D array is reshaped to a single
        column of shape (n_rows, 1) before slicing.
    y : array-like
        Targets, one per row of ``X``. Converted with ``np.asarray`` and
        reshaped to a column of shape (n_rows, 1).
    test_size : float
        Fraction of the rows that goes into the test set.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Note the order: both training parts come first.
    """
    n_rows = X.shape[0]

    # Number of training rows == index of the first test row. The slices below
    # are half-open, so no "+ 1" is needed; adding one would move a row from
    # the test set into the training set.
    split_index = round((1 - test_size) * n_rows)

    if(X.ndim == 1):
        X = X.reshape((n_rows, 1))

    # np.asarray lets callers pass a list or a pandas Series as well as an ndarray.
    y = np.asarray(y)
    y = y.reshape((y.shape[0], 1))

    X_train = X[:split_index, :]
    X_test = X[split_index:, :]

    y_train = y[:split_index]
    y_test = y[split_index:]

    return X_train, y_train, X_test, y_test

### Hyperparameter tuning

#### save best NB model
This function can be reused. It can be added to any program to save intermediate best model trained during hyperparameter tuning (although it can save any python object in general).

In [None]:
def save_best_model(model):
    """Pickle ``model`` to 'pickle_files/nb_best.pkl', replacing any earlier file.

    Any picklable Python object can be passed. The 'pickle_files' directory is
    created if it does not exist yet, so the helper also works when it is
    called before anything else has set the directory up.
    """
    os.makedirs('pickle_files', exist_ok = True)

    with open('pickle_files/nb_best.pkl', 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

In [None]:
def save_f1_scores_and_y_pred_best(f1scores, y_pred):
    """Pickle the tuning results next to the best model.

    ``f1scores`` is written to 'pickle_files/f1scores.pkl' and ``y_pred`` to
    'pickle_files/y_pred_best.pkl'; existing files are overwritten. The
    'pickle_files' directory is created if it does not exist yet.
    """
    os.makedirs('pickle_files', exist_ok = True)

    with open('pickle_files/f1scores.pkl', 'wb') as pkl_file:
        pickle.dump(f1scores, pkl_file)

    with open('pickle_files/y_pred_best.pkl', 'wb') as pkl_file:
        pickle.dump(y_pred, pkl_file)

In [None]:
def plot_f1scores_vs_alpha(f1scores, alphas):
    """Plot F1 score against alpha and highlight the best point.

    All points are joined by a line; every point except the best one is drawn
    as a red dot and the point with the highest F1 score is drawn as a yellow
    diamond whose legend entry shows its (alpha, f1score) pair. When several
    points share the highest score, the first one is highlighted.

    Parameters
    ----------
    f1scores : sequence of float
        One score per entry of ``alphas``, in the same order.
    alphas : sequence of float
        The alpha values that were evaluated.

    Neither argument is modified.
    """
    # Work on private copies: the caller's lists must keep the best entry.
    f1scores = list(f1scores)
    alphas = list(alphas)

    max_f1score = max(f1scores)
    best_index = f1scores.index(max_f1score)
    alpha_corr_to_max_f1score = alphas[best_index]

    plt.plot(alphas, f1scores)
    plt.title('F1-scores vs alpha')
    plt.xlabel('alpha')
    plt.ylabel('f1scores')

    # Drop the best point by position (not by value) so that both lists stay aligned
    # even if the same alpha value occurs more than once.
    other_alphas = alphas[:best_index] + alphas[best_index + 1:]
    other_f1scores = f1scores[:best_index] + f1scores[best_index + 1:]
    plt.scatter(other_alphas, other_f1scores, c = 'red')

    plt.scatter(alpha_corr_to_max_f1score, max_f1score, c = 'yellow', s = 50, edgecolors = 'black', marker = 'D',
               label = '(' + str(alpha_corr_to_max_f1score) + ', ' + str(max_f1score) + ')')

    plt.legend()
    plt.show()

In [None]:
def plot_conf_matrix(y_true, y_pred):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Rows are the true labels and columns the predicted labels, which is the
    layout returned by sklearn's ``confusion_matrix``.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    conf_matrix_df = pd.DataFrame(conf_matrix)

    # fmt='d' prints the counts as plain integers; the default annotation
    # format abbreviates large counts to scientific notation (e.g. 1.2e+04).
    ax = sns.heatmap(conf_matrix_df, annot = True, fmt = 'd')
    ax.set_xlabel('predicted label')
    ax.set_ylabel('true label')
    plt.show()

#### tune_alpha (laplace smoothing parameter, pseudocount)

In [None]:
def tune_alpha(alphas, X_train, y_train, X_cv, y_cv):
    """Pick the MultinomialNB smoothing parameter with the best F1 score on the CV set.

    One model per value in ``alphas`` is fitted on the training data and scored
    with ``f1_score`` on the cross-validation data. The best model so far is
    pickled during the search. The list of F1 scores and the predictions of the
    best model are pickled at the end, then the score curve and the confusion
    matrix of the best model are plotted.

    If all three pickle files of an earlier completed run exist, nothing is
    trained: the pickled results are loaded, plotted and returned. The cache
    is not keyed on the arguments, so delete the files in 'pickle_files' to
    re-tune with other alphas or other data.

    Parameters
    ----------
    alphas : list of float
        Candidate smoothing values; not modified.
    X_train, y_train : training features and labels (``y_train`` is flattened
        with ``ravel()`` before fitting).
    X_cv, y_cv : cross-validation features and labels.

    Returns
    -------
    The fitted MultinomialNB with the highest CV F1 score (the first one on
    ties), or the previously pickled model on a cache hit.

    Raises
    ------
    ValueError
        If ``alphas`` is empty and there is no cached result.
    """
    model_path = 'pickle_files/nb_best.pkl'
    f1scores_path = 'pickle_files/f1scores.pkl'
    y_pred_path = 'pickle_files/y_pred_best.pkl'

    os.makedirs('pickle_files', exist_ok = True)

    # The model is pickled during the search but the scores only at its end, so
    # an interrupted run leaves the model file alone. Treat the cache as valid
    # only when all three files are present.
    cache_is_complete = all(os.path.exists(path) for path in (model_path, f1scores_path, y_pred_path))

    if cache_is_complete:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files written by this notebook.
        with open(model_path, 'rb') as pkl_file:
            multinomial_nb = pickle.load(pkl_file)

        with open(f1scores_path, 'rb') as pkl_file:
            f1scores = pickle.load(pkl_file)

        with open(y_pred_path, 'rb') as pkl_file:
            y_pred_best = pickle.load(pkl_file)

        # Copies are passed so that the plotting helper can never alter the caller's alphas.
        plot_f1scores_vs_alpha(list(f1scores), list(alphas))
        plot_conf_matrix(y_cv, y_pred_best)

        return multinomial_nb

    f1scores = []
    f1score_max = None  # None (not 0) so that a best model exists even if every score is 0
    y_pred_best = None
    best_multinomial_nb = None

    for alpha in alphas:
        multinomial_nb = MultinomialNB(alpha = alpha, fit_prior = True, class_prior = None)
        multinomial_nb = multinomial_nb.fit(X_train, y_train.ravel())

        y_pred = multinomial_nb.predict(X_cv)

        f1score = f1_score(y_cv, y_pred)
        f1scores.append(f1score)

        if f1score_max is None or f1score > f1score_max:
            f1score_max = f1score
            y_pred_best = y_pred ### for building the confusion matrix

            ### this function will overwrite the previous best model
            best_multinomial_nb = multinomial_nb
            save_best_model(best_multinomial_nb)

    if best_multinomial_nb is None:
        raise ValueError('alphas must contain at least one value')

    save_f1_scores_and_y_pred_best(f1scores, y_pred_best)
    plot_f1scores_vs_alpha(list(f1scores), list(alphas))
    plot_conf_matrix(y_cv, y_pred_best)

    return best_multinomial_nb

### Import dataset

In [None]:
import sqlite3

In [None]:
db_connection = sqlite3.connect('database.sqlite')

In [None]:
df = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [None]:
df.shape

In [None]:
scores = df['Score']

### Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
A score > 3 is considered positive and a score < 3 is considered negative.

In [None]:
scores[:6]

In [None]:
# Binarise the ratings: anything below 3 becomes 0 (negative), everything else 1 (positive).
scores = [0 if score < 3 else 1 for score in scores]

In [None]:
scores[:6]

In [None]:
df['Score'] = scores

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries with the same timestamp, only the first one is kept and the rest are removed: multiple entries at the same timestamp are most likely the same review posted for different variants of one product, each of which has its own product id.


In [None]:
df.duplicated(subset = ['UserId', 'Time']).sum()

In [None]:
# Keep only the first row of every (UserId, Time) pair; df itself is left untouched.
duplicate_key_columns = ['UserId', 'Time']
deduplicated_df = df.drop_duplicates(subset = duplicate_key_columns, keep = 'first')

In [None]:
deduplicated_df.shape

#### 2. Taking first 100k rows (only for the purpose of finishing this assignment) after sorting wrt Time

In [None]:
df = deduplicated_df.sort_values(by = 'Time')

In [None]:
# Cap the dataset at its first 100000 rows (all columns are kept).
df = df.head(100000)

In [None]:
scores = df['Score']

#### 3. Extracting the data needed (corpus) and removing html and punctuations

In [None]:
corpus = df['Text']

In [None]:
# dataset cleaners

import re

def remove_html(sentence):
    """Return ``sentence`` with every HTML tag replaced by a single space.

    A tag is a '<', one or more characters other than '>', and the closing
    '>'. Text between and after tags is kept. HTML entities such as '&amp;'
    are not touched.
    """
    # '[^>]+' stops at the first '>', so each tag is matched on its own. A
    # greedy '.*' with an optional '>' would instead swallow everything from
    # the first '<' to the end of the line, deleting the review text after it.
    html_tag_re_obj = re.compile('<[^>]+>')
    sentence = re.sub(html_tag_re_obj, ' ', sentence)
    return sentence

def remove_punctuations(sentence):
    """Return ``sentence`` with every character that is not an ASCII letter replaced by a space.

    Digits, punctuation (apostrophes included) and whitespace all become a
    single space each; the length of the string is unchanged.
    """
    non_letter_pattern = re.compile(r'[^a-zA-Z]')
    return non_letter_pattern.sub(' ', sentence)

In [None]:
# Clean every document in two chained steps: strip the HTML tags first, then
# replace the remaining non-letter characters with spaces. The second step
# must receive the output of the first one; feeding it the raw document would
# discard the HTML removal and leave tag names such as "br" in the text.
cleaned_corpus = []
for doc in corpus:
    html_free_doc = remove_html(doc)
    cleaned_corpus.append(remove_punctuations(html_free_doc))

#### 3. Removing stop words

In [None]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

# Print, for each negation, the number of cleaned documents that contain it.
# NOTE(review): the test is a plain substring match, so "not" also matches
# inside longer words. cleaned_corpus has been through remove_punctuations,
# which replaces apostrophes with spaces, so the two contractions cannot
# occur in it.
for negation in ("not", "don't", "didn't"):
    count = 0
    for doc in cleaned_corpus:
        count += negation in doc

    print(count)

Setting the corpus to cleaned_corpus

In [None]:
corpus = cleaned_corpus

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch the English stop-word list.
# NOTE(review): this rebinds the name `stopwords` from the object imported
# from nltk.corpus to the value returned by .words('english'). Re-running this
# cell without re-running the import cell first will therefore fail, because
# the name no longer refers to the nltk object.
stopwords = stopwords.words('english')

In [None]:
stopwords = set(stopwords)

In [None]:
# Keep "not" in the reviews: it carries the negation. discard() is used
# instead of remove() so that running this cell a second time is a no-op
# rather than a KeyError.
stopwords.discard('not')

In [None]:
'not' in stopwords

In [None]:
# Lower-case every document so that matching against the stop-word set is case-insensitive.
corpus = [doc.lower() for doc in corpus]

In [None]:
corpus[:1]

In [None]:
### filtered_corpus = corpus with docs having no stop words
### each doc is split on whitespace, stop words are dropped and the rest is re-joined with single spaces

filtered_corpus = [
    ' '.join(word for word in doc.split() if word not in stopwords)
    for doc in corpus
]

In [None]:
filtered_corpus[:2]

#### 4. Stemming the words (SnowballStemmer)

In [None]:
from nltk.stem import SnowballStemmer

In [None]:
stemmer = SnowballStemmer('english')

In [None]:
# Stem every whitespace-separated word of every document and re-join the stems with single spaces.
stemmed_filtered_corpus = [
    ' '.join(stemmer.stem(word) for word in doc.split())
    for doc in filtered_corpus
]

In [None]:
stemmed_filtered_corpus[:3]

In [None]:
corpus = stemmed_filtered_corpus

### Splitting into train, cv and test sets

In [None]:
type(corpus)

In [None]:
### train_test splitter takes only numpy arrays and sparse matrices as arguments
corpus = np.array(corpus)
scores = np.array(scores)

In [None]:
X_train_nought, y_train_nought, X_test, y_test = train_test_splitter(corpus, scores, test_size = 0.2)

In [None]:
X_train, y_train, X_cv, y_cv = train_test_splitter(X_train_nought, y_train_nought, test_size = 0.2)

In [None]:
print('X_train: ' + str(type(X_train)), 'y_train: ' + str(type(y_train)), 'X_cv: '+str(type(X_cv)), \
      'y_cv: ' + str(type(y_cv)))

In [None]:
X_train.shape

### 1. Bag of Words (CountVectorizer)
##### Note: The vectorizer must be fitted on the training set only; the cross-validation and test sets are only transformed with the fitted vectorizer!

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build (or reload) the bag-of-words document-term matrix of the training set
# together with the fitted CountVectorizer.

# makedirs also creates the parent 'pickle_files' directory, which does not
# exist yet on a clean run of the notebook.
os.makedirs('pickle_files/bow_pickles', exist_ok = True)

# Both pickles are needed, so both must exist for the cache to be usable.
bow_cache_is_complete = os.path.exists('pickle_files/bow_pickles/document_term_matrix.pkl') and \
                        os.path.exists('pickle_files/bow_pickles/count_vectorizer.pkl')

if bow_cache_is_complete:
    # NOTE(review): the cache is not keyed on X_train; delete the pickles after
    # changing the preprocessing or the split. pickle.load can execute
    # arbitrary code, so only load files written by this notebook.
    with open('pickle_files/bow_pickles/document_term_matrix.pkl', 'rb') as dtm_pickle:
        X_train_bow = pickle.load(dtm_pickle)
    with open('pickle_files/bow_pickles/count_vectorizer.pkl', 'rb') as vectorizer:
        count_vectorizer = pickle.load(vectorizer)

else:
    count_vectorizer = CountVectorizer()

#     the vectorizer takes a 1D array (m,). train_test_splitter() returns (m,n) array. ravel() converts it into (m,)
#     fit_transform() learns the vocabulary and builds the document_term_matrix (saved as X_train_bow) in one pass
    X_train_bow = count_vectorizer.fit_transform(X_train.ravel())
    with open('pickle_files/bow_pickles/document_term_matrix.pkl', 'wb') as dtm_pickle:
        pickle.dump(X_train_bow, dtm_pickle)
    with open('pickle_files/bow_pickles/count_vectorizer.pkl', 'wb') as vectorizer:
        pickle.dump(count_vectorizer, vectorizer)

In [None]:
type(X_train_bow)

In [None]:
X_train_bow.shape

### 2. TfIdf (TfidfVectorizer)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build (or reload) the TF-IDF document-term matrix of the training set
# together with the fitted TfidfVectorizer.

# makedirs also creates the parent 'pickle_files' directory, which does not
# exist yet on a clean run of the notebook.
os.makedirs('pickle_files/tfidf_pickles', exist_ok = True)

# Both pickles are needed, so both must exist for the cache to be usable.
tfidf_cache_is_complete = os.path.exists('pickle_files/tfidf_pickles/document_term_matrix.pkl') and \
                          os.path.exists('pickle_files/tfidf_pickles/tfidf_vectorizer.pkl')

if tfidf_cache_is_complete:
    # NOTE(review): the cache is not keyed on X_train; delete the pickles after
    # changing the preprocessing or the split. pickle.load can execute
    # arbitrary code, so only load files written by this notebook.
    with open('pickle_files/tfidf_pickles/document_term_matrix.pkl', 'rb') as dtm_pickle:
        X_train_tfidf = pickle.load(dtm_pickle)
    with open('pickle_files/tfidf_pickles/tfidf_vectorizer.pkl', 'rb') as vectorizer:
        tfidf_vectorizer = pickle.load(vectorizer)

else:
    tfidf_vectorizer = TfidfVectorizer()
    # ravel() turns the (m, 1) array returned by train_test_splitter() into the
    # 1D array of documents that the vectorizer expects.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.ravel())
    with open('pickle_files/tfidf_pickles/document_term_matrix.pkl', 'wb') as dtm_pickle:
        pickle.dump(X_train_tfidf, dtm_pickle)
    with open('pickle_files/tfidf_pickles/tfidf_vectorizer.pkl', 'wb') as vectorizer:
        pickle.dump(tfidf_vectorizer, vectorizer)

In [None]:
type(X_train_tfidf)

In [None]:
X_train_tfidf.shape

### Naive Bayes for BoW (CountVectorizer)

In [None]:
X_cv_bow = count_vectorizer.transform(X_cv.ravel())
X_test_bow = count_vectorizer.transform(X_test.ravel())

In [None]:
print(y_train.shape, y_cv.shape)

In [None]:
nb_model = tune_alpha(alphas = [0.00001, 0.0001, 0.001, 0.01, 1, 10, 100], \
                   X_train = X_train_bow, y_train = y_train, X_cv = X_cv_bow, y_cv = y_cv)

#### Finding TPR, FPR, TNR, FNR

In [None]:
### loading the y_pred predicted by the best alpha value (saved as pickle by tune_alpha() method)
with open('pickle_files/y_pred_best.pkl', 'rb') as pkl_file:
    y_pred_best = pickle.load(pkl_file)

In [None]:
tn, fp, fn, tp = confusion_matrix(y_cv, y_pred_best).ravel()

In [None]:
print(tn, fp, fn, tp)

In [None]:
# Rates derived from the confusion-matrix counts. Each rate is normalised by
# the number of samples of the corresponding actual class, so tpr + fnr == 1
# and fpr + tnr == 1.
tpr = tp/(tp+fn) # true positives per actual positive
fpr = fp/(tn+fp) # false positives per actual negative
fnr = fn/(tp+fn) # false negatives per actual positive
tnr = tn/(tn+fp) # true negatives per actual negative

# tpr is reported together with the other three rates.
print('tpr: ' + str(tpr))
print('fpr: ' + str(fpr))
print('fnr: ' + str(fnr))
print('tnr: ' + str(tnr))