##### Import Libraries

In [None]:
import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt

In [None]:
from imblearn import over_sampling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Plot the confusion matrix and print sensitivity and specificity.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Label order used for the matrix rows/columns and for the
            tick labels of the plot. The printed metrics read index 0 and
            index 1 of the matrix, so two labels are expected.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot()
    plt.show()

    # Each metric is computed and printed in turn, sensitivity first.
    for template, metric in (
        ("sensitivity (true positive): {}", cal_sensitivity),
        ("specificity (true negative): {}", cal_specificity),
    ):
        print(template.format(metric(matrix)))

def cal_sensitivity(cm):
    """Return sensitivity, TP / (FN + TP), rounded to two decimals.

    Args:
        cm: Confusion matrix indexable as ``cm[row, col]``; row 1 is read
            as the positive class (``cm[1, 0]`` = FN, ``cm[1, 1]`` = TP).
    """
    false_neg, true_pos = cm[1, 0], cm[1, 1]
    actual_positives = float(false_neg + true_pos)
    return round(true_pos / actual_positives, 2)

def cal_specificity(cm):
    """Return specificity, TN / (TN + FP), rounded to two decimals.

    Args:
        cm: Confusion matrix indexable as ``cm[row, col]``; row 0 is read
            as the negative class (``cm[0, 0]`` = TN, ``cm[0, 1]`` = FP).
    """
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    actual_negatives = float(true_neg + false_pos)
    return round(true_neg / actual_negatives, 2)

def evaluate_model(y_true, y_pred, labels):
    """Print the classification report, then plot the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Label order forwarded to ``plot_confusion_matrix`` (the
            classification report itself is computed without it).
    """
    report_heading = "Classification Report:\n"
    matrix_heading = "Confusion Matrix:\n"

    print(report_heading)
    print(classification_report(y_true, y_pred))
    print()

    print(matrix_heading)
    plot_confusion_matrix(y_true, y_pred, labels)

##### Read Data

In [None]:
# Load the processed reviews (gzip-compressed, records-oriented JSON).
df_reviews = pd.read_json(
    '../../../data/processed/reviews.json.gz',
    compression="gzip",
    orient="records",
)

In [None]:
# Preview the first five rows as a quick sanity check of the load.
df_reviews.head(n=5)

##### Splitting Dataset into Train and Test Sets

In [None]:
# Keep features and target as single-column DataFrames; the column names
# are used again when the final arrays are extracted further down.
x = df_reviews.loc[:, ['cleaned_review']]
y = df_reviews.loc[:, ['sentiment']]

# 75 % train / 25 % test, seeded for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

##### Handle Imbalanced Data

In [None]:
# Relative class frequencies of the training labels before resampling.
class_share_before = y_train.value_counts(normalize=True)
print("Distribution of +/- review sentiment before oversampling: \n{}".format(class_share_before))

In [None]:
# Oversample the training split only; the test split is left untouched.
ros = over_sampling.RandomOverSampler(random_state=42)

resampled_train = ros.fit_resample(x_train, y_train)
x_train_resampled, y_train_resampled = resampled_train

In [None]:
# Relative class frequencies of the training labels after resampling.
class_share_after = y_train_resampled.value_counts(normalize=True)
print("Distribution of +/- review sentiment after oversampling: \n{}".format(class_share_after))

##### Feature Extraction + Modeling

In [None]:
# Pull the text and label columns out of the single-column frames.
# Training data comes from the resampled split, test data from the original split.
x_train_final, y_train_final = (
    x_train_resampled.loc[:, 'cleaned_review'].values,
    y_train_resampled.loc[:, 'sentiment'].values,
)
x_test_final, y_test_final = (
    x_test.loc[:, 'cleaned_review'].values,
    y_test.loc[:, 'sentiment'].values,
)

In [None]:
def create_pipelines_and_search_spaces(vectorizers, estimators):
    """Build one pipeline and one combined search space per vectorizer/estimator pair.

    Args:
        vectorizers: Mapping of name -> ``{'model': ..., 'search_spaces': [dict, ...]}``.
        estimators: Mapping of name -> ``{'model': ..., 'search_spaces': [dict, ...]}``.

    Returns:
        tuple: ``(pipelines, search_spaces)``, two parallel lists ordered
        vectorizer-major. ``pipelines[i]`` is a ``Pipeline`` with the steps
        ``'vect'`` and ``'clf'``; ``search_spaces[i]`` holds the union of every
        vectorizer space with every estimator space for that pair (estimator
        entries win if a key appears in both).
    """
    # Every (vectorizer config, estimator config) combination, in insertion order.
    pairs = [
        (vect_cfg, est_cfg)
        for vect_cfg in vectorizers.values()
        for est_cfg in estimators.values()
    ]

    pipelines = [
        Pipeline([('vect', vect_cfg['model']), ('clf', est_cfg['model'])])
        for vect_cfg, est_cfg in pairs
    ]

    search_spaces = [
        [
            vect_space | est_space
            for vect_space in vect_cfg['search_spaces']
            for est_space in est_cfg['search_spaces']
        ]
        for vect_cfg, est_cfg in pairs
    ]

    return pipelines, search_spaces

In [None]:
def train_models(pipelines, search_spaces, x_train, y_train, x_test, y_test, random_state=42):
    """Tune, save and evaluate each pipeline with Bayesian hyperparameter search.

    For every pipeline a ``BayesSearchCV`` (5-fold cross-validation, 20
    iterations, ``n_jobs=-1``) is fitted on the training data. The best
    estimator is pickled to
    ``../../../models/sentiment_analysis/<clf>_with_<vect>.pkl`` and the
    search's predictions on the test data are passed to ``evaluate_model``.

    Args:
        pipelines: Sequence of pipelines exposing ``named_steps`` with the
            entries ``'vect'`` and ``'clf'``.
        search_spaces: Sequence aligned with ``pipelines``; element ``i`` is
            handed to ``BayesSearchCV`` as the search space of pipeline ``i``.
        x_train: Training inputs.
        y_train: Training labels.
        x_test: Test inputs.
        y_test: Test labels.
        random_state: Seed forwarded to ``BayesSearchCV`` so the search is
            seeded like the other stochastic steps in this notebook. Pass
            ``None`` for an unseeded search.

    Returns:
        None. Progress and results are printed; models are written to disk.
    """
    for i, pipeline in enumerate(pipelines):
        vect_name = pipeline.named_steps['vect'].__class__.__name__
        clf_name = pipeline.named_steps['clf'].__class__.__name__
        name = f'{clf_name} with {vect_name}'

        # Tune each model using Bayesian search and 5-fold cross-validation
        print(f'Training {name}...')
        print(pipeline)
        print(search_spaces[i])
        bayes_search = BayesSearchCV(
            estimator=pipeline,
            search_spaces=search_spaces[i],
            cv=5,
            n_iter=20,
            n_jobs=-1,
            verbose=3,
            random_state=random_state,
        )
        bayes_search.fit(x_train, y_train)
        print(f'{name} trained.\n')

        # Report the search outcome and predict on the testing set
        print(f"Best hyperparameters for {name}: {bayes_search.best_params_}")
        print(f"Best estimator: {bayes_search.best_estimator_}")
        print(f"Best score: {bayes_search.best_score_}")
        y_pred = bayes_search.predict(x_test)

        print('Saving the best model...')
        model_path = f'../../../models/sentiment_analysis/{clf_name}_with_{vect_name}.pkl'
        # Context manager guarantees the file is flushed and closed, even if
        # pickling raises part-way through.
        with open(model_path, 'wb') as model_file:
            pickle.dump(bayes_search.best_estimator_, model_file)

        evaluate_model(y_test, y_pred, labels=[0, 1])
        print('---\n')
        

In [None]:
# Candidate text vectorizers for the pipeline's 'vect' step.
# Each entry maps a display name to:
#   'model'         - the unfitted vectorizer instance placed in the pipeline
#   'search_spaces' - list of skopt dimension dicts; keys carry the 'vect__'
#                     prefix so they address the pipeline step named 'vect'
# Only TfidfVectorizer is active; the CountVectorizer entry below is kept
# commented out as an alternative that can be re-enabled.
vectorizers = {
    'TfidfVectorizer': 
    {
        'model': TfidfVectorizer(ngram_range=(1, 1)), 
        'search_spaces': 
        [
            {
                # Only max_features is actually searched; the two single-value
                # Categoricals pin stop_words and sublinear_tf to one option.
                'vect__max_features': Integer(low=1000, high=5000),
                'vect__stop_words': Categorical(['english']),
                'vect__sublinear_tf': Categorical([True])
            }
        ]
    },
    # 'CountVectorizer': 
    # {
    #     'model': CountVectorizer(ngram_range=(1, 1)), 
    #     'search_spaces': [
    #         {
    #             'vect__max_features': Integer(low=1000, high=5000),
    #             'vect__stop_words': Categorical(['english'])
    #         }            
    #     ]
    # } 
}

# Candidate classifiers for the pipeline's 'clf' step.
# Same layout as the vectorizer config: 'model' is the unfitted estimator and
# 'search_spaces' is a list of skopt dimension dicts whose keys carry the
# 'clf__' prefix so they address the pipeline step named 'clf'.
# Only LogisticRegression with the liblinear solver is active; the other
# solver spaces and estimators are kept commented out as alternatives.
estimators = {
    'LogisticRegression': 
    {
        'model': LogisticRegression(), 
        'search_spaces': 
        [
            # One dict per solver family - presumably because not every
            # penalty/solver combination is valid; confirm against the
            # scikit-learn LogisticRegression documentation before enabling.
            # {
            #     'clf__C': Real(1e-6, 1e+6, prior='log-uniform'),
            #     'clf__penalty': Categorical(['l2', None]),
            #     'clf__solver': Categorical(['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']),
            #     'clf__max_iter': Integer(low=100, high=1000),
            # },
            {
                # C is sampled log-uniformly over twelve orders of magnitude.
                'clf__C': Real(1e-6, 1e+6, prior='log-uniform'),
                'clf__penalty': Categorical(['l1', 'l2']),
                'clf__solver': Categorical(['liblinear']),
                #'clf__max_iter': Integer(low=100, high=1000),
            },
            # {
            #     'clf__C': Real(1e-6, 1e+6, prior='log-uniform'),
            #     'clf__penalty': Categorical(['elasticnet', 'l1', 'l2', None]),
            #     'clf__solver': Categorical(['saga']),
            #     'clf__max_iter': Integer(low=100, high=1000),
            # }
        ]
    },
    # 'RandomForestClassifier': 
    # {
    #     'model': RandomForestClassifier(), 
    #     'search_spaces': 
    #     [
    #         {
    #             'clf__n_estimators': Integer(50, 500),
    #             'clf__max_depth': Integer(5, 50),
    #             'clf__max_features': Integer(5, 50),
    #             'clf__min_samples_split': Integer(2, 20),
    #             'clf__min_samples_leaf': Integer(1, 10)
    #         }            
    #     ]
    # },
    # 'MultinomialNB': 
    # {
    #     'model': MultinomialNB(), 
    #     'search_spaces': 
    #     [
    #         {
    #             'clf__alpha': Real(low=1e-6, high=1e+6, prior='log-uniform'),
    #             'clf__fit_prior': Categorical([True, False])
    #         }            
    #     ]

    # },
    # 'DecisionTreeClassifier': 
    # {
    #     'model': DecisionTreeClassifier(), 
    #     'search_spaces': 
    #     [
    #         {
    #             'clf__criterion': Categorical(['gini', 'entropy']),
    #             'clf__max_depth': Integer(5, 50),
    #             'clf__min_samples_split': Integer(2, 20),
    #             'clf__min_samples_leaf': Integer(1, 10)
    #         }
    #     ]
    # }
}

In [None]:
# Expand the active vectorizer/estimator configs into parallel lists of
# pipelines and search spaces, then echo both for inspection.
pipelines, search_spaces = create_pipelines_and_search_spaces(
    vectorizers,
    estimators,
)
print(pipelines, search_spaces, sep="\n")

In [None]:
# Run the search, save and evaluation loop for every configured pipeline.
train_models(
    pipelines=pipelines,
    search_spaces=search_spaces,
    x_train=x_train_final,
    y_train=y_train_final,
    x_test=x_test_final,
    y_test=y_test_final,
)