In [9]:
import os
import pickle
import warnings

import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, f1_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

warnings.filterwarnings('ignore')

In [10]:
# Load the processed reviews from the gzip-compressed, records-oriented JSON dump.
df_reviews = pd.read_json(
    '../../../data/processed/reviews.json.gz',
    orient="records",
    compression="gzip",
)

In [11]:
# Features and target are kept as single-column DataFrames at this stage.
x = df_reviews.loc[:, ['cleaned_review']]
y = df_reviews.loc[:, ['sentiment']]

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
# NOTE(review): the split is not stratified although the target is imbalanced
# (see the class distribution printed in the next cell); consider passing
# `stratify=` -- confirm against other experiments that share this split first.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [12]:
# Relative class frequencies of the training target.
print(f"Distribution of +/- review sentiment: \n{y_train.value_counts(normalize=True)}")

Distribution of +/- review sentiment: 
sentiment
1            0.883957
0            0.116043
dtype: float64


In [13]:
# Pull the single text / label column out of each split as a flat array,
# which is the form passed to the pipelines further down.
x_train_final, x_test_final = (
    split['cleaned_review'].values for split in (x_train, x_test)
)
y_train_final, y_test_final = (
    split['sentiment'].values for split in (y_train, y_test)
)

In [14]:
# Text vectorizers to benchmark. Each entry pairs an unfitted estimator with the
# search spaces for its `vect__*` pipeline parameters. Every dimension here is a
# single-value Categorical, so the vectorizer settings are effectively pinned.
vectorizers = {
    'tfidf_vectorizer': dict(
        model=TfidfVectorizer(ngram_range=(1, 2)),
        search_spaces=[
            dict(
                vect__min_df=Categorical([3]),
                vect__max_df=Categorical([0.99]),
                vect__sublinear_tf=Categorical([True]),
                vect__max_features=Categorical([12000]),
            ),
        ],
    ),
    'count_vectorizer': dict(
        model=CountVectorizer(ngram_range=(1, 2)),
        search_spaces=[
            dict(
                vect__min_df=Categorical([3]),
                vect__max_df=Categorical([0.99]),
                vect__max_features=Categorical([12000]),
            ),
        ],
    ),
}

# Classifiers to benchmark. Each entry pairs an unfitted estimator with one or
# more search spaces for its `clf__*` pipeline parameters. Several spaces per
# classifier are used where only some parameter combinations are valid together
# (e.g. solver/penalty for LogisticRegression, penalty/loss/dual for LinearSVC).
#
# Every estimator that accepts `random_state` is seeded with 42, matching the
# seed used for the train/test split, the over-sampler and the Bayesian search,
# so that repeated runs of the notebook produce the same models.
# MultinomialNB takes no `random_state`.
classifiers = {
    'logistic_regression':
    {
        'model': LogisticRegression(random_state=42),
        'search_spaces':
        [
            {
                # Solvers that only support the L2 penalty.
                'clf__C': Real(1e-6, 1e+6, prior='log-uniform'),
                'clf__penalty': Categorical(['l2']),
                'clf__solver': Categorical(['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']),
                'clf__max_iter': Integer(low=100, high=1000),
            },
            {
                # liblinear supports both L1 and L2.
                'clf__C': Real(1e-6, 1e+6, prior='log-uniform'),
                'clf__penalty': Categorical(['l1', 'l2']),
                'clf__solver': Categorical(['liblinear']),
                'clf__max_iter': Integer(low=100, high=1000),
            }
        ]
    },
    'linear_svc':
    {
        'model': LinearSVC(random_state=42),
        'search_spaces':
        [
            {
                # L2 penalty + squared hinge: primal or dual formulation.
                'clf__C': Real(1e-6, 1e+6, 'log-uniform'),
                'clf__penalty': Categorical(['l2']),
                'clf__loss': Categorical(['squared_hinge']),
                'clf__dual': Categorical([True, False]),
                'clf__tol': Real(1e-6, 1e-2, 'log-uniform'),
                'clf__class_weight': Categorical(['balanced']),
                'clf__max_iter': Integer(low=100, high=1000),
            },
            {
                # L1 penalty + squared hinge: primal formulation only.
                'clf__C': Real(1e-6, 1e+6, 'log-uniform'),
                'clf__penalty': Categorical(['l1']),
                'clf__loss': Categorical(['squared_hinge']),
                'clf__dual': Categorical([False]),
                'clf__tol': Real(1e-6, 1e-2, 'log-uniform'),
                'clf__class_weight': Categorical(['balanced']),
                'clf__max_iter': Integer(low=100, high=1000),
            },
            {
                # L2 penalty + hinge: dual formulation only.
                'clf__C': Real(1e-6, 1e+6, 'log-uniform'),
                'clf__penalty': Categorical(['l2']),
                'clf__loss': Categorical(['hinge']),
                'clf__dual': Categorical([True]),
                'clf__tol': Real(1e-6, 1e-2, 'log-uniform'),
                'clf__class_weight': Categorical(['balanced']),
                'clf__max_iter': Integer(low=100, high=1000),
            },
        ]
    },
    'multinomial_nb':
    {
        'model': MultinomialNB(),
        'search_spaces':
        [
            {
                'clf__alpha': Real(low=1e-6, high=1e+6, prior='log-uniform'),
                'clf__fit_prior': Categorical([True, False])
            }
        ]
    },
    'decision_tree':
    {
        'model': DecisionTreeClassifier(random_state=42),
        'search_spaces':
        [
            {
                'clf__criterion': Categorical(['gini', 'entropy']),
                'clf__max_depth': Integer(5, 50),
                'clf__min_samples_split': Integer(2, 20),
                'clf__min_samples_leaf': Integer(1, 10)
            }
        ]
    },
    'random_forest':
    {
        'model': RandomForestClassifier(random_state=42),
        'search_spaces':
        [
            {
                'clf__n_estimators': Integer(50, 500),
                'clf__max_depth': Integer(5, 50),
                'clf__max_features': Integer(5, 50),
                'clf__min_samples_split': Integer(2, 20),
                'clf__min_samples_leaf': Integer(1, 10)
            }
        ]
    },
}

In [15]:
def create_pipelines(vectorizers = vectorizers, classifiers = classifiers):
    """Build one over-sampling pipeline per (classifier, vectorizer) pair.

    Each pipeline has the steps ``vect`` -> ``sampler`` -> ``clf``, where the
    sampler is a fresh ``RandomOverSampler(random_state=42)``. The search spaces
    of a pair are every combination of one vectorizer space merged with one
    classifier space.

    Parameters
    ----------
    vectorizers, classifiers : dict
        Mappings of name -> ``{'model': estimator, 'search_spaces': [dict, ...]}``.
        The defaults are the module-level dicts as they existed when this
        function was defined.

    Returns
    -------
    dict
        ``'<classifier>_with_<vectorizer>'`` ->
        ``{'pipeline': imbPipeline, 'search_spaces': list of dict}``,
        ordered classifier-major, vectorizer-minor.
    """
    built = {}

    for clf_name, clf_spec in classifiers.items():
        for vect_name, vect_spec in vectorizers.items():
            steps = [
                ('vect', vect_spec['model']),
                ('sampler', RandomOverSampler(random_state=42)),
                ('clf', clf_spec['model']),
            ]

            # Cartesian product of the two lists of spaces, vectorizer-major.
            merged_spaces = [
                vect_space | clf_space
                for vect_space in vect_spec['search_spaces']
                for clf_space in clf_spec['search_spaces']
            ]

            built[f'{clf_name}_with_{vect_name}'] = {
                'pipeline': imbPipeline(steps),
                'search_spaces': merged_spaces,
            }

    return built

def comparison_table(weighted_avg_f1_score):
    """Turn a ``{model name: score}`` mapping into a one-column DataFrame.

    The mapping's keys become the index (in mapping order) and its values fill
    the single ``weighted_avg_f1_score`` column.
    """
    model_names = list(weighted_avg_f1_score.keys())
    scores = list(weighted_avg_f1_score.values())
    return pd.DataFrame({'weighted_avg_f1_score': scores}, index=model_names)

In [16]:
# Pass the configuration explicitly: the default arguments of `create_pipelines`
# were bound when the function was defined, so relying on them would silently
# reuse stale dicts if the configuration cell is edited and re-run afterwards
# without also re-running the cell that defines the function.
pipelines = create_pipelines(vectorizers=vectorizers, classifiers=classifiers)

In [17]:
# Tune, evaluate and persist every (classifier, vectorizer) pipeline.
#
# For each pipeline: run a Bayesian hyperparameter search scored by weighted F1
# with stratified 5-fold cross-validation on the training data, report the best
# configuration and the vocabulary extracted by its vectorizer, score the
# refitted best estimator on the held-out test set, and pickle that estimator.
# The cross-validated and test scores are collected per pipeline name.
f1_weighted_score_val_dict = {}
f1_weighted_score_test_dict = {}

# Create the output folder up front so that a missing directory cannot abort
# the run after the first (expensive) search has already finished.
model_dir = '../../../models/sentiment_analysis/hyperparameter_tuning_and_oversampling'
os.makedirs(model_dir, exist_ok=True)

cv=StratifiedKFold(n_splits=5)

for key in pipelines:
    pipeline = pipelines[key]['pipeline']
    search_spaces = pipelines[key]['search_spaces']

    search = BayesSearchCV(
                estimator=pipeline,
                search_spaces=search_spaces,
                scoring='f1_weighted',
                cv=cv, n_iter=10, n_jobs=-1, verbose=3, random_state=42
            )

    print(f'Train Model [{key}]:')
    print(f'Pipeline:\n{pipeline}')
    print(f'Search Spaces:\n{search_spaces}')
    search.fit(x_train_final, y_train_final)

    print(f"\nBest hyperparameters: {search.best_params_}")
    print(f"Best estimator: {search.best_estimator_}")
    print(f"Best score: {search.best_score_}")

    print('\nFeatures extracted:')
    feature_names = search.best_estimator_.named_steps['vect'].get_feature_names_out()
    print(f'Features: {feature_names}')
    print(f'Length: {len(feature_names)}')

    # The search's best score is the cross-validated weighted F1.
    f1_weighted_score_val = search.best_score_
    f1_weighted_score_val_dict[key] = f1_weighted_score_val
    print('\nEvaluation using cross validation (validation set):')
    print(f'weighted average f1 score: {f1_weighted_score_val}')

    y_pred = search.predict(x_test_final)
    f1_weighted_score_test = f1_score(y_test_final, y_pred, average='weighted')
    f1_weighted_score_test_dict[key] = f1_weighted_score_test

    print('\nEvaluation using hold-out validation (test set):')
    print(f'weighted average f1 score: {f1_weighted_score_test}')

    print('\nClassification report:')
    print(classification_report(y_test_final, y_pred, labels=[0, 1]))

    # Use a context manager so the file handle is flushed and closed
    # deterministically, even if pickling raises.
    with open(f'{model_dir}/{key}.pkl', 'wb') as model_file:
        pickle.dump(search.best_estimator_, model_file)

    print('------------------------------------------------\n')

Train Model [logistic_regression_with_tfidf_vectorizer]:
Pipeline:
Pipeline(steps=[('vect', TfidfVectorizer(ngram_range=(1, 2))),
                ('sampler', RandomOverSampler(random_state=42)),
                ('clf', LogisticRegression())])
Search Spaces:
[{'vect__min_df': Categorical(categories=(3,), prior=None), 'vect__max_df': Categorical(categories=(0.99,), prior=None), 'vect__sublinear_tf': Categorical(categories=(True,), prior=None), 'vect__max_features': Categorical(categories=(12000,), prior=None), 'clf__C': Real(low=1e-06, high=1000000.0, prior='log-uniform', transform='identity'), 'clf__penalty': Categorical(categories=('l2',), prior=None), 'clf__solver': Categorical(categories=('lbfgs', 'newton-cg', 'newton-cholesky', 'sag'), prior=None), 'clf__max_iter': Integer(low=100, high=1000, prior='uniform', transform='identity')}, {'vect__min_df': Categorical(categories=(3,), prior=None), 'vect__max_df': Categorical(categories=(0.99,), prior=None), 'vect__sublinear_tf': Categorica

In [18]:
print('Evaluation Metric for Different Models Using Validation Set:')
print(comparison_table(f1_weighted_score_val_dict))

Evaluation Metric for Different Models Using Validation Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.925589
logistic_regression_with_count_vectorizer               0.924144
linear_svc_with_tfidf_vectorizer                        0.923077
linear_svc_with_count_vectorizer                        0.920335
multinomial_nb_with_tfidf_vectorizer                    0.903994
multinomial_nb_with_count_vectorizer                    0.911690
decision_tree_with_tfidf_vectorizer                     0.872624
decision_tree_with_count_vectorizer                     0.875377
random_forest_with_tfidf_vectorizer                     0.923524
random_forest_with_count_vectorizer                     0.923587


In [19]:
print('Evaluation Metric for Different Models Using Testing Set:')
print(comparison_table(f1_weighted_score_test_dict))

Evaluation Metric for Different Models Using Testing Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.921092
logistic_regression_with_count_vectorizer               0.920522
linear_svc_with_tfidf_vectorizer                        0.919521
linear_svc_with_count_vectorizer                        0.918761
multinomial_nb_with_tfidf_vectorizer                    0.898914
multinomial_nb_with_count_vectorizer                    0.909256
decision_tree_with_tfidf_vectorizer                     0.869793
decision_tree_with_count_vectorizer                     0.875556
random_forest_with_tfidf_vectorizer                     0.919651
random_forest_with_count_vectorizer                     0.918851
