In [19]:
import pandas as pd
import pickle

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline, make_pipeline

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [20]:
df_reviews = pd.read_json('../../../data/processed/reviews.json.gz', orient="records", compression="gzip")

In [21]:
x = df_reviews[['cleaned_review']]
y = df_reviews[['sentiment']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [22]:
print("Distribution of +/- review sentiment: \n{}".format(y_train.value_counts(normalize=True)))

Distribution of +/- review sentiment: 
sentiment
1            0.883957
0            0.116043
dtype: float64


In [23]:
x_train_final = x_train['cleaned_review'].values
x_test_final = x_test['cleaned_review'].values
y_train_final = y_train['sentiment'].values
y_test_final = y_test['sentiment'].values

In [24]:
vectorizers = {
    'tfidf_vectorizer': TfidfVectorizer(),
    'count_vectorizer': CountVectorizer()
}

classifiers = {
    'logistic_regression': LogisticRegression(),
    'linear_svc': LinearSVC(),
    'multinomial_nb': MultinomialNB(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier()
}

In [25]:
def create_pipelines(vectorizers = vectorizers, classifiers = classifiers):
    pipelines = {}

    for c_key in classifiers:
        for v_key in vectorizers:
            pipelines[f'{c_key}_with_{v_key}'] = imbPipeline([
                ('vect', vectorizers[v_key]), 
                ('sampler', RandomOverSampler(random_state=42)),
                ('clf', classifiers[c_key]),
            ])

    return pipelines

def comparison_table(weighted_avg_f1_score):
    df_model = pd.DataFrame(index=weighted_avg_f1_score.keys(), columns=['weighted_avg_f1_score'])
    df_model['weighted_avg_f1_score'] = weighted_avg_f1_score.values()
    return df_model

In [26]:
pipelines = create_pipelines()

In [27]:
f1_weighted_score_val_dict = {}
f1_weighted_score_test_dict = {}

cv=StratifiedKFold(n_splits=5)

for key in pipelines:
    search = GridSearchCV(
                estimator=pipelines[key], 
                param_grid={}, 
                scoring='f1_weighted',
                cv=cv, 
                n_jobs=-1
            )
    
    print(f'Train Model [{key}]:')
    search.fit(x_train_final, y_train_final)

    print(f'\nFeatures extracted:')
    feature_names = search.best_estimator_.named_steps['vect'].get_feature_names_out()
    print(f'Features: {feature_names}')
    print(f'Length: {len(feature_names)}')

    f1_weighted_score_val = search.best_score_
    f1_weighted_score_val_dict[key] = f1_weighted_score_val
    print(f'\nEvaluation using cross validation (validation set):')
    print(f'weighted average f1 score: {f1_weighted_score_val}')
    
    y_pred = search.predict(x_test_final)
    f1_weighted_score_test = f1_score(y_test_final, y_pred, average='weighted')
    f1_weighted_score_test_dict[key] = f1_weighted_score_test
    
    print(f'\nEvaluation using hold-out validation (test set):')
    print(f'weighted average f1 score: {f1_weighted_score_test}')
    
    print('\nClassification report:')
    print(classification_report(y_test_final, y_pred, labels=[0, 1]))
    
    pickle.dump(search.best_estimator_, open(f'../../../models/sentiment_analysis/oversampling/{key}.pkl', 'wb'))
    print(f'------------------------------------------------\n')

Train Model [logistic_regression_with_tfidf_vectorizer]:

Features extracted:
Features: ['aa' 'aaa' 'aaaa' ... 'zwift' 'zynga' 'zzzz']
Length: 26367

Evaluation using cross validation (validation set):
weighted average f1 score: 0.9123015979709894

Evaluation using hold-out validation (test set):
weighted average f1 score: 0.9096706708018325

Classification report:
              precision    recall  f1-score   support

           0       0.54      0.86      0.66      1365
           1       0.98      0.91      0.94     10547

    accuracy                           0.90     11912
   macro avg       0.76      0.88      0.80     11912
weighted avg       0.93      0.90      0.91     11912

------------------------------------------------

Train Model [logistic_regression_with_count_vectorizer]:

Features extracted:
Features: ['aa' 'aaa' 'aaaa' ... 'zwift' 'zynga' 'zzzz']
Length: 26367

Evaluation using cross validation (validation set):
weighted average f1 score: 0.916402666596046

Evaluat

In [28]:
print('Evaluation Metric for Different Models Using Validation Set:')
print(comparison_table(f1_weighted_score_val_dict))

Evaluation Metric for Different Models Using Validation Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.912302
logistic_regression_with_count_vectorizer               0.916403
linear_svc_with_tfidf_vectorizer                        0.914950
linear_svc_with_count_vectorizer                        0.910659
multinomial_nb_with_tfidf_vectorizer                    0.880967
multinomial_nb_with_count_vectorizer                    0.900874
decision_tree_with_tfidf_vectorizer                     0.875440
decision_tree_with_count_vectorizer                     0.875319
random_forest_with_tfidf_vectorizer                     0.911821
random_forest_with_count_vectorizer                     0.907507


In [29]:
print('Evaluation Metric for Different Models Using Testing Set:')
print(comparison_table(f1_weighted_score_test_dict))

Evaluation Metric for Different Models Using Testing Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.909671
logistic_regression_with_count_vectorizer               0.913053
linear_svc_with_tfidf_vectorizer                        0.911353
linear_svc_with_count_vectorizer                        0.910208
multinomial_nb_with_tfidf_vectorizer                    0.881370
multinomial_nb_with_count_vectorizer                    0.901339
decision_tree_with_tfidf_vectorizer                     0.871453
decision_tree_with_count_vectorizer                     0.873740
random_forest_with_tfidf_vectorizer                     0.910086
random_forest_with_count_vectorizer                     0.906533
