In [56]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [57]:
df_reviews = pd.read_json('../../../data/processed/reviews.json.gz', orient="records", compression="gzip")

In [58]:
x = df_reviews[['cleaned_review']]
y = df_reviews[['sentiment']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [59]:
print("Distribution of +/- review sentiment: \n{}".format(y_train.value_counts(normalize=True)))

Distribution of +/- review sentiment: 
sentiment
1            0.883957
0            0.116043
dtype: float64


In [60]:
x_train_final = x_train['cleaned_review'].values
x_test_final = x_test['cleaned_review'].values
y_train_final = y_train['sentiment'].values
y_test_final = y_test['sentiment'].values

In [61]:
vectorizers = {
    'tfidf_vectorizer': TfidfVectorizer(),
    'count_vectorizer': CountVectorizer()
}

classifiers = {
    'logistic_regression': LogisticRegression(),
    'linear_svc': LinearSVC(),
    'multinomial_nb': MultinomialNB(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier()
}

In [62]:
def create_pipelines(vectorizers = vectorizers, classifiers = classifiers):
    pipelines = {}

    for c_key in classifiers:
        for v_key in vectorizers:
            pipelines[f'{c_key}_with_{v_key}'] = Pipeline([
                ('vect', vectorizers[v_key]), 
                ('clf', classifiers[c_key])
            ])

    return pipelines

def comparison_table(weighted_avg_f1_score):
    df_model = pd.DataFrame(index=weighted_avg_f1_score.keys(), columns=['weighted_avg_f1_score'])
    df_model['weighted_avg_f1_score'] = weighted_avg_f1_score.values()
    return df_model

In [63]:
pipelines = create_pipelines()

In [64]:
f1_weighted_score_val_dict = {}
f1_weighted_score_test_dict = {}

cv=StratifiedKFold(n_splits=5)

for key in pipelines:
    search = GridSearchCV(
                estimator=pipelines[key], 
                param_grid={}, 
                scoring='f1_weighted',
                cv=cv, 
                n_jobs=-1
            )
    
    print(f'Train Model [{key}]:')
    search.fit(x_train_final, y_train_final)

    print(f'\nFeatures extracted:')
    feature_names = search.best_estimator_.named_steps['vect'].get_feature_names_out()
    print(f'Features: {feature_names}')
    print(f'Length: {len(feature_names)}')

    f1_weighted_score_val = search.best_score_
    f1_weighted_score_val_dict[key] = f1_weighted_score_val
    print(f'\nEvaluation using cross validation (validation set):')
    print(f'weighted average f1 score: {f1_weighted_score_val}')
    
    y_pred = search.predict(x_test_final)
    f1_weighted_score_test = f1_score(y_test_final, y_pred, average='weighted')
    f1_weighted_score_test_dict[key] = f1_weighted_score_test
    
    print(f'\nEvaluation using hold-out validation (test set):')
    print(f'weighted average f1 score: {f1_weighted_score_test}')
    
    print('\nClassification report:')
    print(classification_report(y_test_final, y_pred, labels=[0, 1]))
    
    pickle.dump(search.best_estimator_, open(f'../../../models/sentiment_analysis/baseline/{key}.pkl', 'wb'))
    print(f'------------------------------------------------\n')

Train Model [logistic_regression_with_tfidf_vectorizer]:

Features extracted:
Features: ['aaa' 'aaa battery' 'ability' ... 'zoom take' 'zoom work' 'zotac']
Length: 11000

Evaluation using cross validation (validation set):
weighted average f1 score: 0.9290844509978136

Evaluation using hold-out validation (test set):
weighted average f1 score: 0.9298239676010923

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.56      0.66      1365
           1       0.94      0.98      0.96     10547

    accuracy                           0.94     11912
   macro avg       0.88      0.77      0.81     11912
weighted avg       0.93      0.94      0.93     11912

------------------------------------------------



In [65]:
print('Evaluation Metric for Different Models Using Validation Set:')
print(comparison_table(f1_weighted_score_val_dict))

Evaluation Metric for Different Models Using Validation Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.929084


In [66]:
print('Evaluation Metric for Different Models Using Testing Set:')
print(comparison_table(f1_weighted_score_test_dict))

Evaluation Metric for Different Models Using Testing Set:
                                           weighted_avg_f1_score
logistic_regression_with_tfidf_vectorizer               0.929824
