In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import warnings
import random
import matplotlib.pyplot as plt

## Some utility functions to print out reports 

In [2]:
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def print_regression_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print('Mean absolute error:', mean_absolute_error(y_val, y_pred))
    print('Mean squared error:', mean_squared_error(y_val, y_pred))
    print('Root Mean squared error:', np.sqrt(mean_squared_error(y_val, y_pred)))

def print_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val, y_pred)}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')

def print_multilabel_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val.argmax(axis=1), y_pred.argmax(axis=1))}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')



## Data Preview

In [8]:
# data set preview
data = pd.read_csv('datasets/mcdonald_reviews/amazon_test.csv', encoding='latin-1', header=None)
data.columns = ['rating', 'title', 'review']
print(data.head(5))

   rating                                              title  \
0       2                                           Great CD   
1       2  One of the best game music soundtracks - for a...   
2       1                   Batteries died within a year ...   
3       2              works fine, but Maha Energy is better   
4       2                       Great for the non-audiophile   

                                              review  
0  My lovely Pat has one of the GREAT voices of h...  
1  Despite the fact that I have only played a sma...  
2  I bought this charger in Jul 2003 and it worke...  
3  Check out Maha Energy's website. Their Powerex...  
4  Reviewed quite a bit of the combo players and ...  


In [None]:
def read_data(file_path):
    data = pd.read_csv(file_path, encoding='latin-1', header=None)
    data.columns = ['rating', 'title', 'review']
    data.dropna(inplace=True)
    
    label_encoder = LabelEncoder()
    
    X, y = data['title'] + data['review'], data['rating']

    for i in range(len(y)):
        if y[i] <= 2:
            y[i] = 0
        elif y[i] == 3:
            y[i] = 1
        else:
            y[i] = 2

    return X, y, data

## Positive - Negative classification

Here, I stripped all 3-stars labels and only keep 4 and 5 as positive, and 1-2 as negative.
There is another submission that used transformer with 90% accuracy.
Here, we get around > 93% accuracy for this formulation using 5-fold cross validation.

In [42]:
def read_data_positive_negative_only(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)
    mask = data['rating'] != '3 stars'
    filtered_data = data[mask]
    data = filtered_data

    label_encoder = LabelEncoder()
    T_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=10000)

    X = data['review'].values
    y = data['rating'].values

    for i in range(len(y)):
        if y[i] == '1 star':
            y[i] = 0
        elif y[i] == '2 stars':
            y[i] = 0
        elif y[i] == '4 stars':
            y[i] = 1
        elif y[i] == '5 stars':
            y[i] = 1

    X = T_vectorizer.fit_transform(X)
    y = label_encoder.fit_transform(y)
    return X, y

X, y = read_data_positive_negative_only('datasets/mcdonald_reviews/McDonald_s_Reviews.csv')
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# k-fold experiment using XGBoost
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  model = XGBClassifier(n_estimators=2500, learning_rate=0.3, max_depth=5, colsample_bytree=0.3, n_jobs=-1, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_classification_report(y_val, y_pred, fold)
  fold += 1



Fold: 1
Accuracy Score: 0.9322154834106314
Confusion Matrix: 
 [[2223  194]
 [ 186 3003]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.92      0.92      2417
           1       0.94      0.94      0.94      3189

    accuracy                           0.93      5606
   macro avg       0.93      0.93      0.93      5606
weighted avg       0.93      0.93      0.93      5606

Fold: 2
Accuracy Score: 0.9338209061719586
Confusion Matrix: 
 [[2226  215]
 [ 156 3009]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      2441
           1       0.93      0.95      0.94      3165

    accuracy                           0.93      5606
   macro avg       0.93      0.93      0.93      5606
weighted avg       0.93      0.93      0.93      5606

Fold: 3
Accuracy Score: 0.9341776667855869
Confusion Matrix: 
 [[2253  208]
 [ 161 2984]]
Classification Report: 
  

In [51]:
def read_data_positive_negative_neutral(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)
    custom_token_pattern = r"(?u)\b\w\w+\b|!|\?|\"|\'|;|:|,|\."

    label_encoder = LabelEncoder()
    #T_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=20000)
    T_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=20000)

    X = data['review'].values
    y = data['rating'].values

    for i in range(len(y)):
        if y[i] == '1 star' or y[i] == '2 stars':
            y[i] = -1
        elif y[i] == '3 stars':
            y[i] = 0
        elif y[i] == '4 stars' or y[i] == '5 stars':
            y[i] = 1

    X = T_vectorizer.fit_transform(X)
    y = label_encoder.fit_transform(y)
    return X, y, data

X, y, data = read_data_positive_negative_neutral('datasets/mcdonald_reviews/McDonald_s_Reviews.csv')
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# k-fold experiment using XGBoost
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  model = XGBClassifier(n_estimators=300, learning_rate=0.5, max_depth=8, n_jobs=-1, random_state=42, colsample_bytree=0.3)
  #model = RandomForestClassifier(n_estimators=200, max_depth=450, n_jobs=-1, random_state=42, max_leaf_nodes=3000)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_classification_report(y_val, y_pred, fold)
  fold += 1

  misclassified = np.where(y_val != y_pred)
  misclassified_labels = y_val[misclassified]
  misclassified_predictions = y_pred[misclassified]
  misclassified_reviews = data['review'].values[misclassified]

  indices = random.sample(range(len(misclassified_labels)), 10)

  for i in indices:
    print(f'Review: {misclassified_reviews[i]}, Actual: {misclassified_labels[i]}, Predicted: {misclassified_predictions[i]}')






Fold: 1
Accuracy Score: 0.8459071472205254
Confusion Matrix: 
 [[2230   80  185]
 [ 206  403  313]
 [ 141   84 2906]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      2495
           1       0.71      0.44      0.54       922
           2       0.85      0.93      0.89      3131

    accuracy                           0.85      6548
   macro avg       0.81      0.75      0.77      6548
weighted avg       0.84      0.85      0.84      6548

Review: Fast service good customer service no complaints, Actual: 2, Predicted: 0
Review: Friendly and efficient staff. There is a methadone clinic down the street so the clientele are a pretty rough crowd., Actual: 1, Predicted: 2
Review: Quick to pickup when you don't have much time. The fountain drinks are the best!, Actual: 1, Predicted: 2
Review: Not open 24/7, Actual: 1, Predicted: 2
Review: So ehm what can I say? Italy is the best for the food of course and the chi

## Classification with all 5 labels
Here we see how well XGBoost gets us by having all 5 labels.

The acccuracy is about 70% which is not bad at all (considering misclassifying between 1 and 2, or 4 and 5 are both penalized).



In [27]:
def read_data_positive_negative_neutral(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)

    label_encoder = LabelEncoder()
    T_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)

    X = data['review'].values
    y = data['rating'].values

    for i in range(len(y)):
        if y[i] == '1 star':
            y[i] = 1
        elif y[i] == '2 stars':
            y[i] = 2
        elif y[i] == '3 stars':
            y[i] = 3
        elif y[i] == '4 stars':
            y[i] = 4
        elif y[i] == '5 stars':
            y[i] = 5

    X = T_vectorizer.fit_transform(X)
    y = label_encoder.fit_transform(y)
    return X, y

X, y = read_data_positive_negative_neutral('datasets/mcdonald_reviews/McDonald_s_Reviews.csv')
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# k-fold experiment using XGBoost
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  model = XGBClassifier(n_estimators=2500, learning_rate=0.3, max_depth=4, colsample_bytree=0.2, n_jobs=-1, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_classification_report(y_val, y_pred, fold)
  fold += 1



KeyboardInterrupt: 

In [None]:
def read_data_positive_negative_neutral(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)

    label_encoder = LabelEncoder()
    T_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)

    X = data['review'].values
    y = data['rating'].values

    for i in range(len(y)):
        if y[i] == '1 star':
            y[i] = -1
        elif y[i] == '2 stars':
            y[i] = -1
        elif y[i] == '3 stars':
            y[i] = 0
        elif y[i] == '4 stars':
            y[i] = 1
        elif y[i] == '5 stars':
            y[i] = 1

    X = T_vectorizer.fit_transform(X)
    y = label_encoder.fit_transform(y)
    return X, y

X, y = read_data_positive_negative_neutral('datasets/mcdonald_reviews/McDonald_s_Reviews.csv')
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# k-fold experiment using XGBoost
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  model = XGBClassifier(n_estimators=1000, learning_rate=0.3, max_depth=4, colsample_bytree=0.2, n_jobs=-1, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_classification_report(y_val, y_pred, fold)
  fold += 1



## Regression formulation

Here, we treat this problem as a regression task with output value between 0 and 5.
We get a MAE +/- 0.63. This is not bad at all in the scale of 5. 

In [44]:
def read_data_positive_negative_neutral(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)

    label_encoder = LabelEncoder()
    T_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)

    X = data['review'].values
    y = data['rating'].values

    for i in range(len(y)):
        if y[i] == '1 star':
            y[i] = 1
        elif y[i] == '2 stars':
            y[i] = 2
        elif y[i] == '3 stars':
            y[i] = 3
        elif y[i] == '4 stars':
            y[i] = 4
        elif y[i] == '5 stars':
            y[i] = 5

    X = T_vectorizer.fit_transform(X)
    y = label_encoder.fit_transform(y)
    return X, y

X, y = read_data_positive_negative_neutral('datasets/mcdonald_reviews/McDonald_s_Reviews.csv')
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# k-fold experiment using XGBoost
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  #model = XGBClassifier(n_estimators=2500, learning_rate=0.3, max_depth=4, colsample_bytree=0.2, n_jobs=-1, random_state=42)
  model = XGBRegressor(n_estimators=2500, learning_rate=0.3, max_depth=4, colsample_bytree=0.2, n_jobs=-1, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_regression_report(y_val, y_pred, fold)
  fold += 1   



Fold: 1
Mean absolute error: 0.6254317914566444
Mean squared error: 0.8036038548184976
Root Mean squared error: 0.8964395433148281
Fold: 2
Mean absolute error: 0.6315393408940457
Mean squared error: 0.8037277156520268
Root Mean squared error: 0.8965086255313034
Fold: 3
Mean absolute error: 0.6234922288278544
Mean squared error: 0.7849631240151761
Root Mean squared error: 0.8859814467669038
Fold: 4
Mean absolute error: 0.6337750923112635
Mean squared error: 0.8193241161104878
Root Mean squared error: 0.9051652424339369
Fold: 5
Mean absolute error: 0.6158652616585568
Mean squared error: 0.7841993174187035
Root Mean squared error: 0.885550290733792
