In [8]:
import pandas as pd

# Load the pre-processed reviews and replace missing values with empty strings.
df = pd.read_csv(r"Reviews_withURL_preprocessing.csv").fillna("")

# Keep reviews with more than 3 helpfulness votes whose numerator is strictly
# below the denominator.
# NOTE(review): the strict `<` also removes reviews that every voter found
# helpful (ratio == 1.0) — confirm this is intended.
enough_votes = df["HelpfulnessDenominator"] > 3
numerator_below_denominator = df["HelpfulnessNumerator"] < df["HelpfulnessDenominator"]

# .copy() detaches the filtered rows from the loaded frame, so adding a column
# below does not raise pandas' SettingWithCopyWarning.
df = df.loc[enough_votes & numerator_below_denominator].copy()
df["HelpfulnessRatio"] = df["HelpfulnessNumerator"] / df["HelpfulnessDenominator"]

In [2]:
def to_helpfulness_class(row, threshold=0.65):
    """Label a review row with one of four helpfulness/sentiment classes.

    Parameters
    ----------
    row : mapping
        Must provide ``row["HelpfulnessRatio"]`` and ``row["Sentiment"]``.
    threshold : float, default 0.65
        Ratios strictly above this value count as helpful; ratios at or
        below it count as unhelpful.

    Returns
    -------
    str or None
        ``"HelpfulPos"``, ``"HelpfulNeg"``, ``"UnhelpfulPos"`` or
        ``"UnhelpfulNeg"``. ``None`` when the sentiment is neither 1 nor -1,
        or when the ratio compares as neither above nor at/below the
        threshold (e.g. NaN).
    """
    ratio = row["HelpfulnessRatio"]
    sentiment = row["Sentiment"]

    if sentiment == 1:
        polarity = "Pos"
    elif sentiment == -1:
        polarity = "Neg"
    else:
        return None

    if ratio > threshold:
        return "Helpful" + polarity
    if ratio <= threshold:
        return "Unhelpful" + polarity

    # Reached only when both comparisons are False (NaN ratio).
    return None
    
def to_emotion_class(row):
    """Translate ``row["Sentiment"]`` into "Pos" (1) or "Neg" (-1).

    Any other sentiment value yields ``None``.
    """
    sentiment = row["Sentiment"]
    for code, label in ((1, "Pos"), (-1, "Neg")):
        if sentiment == code:
            return label
    return None

# Derive the two label columns row by row (axis="columns" is the same as axis=1).
df['HelpfulnessClass'] = df.apply(func=to_helpfulness_class, axis="columns")
df['EmotionClass'] = df.apply(func=to_emotion_class, axis="columns")

In [15]:
# for i in vectorizer.get_feature_names_out():
#     print(i)

abdominal
able
abnormal
about
above
absent
absinthe
absolute
absolutely
absorb
absorbable
absorbed
absorptive
absurd
abundant
acai
accept
acceptable
accessible
accident
accordingly
account
accountable
accuracy
accurate
accustom
acesulfame
acetyl
acid
acidic
acidophilus
acids
acne
acplished
acquire
acrid
acting
active
activity
actual
actually
acute
ad
add
added
addict
addicted
addicting
addictive
addition
additional
additive
additives
adequate
adhesion
adhesive
adjust
adjustable
admitted
adorable
adrenal
adult
adults
advance
adventurous
adverse
advertise
advertised
advertisement
advertising
advice
advisable
advised
aerogarden
affect
affected
afford
affordable
aforementioned
afraid
african
after
afterall
afterlife
afternoon
aftertaste
afterwards
again
agave
age
ages
aggressive
agility
ago
agree
agreeable
agressive
agricultural
ahead
ahold
aid
ain
ainer
air
airtight
airy
akin
alas
alaskan
alcohol
alcoholic
alergic
alert
algae
alien
alittle
alive
alka
alkali
alkaline
all
allege
allergenic


In [13]:
# Sanity check: vectorise one sample sentence and show it as a dense array.
# NOTE(review): `vectorizer` is only created in the "Text to tf-idf" cell further
# down, so this cell fails on a fresh top-to-bottom run — run it after that cell.
vectorizer.transform(['The Strawberry Twizzlers are my guilty pleasure - yummy. ']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Show the `Text` value of the second row of df.
# NOTE(review): the DataFrame preview later in this notebook lists `TextAdj` but
# no `Text` column — confirm this cell still runs against the current CSV.
df['Text'].iloc[1]

"My daughter loves twizzlers and this shipment of six pounds really hit the spot. It's exactly what you would expect...six packages of strawberry twizzlers."

In [4]:
# Number of reviews in each of the four helpfulness/sentiment classes.
df['HelpfulnessClass'].value_counts()

HelpfulPos      24502
UnhelpfulPos    15748
UnhelpfulNeg     8671
HelpfulNeg       5426
Name: HelpfulnessClass, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split as TTS
from sklearn.preprocessing import LabelEncoder

# Features: the pre-processed review text. Targets: the four-class label,
# integer-encoded by the LabelEncoder kept in `lb`.
lb = LabelEncoder()
X = df["TextAdj"]
y = lb.fit_transform(df["HelpfulnessClass"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = TTS(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# **Text to tf-idf**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF vectorizer with a minimum document frequency of 0.0001.
vectorizer = TfidfVectorizer(
    min_df=0.0001,
    ngram_range=(1, 1),
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [7]:
# Preview the filtered and labelled DataFrame.
df

Unnamed: 0,TextAdj,Sentiment,HelpfulnessNumerator,HelpfulnessDenominator,HelpfulnessRatio,HelpfulnessClass,EmotionClass
14,guilty,1,4,5,0.800000,HelpfulPos,Pos
15,,1,4,5,0.800000,HelpfulPos,Pos
50,,-1,0,7,0.000000,UnhelpfulNeg,Neg
60,good,1,2,4,0.500000,UnhelpfulPos,Pos
100,only heavy.,1,4,5,0.800000,HelpfulPos,Pos
...,...,...,...,...,...,...,...
568388,many alternative sugar sugary same regular sub...,1,5,6,0.833333,HelpfulPos,Pos
568403,stick-form thin plastic not_sure open other ea...,1,0,6,0.000000,UnhelpfulPos,Pos
568405,easy taste good.,1,3,5,0.600000,UnhelpfulPos,Pos
568417,best able,1,2,4,0.500000,UnhelpfulPos,Pos


# **Experiment**

In [11]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_helper(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print test-set metrics.

    Prints accuracy, weighted precision/recall/F1, the confusion matrix and
    the classification report for the predictions on ``X_test``. The model is
    fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Accuracy Score: ", accuracy_score(y_test, predictions))

    # The three class-averaged metrics share the same call signature.
    weighted_metrics = (
        ("Precision Score: ", precision_score),
        ("Recall Score: ", recall_score),
        ("F1 Score: ", f1_score),
    )
    for heading, metric in weighted_metrics:
        print(heading, metric(y_test, predictions, average='weighted'))

    print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
    print("Classification Report: \n", classification_report(y_test, predictions))

## **E1: single 4-class classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# E1: random forest (100 trees) on the TF-IDF features.
rfc = RandomForestClassifier(random_state=42, n_estimators=100)
train_helper(
    model=rfc,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# E1: single decision tree on the TF-IDF features.
dtc = DecisionTreeClassifier(random_state=42)
train_helper(
    model=dtc,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# E1: logistic regression on the TF-IDF features.
lr = LogisticRegression(random_state=42)
train_helper(
    model=lr,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)

In [None]:
from xgboost import XGBClassifier

# E1: gradient-boosted trees (XGBoost) on the TF-IDF features.
xgb = XGBClassifier(random_state=42)
train_helper(
    model=xgb,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners of the stacked ensemble as (name, estimator) pairs.
estimators = list({
    'rf': RandomForestClassifier(random_state=42),
    'lr': LogisticRegression(random_state=42),
    'xgb': XGBClassifier(random_state=42),
}.items())

# A random forest acts as the meta-learner on top of the base learners.
stacking_classifier = StackingClassifier(
    final_estimator=RandomForestClassifier(random_state=42),
    estimators=estimators,
)

train_helper(
    model=stacking_classifier,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)

## **E2: two binary classifiers (sentiment and helpfulness) combined into 4 classes**

In [None]:
# from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _combine_binary_predictions(y1_pred, y2_pred):
    """Merge per-sample sentiment (y1) and helpfulness (y2) labels into 4 classes.

    Mapping (y1, y2) -> combined label:
        (0, 1) -> 0, (1, 1) -> 1, (0, 0) -> 2, (1, 0) -> 3

    NOTE(review): this presumably matches the integer codes the notebook's
    LabelEncoder gives to HelpfulNeg / HelpfulPos / UnhelpfulNeg /
    UnhelpfulPos — verify against ``lb.classes_``.

    Raises
    ------
    ValueError
        If a pair is not one of the four combinations above. Skipping such a
        pair would leave the result shorter than the inputs.
    """
    label_of = {
        (0, 1): 0,
        (1, 1): 1,
        (0, 0): 2,
        (1, 0): 3,
    }

    combined = []
    for position, pair in enumerate(zip(y1_pred, y2_pred)):
        if pair not in label_of:
            raise ValueError(
                f"Unexpected binary prediction pair {pair!r} at position {position}; "
                "expected each prediction to be 0 or 1."
            )
        combined.append(label_of[pair])
    return combined


def train_22class_helper(model, X_train, X_test, y1_train, y2_train, y1_test, y2_test, y_test):
    """Train two binary models and score them separately and as a 4-class result.

    ``model`` is fitted first on the sentiment target (``y1_train``) and then
    re-fitted on the helpfulness target (``y2_train``); predictions on
    ``X_test`` are taken after each fit. The same estimator object is reused,
    so after the call it holds the helpfulness fit.

    Prints accuracy/precision/recall/F1 for each binary task (binary
    averaging) and for the combined 4-class prediction against ``y_test``
    (weighted averaging). Nothing is returned.

    Raises
    ------
    ValueError
        If either model predicts something other than 0 or 1.
    """
    # y1 = sentiment, y2 = helpfulness
    model.fit(X_train, y1_train)
    y1_pred = model.predict(X_test)

    model.fit(X_train, y2_train)
    y2_pred = model.predict(X_test)

    y_pred = _combine_binary_predictions(y1_pred, y2_pred)

    print("Sentiment ------------------")
    print("Accuracy Score: ", accuracy_score(y1_test, y1_pred))
    print("Precision Score: ", precision_score(y1_test, y1_pred))
    print("Recall Score: ", recall_score(y1_test, y1_pred))
    print("F1 Score: ", f1_score(y1_test, y1_pred))

    print("\nHelpful --------------------")
    print("Accuracy Score: ", accuracy_score(y2_test, y2_pred))
    print("Precision Score: ", precision_score(y2_test, y2_pred))
    print("Recall Score: ", recall_score(y2_test, y2_pred))
    print("F1 Score: ", f1_score(y2_test, y2_pred))

    print("\nOverall --------------------")
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
    print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))
    print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))

In [None]:
# df.HelpfulnessRatio.apply(lambda x: 1 if x > 0.65 else 0)

In [None]:
# y2_train

In [None]:
# Binary targets for E2: y1 = sentiment (-1 -> 0, 1 -> 1),
# y2 = helpfulness (1 when HelpfulnessRatio > 0.65, otherwise 0).
sentiment_binary = df.Sentiment.map({-1: 0, 1: 1})
helpful_binary = df.HelpfulnessRatio.apply(lambda x: 1 if x > 0.65 else 0)

# Take the rows of the existing X_train / X_test split by index instead of
# calling TTS a second time. The targets then stay aligned with the TF-IDF
# features even if test_size or random_state of the original split changes.
# Assumes df's index is unique — TODO confirm.
y1_train, y1_test = sentiment_binary.loc[X_train.index], sentiment_binary.loc[X_test.index]
y2_train, y2_test = helpful_binary.loc[X_train.index], helpful_binary.loc[X_test.index]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Base learners of the stacked ensemble as (name, estimator) pairs.
estimators = list({
    'rf': RandomForestClassifier(random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'xgb': XGBClassifier(random_state=42),
}.items())

# A random forest acts as the meta-learner on top of the base learners.
stacking_classifier = StackingClassifier(
    final_estimator=RandomForestClassifier(random_state=42),
    estimators=estimators,
)

In [None]:
# E2 with logistic regression as the binary model for both tasks.
train_22class_helper(
    model=LogisticRegression(random_state=42),
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y1_train=y1_train,
    y2_train=y2_train,
    y1_test=y1_test,
    y2_test=y2_test,
    y_test=y_test,
)

In [None]:
# E2 with XGBoost as the binary model for both tasks.
train_22class_helper(
    model=XGBClassifier(random_state=42),
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y1_train=y1_train,
    y2_train=y2_train,
    y1_test=y1_test,
    y2_test=y2_test,
    y_test=y_test,
)

In [None]:
# E2 with a decision tree as the binary model for both tasks.
train_22class_helper(
    model=DecisionTreeClassifier(random_state=42),
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y1_train=y1_train,
    y2_train=y2_train,
    y1_test=y1_test,
    y2_test=y2_test,
    y_test=y_test,
)

In [None]:
# E2 with a random forest as the binary model for both tasks.
train_22class_helper(
    model=RandomForestClassifier(random_state=42),
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y1_train=y1_train,
    y2_train=y2_train,
    y1_test=y1_test,
    y2_test=y2_test,
    y_test=y_test,
)

In [None]:
# E2 with the stacked ensemble as the binary model for both tasks.
train_22class_helper(
    model=stacking_classifier,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y1_train=y1_train,
    y2_train=y2_train,
    y1_test=y1_test,
    y2_test=y2_test,
    y_test=y_test,
)

## **SMOTE: 4-class models trained on an oversampled training set**

In [7]:
from imblearn.over_sampling import SMOTE

In [8]:
# Oversample the training split only; the models below are still evaluated on
# the untouched X_test_tfidf / y_test.
smote = SMOTE(random_state=42)
X_train_tfidf_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# SMOTE: logistic regression trained on the oversampled training set.
lr = LogisticRegression(random_state=42)
train_helper(
    model=lr,
    X_train=X_train_tfidf_smote,
    y_train=y_train_smote,
    X_test=X_test_tfidf,
    y_test=y_test,
)

In [40]:
from xgboost import XGBClassifier

# SMOTE: XGBoost trained on the oversampled training set.
xgb = XGBClassifier(random_state=42)
train_helper(
    model=xgb,
    X_train=X_train_tfidf_smote,
    y_train=y_train_smote,
    X_test=X_test_tfidf,
    y_test=y_test,
)

Accuracy Score:  0.5628334866605336
Precision Score:  0.5889905440333771
Recall Score:  0.5628334866605336
F1 Score:  0.5640041978534964
Confusion Matrix: 
 [[ 582  236  642  164]
 [ 361 4879 1067 1079]
 [ 248  219 1828  262]
 [ 281 1434 1135 1888]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.40      0.36      0.38      1624
           1       0.72      0.66      0.69      7386
           2       0.39      0.71      0.51      2557
           3       0.56      0.40      0.46      4738

    accuracy                           0.56     16305
   macro avg       0.52      0.53      0.51     16305
weighted avg       0.59      0.56      0.56     16305



In [41]:
from sklearn.tree import DecisionTreeClassifier

# SMOTE: decision tree trained on the oversampled training set.
dtc = DecisionTreeClassifier(random_state=42)
train_helper(
    model=dtc,
    X_train=X_train_tfidf_smote,
    y_train=y_train_smote,
    X_test=X_test_tfidf,
    y_test=y_test,
)

Accuracy Score:  0.6044771542471634
Precision Score:  0.6152836401899787
Recall Score:  0.6044771542471634
F1 Score:  0.6079048194373683
Confusion Matrix: 
 [[ 743  301  342  238]
 [ 440 4912  677 1357]
 [ 267  353 1589  348]
 [ 319 1182  625 2612]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.42      0.46      0.44      1624
           1       0.73      0.67      0.70      7386
           2       0.49      0.62      0.55      2557
           3       0.57      0.55      0.56      4738

    accuracy                           0.60     16305
   macro avg       0.55      0.57      0.56     16305
weighted avg       0.62      0.60      0.61     16305



In [42]:
from sklearn.ensemble import RandomForestClassifier

# SMOTE: random forest (100 trees) trained on the oversampled training set.
rfc = RandomForestClassifier(random_state=42, n_estimators=100)
train_helper(
    model=rfc,
    X_train=X_train_tfidf_smote,
    y_train=y_train_smote,
    X_test=X_test_tfidf,
    y_test=y_test,
)

Accuracy Score:  0.6396810794234897
Precision Score:  0.6406356214274785
Recall Score:  0.6396810794234897
F1 Score:  0.6371481821458082
Confusion Matrix: 
 [[ 716  333  381  194]
 [ 273 5573  564  976]
 [ 227  330 1690  310]
 [ 230 1454  603 2451]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.44      0.47      1624
           1       0.72      0.75      0.74      7386
           2       0.52      0.66      0.58      2557
           3       0.62      0.52      0.57      4738

    accuracy                           0.64     16305
   macro avg       0.59      0.59      0.59     16305
weighted avg       0.64      0.64      0.64     16305



In [9]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Base learners of the stacked ensemble as (name, estimator) pairs.
estimators = list({
    'rf': RandomForestClassifier(random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'xgb': XGBClassifier(random_state=42),
}.items())

# A random forest acts as the meta-learner on top of the base learners.
stacking_classifier = StackingClassifier(
    final_estimator=RandomForestClassifier(random_state=42),
    estimators=estimators,
)

In [12]:
# SMOTE: stacked ensemble trained on the oversampled training set.
train_helper(
    model=stacking_classifier,
    X_train=X_train_tfidf_smote,
    y_train=y_train_smote,
    X_test=X_test_tfidf,
    y_test=y_test,
)

Accuracy Score:  0.6453235203925176
Precision Score:  0.6409211128254181
Recall Score:  0.6453235203925176
F1 Score:  0.6358610079378435
Confusion Matrix: 
 [[ 645  503  294  182]
 [ 143 5988  418  837]
 [ 154  533 1557  313]
 [ 148 1763  495 2332]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.59      0.40      0.48      1624
           1       0.68      0.81      0.74      7386
           2       0.56      0.61      0.59      2557
           3       0.64      0.49      0.56      4738

    accuracy                           0.65     16305
   macro avg       0.62      0.58      0.59     16305
weighted avg       0.64      0.65      0.64     16305

