In [5]:
import pandas as pd

# Load the reviews CSV, keep only rows whose HelpfulnessDenominator exceeds 3,
# then replace every missing value (in any column) with an empty string.
df = (
    pd.read_csv(r"Reviews_withURL_preprocessing_v1.csv")
    .loc[lambda frame: frame["HelpfulnessDenominator"] > 3]
    .fillna("")
)

In [6]:
# Keep rows whose HelpfulnessNumerator is strictly below HelpfulnessDenominator
# and attach the numerator/denominator ratio as a new column.
# NOTE(review): the strict "<" also removes rows where both counts are equal
# (ratio == 1.0) -- confirm that this is intended.
df = (
    df.loc[lambda frame: frame["HelpfulnessNumerator"] < frame["HelpfulnessDenominator"]]
    .assign(
        HelpfulnessRatio=lambda frame: (
            frame["HelpfulnessNumerator"] / frame["HelpfulnessDenominator"]
        )
    )
)

In [7]:
def to_helpfulness_class(row, threshold=0.65):
    """Label one review row by helpfulness and sentiment.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row passed by
        ``df.apply(..., axis=1)``). Must provide "HelpfulnessRatio" and
        "Sentiment".
    threshold : float, default 0.65
        A ratio strictly greater than this value counts as helpful; a ratio
        less than or equal to it counts as unhelpful.

    Returns
    -------
    str or None
        One of "HelpfulPos", "HelpfulNeg", "UnhelpfulPos", "UnhelpfulNeg".
        ``None`` is returned when "Sentiment" is neither 1 nor -1, or when the
        ratio satisfies neither comparison (e.g. NaN).
    """
    ratio = row["HelpfulnessRatio"]
    sentiment = row["Sentiment"]

    if ratio > threshold:
        helpfulness = "Helpful"
    elif ratio <= threshold:
        helpfulness = "Unhelpful"
    else:
        # Neither comparison holds (NaN ratio): no label can be assigned.
        return None

    if sentiment == 1:
        return helpfulness + "Pos"
    if sentiment == -1:
        return helpfulness + "Neg"
    # Any other sentiment value has no class in the four-way scheme.
    return None

# Label every row: the function is called once per row and its return value
# is stored in the HelpfulnessClass column.
df["HelpfulnessClass"] = df.apply(to_helpfulness_class, axis="columns")

In [8]:
# Number of rows carrying each HelpfulnessClass label.
df["HelpfulnessClass"].value_counts()

HelpfulPos      24502
UnhelpfulPos    15748
UnhelpfulNeg     8671
HelpfulNeg       5426
Name: HelpfulnessClass, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split as TTS

# Model input is the TextAdj column; the target is the HelpfulnessClass label.
X, y = df["TextAdj"], df["HelpfulnessClass"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the class counts are
# imbalanced -- consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = TTS(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF features. A float min_df is a document-frequency proportion,
# so terms present in fewer than 0.01% of the training documents are dropped.
vectorizer = TfidfVectorizer(
    min_df=0.0001,
    ngram_range=(1, 1),
)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
from sklearn.ensemble import RandomForestClassifier
import pickle

# Train a random forest on the TF-IDF training matrix; the fixed random_state
# makes the fitted model repeatable.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_tfidf, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed before any later cell reopens it.
model_name = '4class.sav'
with open(model_name, 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [12]:
model_name = '4class.sav'

# Reload the persisted classifier; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# were produced by a trusted source (here, the training cell of this notebook).
with open(model_name, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict a label for every row of the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score

# Report the evaluation metrics for the held-out predictions, one per line.
# Weighted-average recall is numerically the same quantity as accuracy, so the
# first and last figures printed below coincide.
for label, value in (
    ("Accuracy: ", accuracy_score(y_test, y_pred)),
    ("Confusion Matrix: \n", confusion_matrix(y_test, y_pred)),
    ("Classification Report: \n", classification_report(y_test, y_pred)),
    ("Recall Score: ", recall_score(y_test, y_pred, average='weighted')),
):
    print(label, value)

Accuracy:  0.663968107942349
Confusion Matrix: 
 [[ 601  563  226  234]
 [  48 6478  239  621]
 [  83  615 1394  465]
 [  63 2047  275 2353]]
Classification Report: 
               precision    recall  f1-score   support

  HelpfulNeg       0.76      0.37      0.50      1624
  HelpfulPos       0.67      0.88      0.76      7386
UnhelpfulNeg       0.65      0.55      0.59      2557
UnhelpfulPos       0.64      0.50      0.56      4738

    accuracy                           0.66     16305
   macro avg       0.68      0.57      0.60     16305
weighted avg       0.67      0.66      0.65     16305

Recall Score:  0.663968107942349
