# **Preprocessing**

In [1]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer as SIA
import re

In [2]:
df = pd.read_csv(r"Reviews_withURL.csv")
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
sia = SIA()
df["Sentiment"] = df["Text"].apply(lambda x: 1 if sia.polarity_scores(x)["compound"] > 0 else -1)

In [4]:
def segment_pos_tagging(text):
    
    text = re.sub("[0-9]|br|<|>|com|<br>", "", text, 0, re.MULTILINE)
    words = text.split()
    
    return nltk.tag.pos_tag(words)

df['TextSegment'] = df.Text.apply(segment_pos_tagging)

In [74]:
def reconnect(text_segment):

    pos_tags = ['JJ', 'JJR', 'JJS']
    reconnect_adj = []
    
    for i in range(len(text_segment)):
        if text_segment[i][1] in pos_tags:
            
            adj = text_segment[i][0]
            adj = adj.replace(' ', '').replace(',','').replace(' ','').replace('/', '').replace('_', '')

            if len(adj) >= 4:

                if text_segment[i-1][0] == 'not':
                    reconnect_adj.append(f'not_{adj}')
                
                else:
                    reconnect_adj.append(adj)
            
            else:
                pass
    
    return " ".join(text for text in reconnect_adj)

df['TextAdj'] = df.TextSegment.apply(reconnect)

In [None]:
df[['TextAdj', 'Sentiment', 'HelpfulnessNumerator', 'HelpfulnessDenominator']].to_csv(r"Reviews_withURL_preprocessing_v1.csv", index=False)

In [6]:
df.to_csv(r"Reviews_withURL_preprocessing.csv", index=False)

# **Training**

In [13]:
import pandas as pd

df = pd.read_csv(r"Reviews_withURL_preprocessing_v1.csv")

In [14]:
df = df[df["HelpfulnessDenominator"] > 3]

In [15]:
# df["HelpfulnessDenominator"].value_counts()

In [16]:
df = df[df["HelpfulnessNumerator"] < df["HelpfulnessDenominator"]]
df["HelpfulnessRatio"] = df["HelpfulnessNumerator"] / df["HelpfulnessDenominator"]

In [17]:
def to_helpfulness_class(row):
    threshold = 0.65
    # if row["HelpfulnessRatio"] > threshold:
    #     return "Helpful"
    # else:
    #     return "Unhelpful"
    if row["HelpfulnessRatio"] > threshold and row["Sentiment"] == 1:
        return "HelpfulPos"
    elif row["HelpfulnessRatio"] > threshold and row["Sentiment"] == -1:
        return "HelpfulNeg"
    elif row["HelpfulnessRatio"] <= threshold and row["Sentiment"] == 1:
        return "UnhelpfulPos"
    elif row["HelpfulnessRatio"] <= threshold and row["Sentiment"] == -1:
        return "UnhelpfulNeg"

df['HelpfulnessClass'] = df.apply(to_helpfulness_class, axis=1)

In [18]:
df['HelpfulnessClass'].value_counts()

HelpfulPos      24502
UnhelpfulPos    15748
UnhelpfulNeg     8671
HelpfulNeg       5426
Name: HelpfulnessClass, dtype: int64

In [19]:
df.fillna("", inplace=True)

In [20]:
from sklearn.model_selection import train_test_split as TTS

X = df.TextAdj
y = df.HelpfulnessClass

X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=42)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=0.0001)
# vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=0.0001, max_df=0.99)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [22]:
# vectorizer.get_feature_names_out()

In [23]:
# # # vectorizer.get_feature_names_out().shape
# from imblearn.under_sampling import ClusterCentroids

# cc = ClusterCentroids(random_state=0)
# X_train_resampled, y_train_resampled = cc.fit_resample(X_train_tfidf, y_train)

In [27]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression
import pickle

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_tfidf, y_train)

model_name = '4class.sav'
pickle.dump(rfc, open(model_name, 'wb'))

In [37]:
# y_pred = rfc.predict(X_test_tfidf)

model_name = '4class.sav'

model = pickle.load(open(model_name, 'rb'))
y_pred = model.predict(X_test_tfidf)

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.663968107942349
Confusion Matrix: 
 [[ 601  563  226  234]
 [  48 6478  239  621]
 [  83  615 1394  465]
 [  63 2047  275 2353]]
Classification Report: 
               precision    recall  f1-score   support

  HelpfulNeg       0.76      0.37      0.50      1624
  HelpfulPos       0.67      0.88      0.76      7386
UnhelpfulNeg       0.65      0.55      0.59      2557
UnhelpfulPos       0.64      0.50      0.56      4738

    accuracy                           0.66     16305
   macro avg       0.68      0.57      0.60     16305
weighted avg       0.67      0.66      0.65     16305

Recall Score:  0.663968107942349


In [19]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # X_train, X_test, y_train, y_test = TTS(df["Text"], df["Helpfulness"], test_size=0.3, random_state=42)

# vectorizer = TfidfVectorizer(
#     min_df=0.0001,
#     # ngram_range=(1, 1), 
#     # stop_words='english'
# )
# vectorizer.fit(df["TextAdj"].values)

In [20]:
# for i in vectorizer.get_feature_names_out():
#     print(i)