In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from os import listdir
from os.path import join
from sklearn.model_selection import train_test_split
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [6]:
# Mount Google Drive so the dataset CSV is reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the fake-review dataset into a DataFrame
df = pd.read_csv(
    '/content/drive/MyDrive/BT5151 + BT5153 /BT5153 - Fake Review Detection/fake reviews dataset.csv'
)

# Encode the string labels as integers ("CG" -> 1, "OR" -> 0);
# a label missing from the mapping raises KeyError
label_mapping = {"CG": 1, "OR": 0}
df['label'] = list(map(label_mapping.__getitem__, df['label']))

# Preview the first three rows
df.head(3)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,1,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and...


In [7]:
# Inputs are the raw review texts; targets are the integer-encoded labels
sentences = df['text_']
label = df['label']

# Stage 1: hold out 20% of the rows, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    sentences,
    label,
    test_size=0.2,
    stratify=label,
    random_state=42,
)

# Stage 2: cut the hold-out in half -> validation and test (roughly 10% of all rows each)
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.5,
    stratify=y_val,
    random_state=42,
)

In [8]:
# Fit the TF-IDF vectorizer on the training texts only, and transform them in the same pass
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)

# Validation and test texts go through the already-fitted vectorizer (no refit)
X_val_transformed, X_test_transformed = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)

In [15]:
# Evaluate a fitted classifier: accuracy, precision, recall, F1 and a classification report
def evaluate_model(x_test, y_true, model, modelname):
  """Print headline classification metrics for ``model`` on one data split.

  Predictions come from ``model.predict(x_test)`` and are compared with
  ``y_true``. Precision, recall and F1 are computed with ``average='binary'``,
  so they describe a single positive class (scikit-learn's default
  ``pos_label`` is 1). No ROC/AUC score is computed here.

  Args:
    x_test: Features passed unchanged to ``model.predict``.
    y_true: Ground-truth labels aligned with ``x_test``.
    model: Fitted estimator exposing ``predict``.
    modelname: Heading printed before the metrics.

  Returns:
    None. All results are written to stdout.
  """
  # Import only what is used. The previous import list also pulled in
  # confusion_matrix, roc_auc_score and RocCurveDisplay, none of which were
  # referenced; an unused name that is missing from the installed scikit-learn
  # would make the whole function fail with ImportError.
  from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

  y_pred = model.predict(x_test)
  print(modelname)

  print('Accuracy: {:.3f}'.format(accuracy_score(y_true, y_pred)))

  # Each score is computed right before it is printed, so earlier lines still
  # appear if a later scorer raises.
  binary_scorers = (
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1_score', f1_score),
  )
  for label, scorer in binary_scorers:
    print('{}: {:.3f}'.format(label, scorer(y_true, y_pred, average='binary')))

  banner = '**********************************************************'
  print(banner)
  print('*****************  Classification Report  ****************')
  print(banner)
  print(classification_report(y_true, y_pred))


In [16]:
# Train a logistic-regression classifier with default hyperparameters on the TF-IDF training matrix
modelname = 'Logistic Regression '
log = LogisticRegression().fit(X_train_transformed, y_train)

# Report the metrics on the validation split
evaluate_model(
    x_test=X_val_transformed,
    y_true=y_val,
    model=log,
    modelname='Logistic Regression - Validation',
)

Logistic Regression - Validation
Accuracy: 0.907
Precision: 0.915
Recall: 0.898
F1_score: 0.907
**********************************************************
*****************  Classification Report  ****************
**********************************************************
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2022
           1       0.92      0.90      0.91      2021

    accuracy                           0.91      4043
   macro avg       0.91      0.91      0.91      4043
weighted avg       0.91      0.91      0.91      4043



In [17]:
# Report the same metrics on the held-out test split, using the fitted `log` classifier
evaluate_model(
    x_test=X_test_transformed,
    y_true=y_test,
    model=log,
    modelname='Logistic Regression - Test',
)

Logistic Regression - Test
Accuracy: 0.911
Precision: 0.922
Recall: 0.899
F1_score: 0.910
**********************************************************
*****************  Classification Report  ****************
**********************************************************
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2022
           1       0.92      0.90      0.91      2022

    accuracy                           0.91      4044
   macro avg       0.91      0.91      0.91      4044
weighted avg       0.91      0.91      0.91      4044

