# Detection of Fraudulent Reviews: Modeling and Evaluation

# Loading libraries

In [1]:
import os

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Reading datasets

## Train dataset

In [2]:
# Load the pre-processed training split (feature matrix and labels) from CSV.
#
# The data directory is taken from the FAKE_REVIEWS_DATA_DIR environment variable so the
# notebook is not tied to a single machine; when the variable is unset it falls back to the
# original location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'FAKE_REVIEWS_DATA_DIR',
    '/Users/LeeShan/Fake-Reviews-Detection/ShanShan_notebooks',
)

# Paths remain plain strings, as before, in case other cells reuse them.
X_train_merged_path = os.path.join(DATA_DIR, 'X_train_merged.csv')
y_train_merged_path = os.path.join(DATA_DIR, 'y_train_merged.csv')

# Read the CSV files
X_train_merged = pd.read_csv(X_train_merged_path)
y_train_merged = pd.read_csv(y_train_merged_path)

# Features and labels live in separate files, so fail fast if they are not row-aligned.
assert len(X_train_merged) == len(y_train_merged), (
    f'X_train_merged has {len(X_train_merged)} rows but '
    f'y_train_merged has {len(y_train_merged)}'
)

Information

In [3]:
# Print a summary of the training feature frame: index range, column count,
# dtype breakdown and memory usage.
X_train_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542931 entries, 0 to 542930
Columns: 114 entries, word_count to year
dtypes: float64(112), int64(2)
memory usage: 472.2 MB


In [4]:
# Print a summary of the training label frame: per-column non-null count, dtype
# and memory usage.
y_train_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542931 entries, 0 to 542930
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   label   542931 non-null  int64
dtypes: int64(1)
memory usage: 4.1 MB


Shape

In [5]:
# (rows, columns) of the training feature frame.
X_train_merged.shape

(542931, 114)

In [6]:
# (rows, columns) of the training label frame; the row count should match the features.
y_train_merged.shape

(542931, 1)

Columns

In [7]:
# Column names of the training feature frame.
X_train_merged.columns

Index(['word_count', 'avg_word_length', 'avg_sentence_length',
       'uppercase_char_count', 'rating', 'Sentiment_Neutral',
       'Sentiment_Positive', 'category_Clothing_Shoes_and_Jewelry_5',
       'category_Electronics_5', 'category_Food_5',
       ...
       'wa', 'wait', 'want', 'wanted', 'way', 'well', 'without', 'worth',
       'would', 'year'],
      dtype='object', length=114)

In [8]:
# Column names of the training label frame.
y_train_merged.columns

Index(['label'], dtype='object')

Structure

In [9]:
# Preview the first five rows of the training features
X_train_merged.head(n=5)

Unnamed: 0,word_count,avg_word_length,avg_sentence_length,uppercase_char_count,rating,Sentiment_Neutral,Sentiment_Positive,category_Clothing_Shoes_and_Jewelry_5,category_Electronics_5,category_Food_5,...,wa,wait,want,wanted,way,well,without,worth,would,year
0,4,7.0,4.0,1,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,159,4.226415,17.666667,17,5.0,0.0,1.0,0.0,0.0,1.0,...,0.163465,0.0,0.0,0.0,0.0,0.104663,0.0,0.0,0.175222,0.0
2,56,4.125,18.666667,9,4.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,171,3.97076,9.5,24,5.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.326243,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,291,4.058419,13.857143,24,5.0,0.0,1.0,0.0,0.0,1.0,...,0.045875,0.0,0.0,0.0,0.0,0.088118,0.0,0.0,0.0,0.0


In [10]:
# Preview the first five training labels
y_train_merged.head(n=5)

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


## Test dataset

In [11]:
# Load the pre-processed test split (feature matrix and labels) from CSV.
#
# The data directory is taken from the FAKE_REVIEWS_DATA_DIR environment variable so the
# notebook is not tied to a single machine; when the variable is unset it falls back to the
# original location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'FAKE_REVIEWS_DATA_DIR',
    '/Users/LeeShan/Fake-Reviews-Detection/ShanShan_notebooks',
)

# Paths remain plain strings, as before, in case other cells reuse them.
X_test_merged_path = os.path.join(DATA_DIR, 'X_test_merged.csv')
y_test_merged_path = os.path.join(DATA_DIR, 'y_test_merged.csv')

# Read the CSV files
X_test_merged = pd.read_csv(X_test_merged_path)
y_test_merged = pd.read_csv(y_test_merged_path)

# Features and labels live in separate files, so fail fast if they are not row-aligned.
assert len(X_test_merged) == len(y_test_merged), (
    f'X_test_merged has {len(X_test_merged)} rows but '
    f'y_test_merged has {len(y_test_merged)}'
)

Information

In [12]:
# Print a summary of the test feature frame: index range, column count,
# dtype breakdown and memory usage.
X_test_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79124 entries, 0 to 79123
Columns: 114 entries, word_count to year
dtypes: float64(112), int64(2)
memory usage: 68.8 MB


In [13]:
# Print a summary of the test label frame: per-column non-null count, dtype
# and memory usage.
y_test_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79124 entries, 0 to 79123
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   label   79124 non-null  int64
dtypes: int64(1)
memory usage: 618.3 KB


Shape

In [14]:
# (rows, columns) of the test feature frame.
X_test_merged.shape

(79124, 114)

In [15]:
# (rows, columns) of the test label frame; the row count should match the features.
y_test_merged.shape

(79124, 1)

Columns

In [16]:
# Column names of the test feature frame.
X_test_merged.columns

Index(['word_count', 'avg_word_length', 'avg_sentence_length',
       'uppercase_char_count', 'rating', 'Sentiment_Neutral',
       'Sentiment_Positive', 'category_Clothing_Shoes_and_Jewelry_5',
       'category_Electronics_5', 'category_Food_5',
       ...
       'wa', 'wait', 'want', 'wanted', 'way', 'well', 'without', 'worth',
       'would', 'year'],
      dtype='object', length=114)

In [17]:
# Column names of the test label frame.
y_test_merged.columns

Index(['label'], dtype='object')

Structure

In [18]:
# Preview the first five rows of the test features
X_test_merged.head(n=5)

Unnamed: 0,word_count,avg_word_length,avg_sentence_length,uppercase_char_count,rating,Sentiment_Neutral,Sentiment_Positive,category_Clothing_Shoes_and_Jewelry_5,category_Electronics_5,category_Food_5,...,wa,wait,want,wanted,way,well,without,worth,would,year
0,161,4.279503,14.636364,16,4.0,0.0,1.0,0.0,0.0,1.0,...,0.055828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38,4.394737,12.666667,4,5.0,0.0,1.0,0.0,0.0,1.0,...,0.140718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,93,3.892473,13.285714,27,5.0,0.0,1.0,0.0,0.0,1.0,...,0.255208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229119
3,61,4.147541,7.625,11,4.0,0.0,1.0,0.0,0.0,1.0,...,0.071309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114656,0.0
4,68,3.897059,11.333333,6,4.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157213,0.0


In [19]:
# Preview the first five test labels
y_test_merged.head(n=5)

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


# Check for missing values

## Train dataset

In [20]:
# Count missing entries per column in the training features
X_train_merged.isna().sum()

word_count              0
avg_word_length         0
avg_sentence_length     0
uppercase_char_count    0
rating                  0
                       ..
well                    0
without                 0
worth                   0
would                   0
year                    0
Length: 114, dtype: int64

In [21]:
# Count missing entries in the training labels
y_train_merged.isna().sum()

label    0
dtype: int64

## Test dataset

In [22]:
# Count missing entries per column in the test features
X_test_merged.isna().sum()

word_count              0
avg_word_length         0
avg_sentence_length     0
uppercase_char_count    0
rating                  0
                       ..
well                    0
without                 0
worth                   0
would                   0
year                    0
Length: 114, dtype: int64

In [23]:
# Count missing entries in the test labels
y_test_merged.isna().sum()

label    0
dtype: int64

# Modeling

## Naive Bayes Model

## Logistic Regression

## AdaBoost Model

## Gradient Boosting

## XGBoost

## Random Forest

In [None]:
# TODO: planned experiments, grouped by text representation

# TF-IDF: Naive Bayes, Logistic Regression, Gradient Boosting, AdaBoost, XGBoost, Random Forest

# word2vec: Logistic Regression, Gradient Boosting, AdaBoost, XGBoost, Random Forest

# Tokenization: BERT, RoBERTa, LSTM

# Fine-tuning: RandomizedSearchCV


# TF-IDF, word2vec