# Identifying Fraudulent Customer Reviews on E-Commerce Platforms Using Machine Learning (Spam Detection)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


# Import all Neccessary Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE # Used for oversampling an imbalanced class
from textblob import TextBlob

## Load the dataset from drive

In [None]:
df = pd.read_csv('amazon_subset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class
0,0,Best phone case ever . Everywhere I go I get a...,5,A++++,1
1,1,ITEM NOT SENT from Blue Top Company in Hong Ko...,1,ITEM NOT SENT!!,0
2,2,Saw this same case at a theme park store for 2...,5,Great product,1
3,3,case fits perfectly and I always gets complime...,5,Perfect,1
4,4,I got this for my 14 year old sister. She lov...,4,Cool purchase.,1


The data was already preprocessed in the last notebook. Here we will just continue with implementing the machine learning models. But before that let's explore the class balance and translate text into machine language using Tfidf Vectorizer.

In [None]:
class_count = df['class'].value_counts()
class_count

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,12590
0,7410


In [None]:
class_count_percentage = df['class'].value_counts(normalize=True)
print(f"The percentage of class 0 is {class_count_percentage[0]*100}%")
print(f"The percentage of class 1 is {class_count_percentage[1]*100: .2f}%")


The percentage of class 0 is 37.05%
The percentage of class 1 is  62.95%


We have an imbalanced class, later we will handle it with some techniques.

In [None]:
df['sentiment_score'] = df['Review_Text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()


Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class,sentiment_score
0,0,Best phone case ever . Everywhere I go I get a...,5,A++++,1,1.0
1,1,ITEM NOT SENT from Blue Top Company in Hong Ko...,1,ITEM NOT SENT!!,0,0.041667
2,2,Saw this same case at a theme park store for 2...,5,Great product,1,0.57
3,3,case fits perfectly and I always gets complime...,5,Perfect,1,1.0
4,4,I got this for my 14 year old sister. She lov...,4,Cool purchase.,1,0.15


In [None]:
df[(df['Star_Rating'] > 4) & (df['sentiment_score'] < 0)]

Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class,sentiment_score
49,49,Often I need to have to revamp my GPS. This li...,5,Super charger.,1,-0.187500
68,68,Has a flat base that covers the analogue stick...,5,Better than original,1,-0.140000
243,243,"Not sure what everyone is doing, but my phone ...",5,Works Fine,1,-0.058333
291,291,I have a hand held manual that is very similar...,5,Works,1,-0.090152
362,362,I got this as a gift from my uncle for my birt...,5,Great!,1,-0.062500
...,...,...,...,...,...,...
19834,19834,My daughter talked me into buying this phone s...,5,Outstanding,1,-0.143019
19887,19887,This battery goes well with my LG cell phone. ...,5,Nice battery. Works well.,1,-0.100000
19911,19911,Man this phone gets all the stares why 1. the ...,5,The Tightest Phone Out There,1,-0.039458
19925,19925,I recently made the upgrade from the i730 to t...,5,"A Bit Expensive, But Thus Far, Worth It",1,-0.022445


In [None]:
df.at[362, 'Review_Text']

'I got this as a gift from my uncle for my birthday. It keeps me charged up all day long!'

I have used TextBlob to find the sentiment scores for each reviews. But seems like text blob cannot find the best sentiment score for the reviews. the review in the row at index 362 looks a positive sentence but text blob gave it a negative score of -0.62. Let's use another module to get the best sentiment scores. Let's use VADER.

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m81.9/126.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [None]:
df['Sentiment_Vader'] = df['Review_Text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
df.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class,sentiment_score,Sentiment_Vader
0,0,Best phone case ever . Everywhere I go I get a...,5,A++++,1,1.0,0.9136
1,1,ITEM NOT SENT from Blue Top Company in Hong Ko...,1,ITEM NOT SENT!!,0,0.041667,-0.418
2,2,Saw this same case at a theme park store for 2...,5,Great product,1,0.57,0.807
3,3,case fits perfectly and I always gets complime...,5,Perfect,1,1.0,0.891
4,4,I got this for my 14 year old sister. She lov...,4,Cool purchase.,1,0.15,0.7997


In [None]:
df.at[362, 'Sentiment_Vader']

np.float64(0.3382)

In [None]:
df[(df['Star_Rating'] > 4) & (df['Sentiment_Vader'] < -0.2)]

Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class,sentiment_score,Sentiment_Vader
381,381,Double my power as promised. The battery on th...,5,The S4 is garbage without a PowerBear,1,0.125000,-0.2638
439,439,I was pleased to see the &#34;Nintendo Game Bo...,5,Good Silicone Case,1,0.167308,-0.5789
521,521,Very convenient gadget for emergency use.Not a...,5,Car Charger,1,0.200000,-0.2023
598,598,Car charger for convenient because if you leav...,5,comfortable charger,1,0.000000,-0.4215
602,602,cheap and works. what more can u ask for. no...,5,cheap and works,1,0.104000,-0.2411
...,...,...,...,...,...,...,...
19896,19896,Keeps my HP 3715 running for days. I generally...,5,Extended Battery,1,0.121291,-0.6206
19900,19900,"before i review this, i just want to say, i ha...",5,heres another review,1,0.095918,-0.9387
19911,19911,Man this phone gets all the stares why 1. the ...,5,The Tightest Phone Out There,1,-0.039458,-0.4317
19925,19925,I recently made the upgrade from the i730 to t...,5,"A Bit Expensive, But Thus Far, Worth It",1,-0.022445,-0.2509


In [None]:
df.at[19943, 'Review_Text']

'This holster holds the V220 securely in place.  The only problem might be that it is hard to pull it out of the holster without flipping it open and thus answering the phone.'

# Text preprocessing and vectorization

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
  text = re.sub(r'[^a-zA-Z]', ' ', str(text))
  text = text.lower()
  words = text.split()
  words = [stemmer.stem(word) for word in words if word not in stop_words]
  return ' '.join(words)

In [None]:
df['cleaned_text'] = df['Review_Text'].apply(preprocess)


In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,Star_Rating,Review_Summary,class,sentiment_score,Sentiment_Vader,cleaned_text
0,0,Best phone case ever . Everywhere I go I get a...,5,A++++,1,1.0,0.9136,best phone case ever everywher go get ton comp...
1,1,ITEM NOT SENT from Blue Top Company in Hong Ko...,1,ITEM NOT SENT!!,0,0.041667,-0.418,item sent blue top compani hong kong two month...
2,2,Saw this same case at a theme park store for 2...,5,Great product,1,0.57,0.807,saw case theme park store dollar good qualiti ...
3,3,case fits perfectly and I always gets complime...,5,Perfect,1,1.0,0.891,case fit perfectli alway get compliment crack ...
4,4,I got this for my 14 year old sister. She lov...,4,Cool purchase.,1,0.15,0.7997,got year old sister love realli realli complai...


# Apply Tfidf Vectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X = tfidf_vectorizer.fit_transform(df['cleaned_text']).toarray()
y = np.array(df['class'])


# Oversampling Using SMOTE technique

In [None]:
# Create a smote object
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [None]:
type(X)

numpy.ndarray

In [None]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
X.shape

(25180, 3000)

In [None]:
y.shape

(25180,)

# Now split data from training and testing

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Create logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


In [None]:
# Create SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [None]:
print("Logistic regression results.")
print(classification_report(y_test, lr_model.predict(X_test)))

Logistic regression results.
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2518
           1       0.85      0.85      0.85      2518

    accuracy                           0.85      5036
   macro avg       0.85      0.85      0.85      5036
weighted avg       0.85      0.85      0.85      5036



In [None]:
print("SVM results.")
print(classification_report(y_test, svm_model.predict(X_test)))

SVM results.
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      2518
           1       0.87      0.91      0.89      2518

    accuracy                           0.89      5036
   macro avg       0.89      0.89      0.89      5036
weighted avg       0.89      0.89      0.89      5036

