In [5]:
import pandas as pd
import numpy as np

In [6]:
dataset = pd.read_csv('Data//external//fake_reviews_dataset.csv')

In [7]:
# Shape of dataset
dataset.shape

(40526, 4)

In [8]:
# Checking if any null values
dataset.isna().sum()

Unnamed: 0,0
category,0
rating,0
text,0
label,0


In [9]:
# checking for duplicates
dataset.duplicated(subset = ['text' , 'label']).sum()

33

In [10]:
# dropping duplicates
dataset.drop_duplicates(inplace = True)

In [11]:
# Checking the distribution of label
dataset['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,20270
0,20231


In [12]:
# cheking the head
dataset.head()

Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen,5.0,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen,1.0,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen,5.0,Very nice set. Good quality. We have had the s...,1


In [13]:
# combining the columns of dataset
dataset['text'] = dataset['category'] + dataset['text']

In [14]:
# keeping only neccessary columns
dataset = dataset[['text' , 'label']]

In [15]:
# Shuffling all the dataset
dataset = dataset.sample(dataset.shape[0])

In [16]:
# Cleaning the data
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
import re
import emoji

stemmer = PorterStemmer()


contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [19]:
def clean(x):
    x = x.lower()
    text_contract = []
    for i in x.split(" "):
        if i in contractions:
            text_contract.append(contractions[i])
        else:
            text_contract.append(i)
    string = " ".join(text_contract)
    string = re.sub('http://\S+|https://\S+', '', string)
    string = emoji.demojize(string)
    string = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', string)
    string = re.sub("[^a-zA-Z0-9]", " ", string)
    string = [stemmer.stem(word) for word in string.split(" ") if word not in stopwords.words('english')]
    string = [word for word in string if len(word) >= 3]


    return " ".join(string)



dataset.text = dataset.text.apply(clean)

In [21]:
# how cleaned data looks like
dataset.head()

Unnamed: 0,text,label
12588,movi tvmi mom like actor play detect like acto...,1
38482,cloth shoe jewelrygot requir part bridesmaid o...,0
25453,kindl storethough come expect learn new thing ...,0
34402,toy gamesi rate low inexpens toy also babi one...,1
39952,cloth shoe jewelri div video block r2boe0vzd5o...,1


In [26]:
# Tokenizing the data
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

xtrain , xtest , ytrain , ytest = train_test_split(dataset['text'] , dataset['label'] , test_size = 0.25 , random_state = 42)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(xtrain)
xtrain_t = tokenizer.texts_to_sequences(xtrain)
xtest_t = tokenizer.texts_to_sequences(xtest)

vocab = len(tokenizer.word_index) + 1

max_length = max([len(i) for i in xtrain])

xtrain_p = pad_sequences(xtrain_t , maxlen = max_length  , padding = "post")
xtest_p = pad_sequences(xtest_t , maxlen = max_length , padding = "post")

In [30]:
# Making the model to train the model
from tensorflow.keras.layers import LSTM , Dense , Embedding , GlobalAveragePooling1D
from tensorflow.keras import Sequential

model = Sequential()
model.add(Embedding(output_dim = 100 , input_dim = vocab))
model.add(LSTM(100 , return_sequences = True))
model.add(LSTM(50 , return_sequences= True))
model.add(GlobalAveragePooling1D())
model.add(Dense(100 , activation = 'relu'))
model.add(Dense(1 , activation = 'sigmoid'))

model.summary()

model.compile(loss = "binary_crossentropy" , optimizer = "Adam" , metrics = ['accuracy'])

In [31]:
history = model.fit(xtrain_p , ytrain , epochs = 10 , validation_data = (xtest_p , ytest))

Epoch 1/10
[1m 62/950[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m50:22[0m 3s/step - accuracy: 0.5055 - loss: 0.6947

KeyboardInterrupt: 

In [37]:
# Remember in data (1) means a fake review and (0) means real review

review ='''I'm writing the review after 2 months of my regular office usage and it has exceeded my expectations in every aspect. Here's a breakdown of why I believe it's an excellent choice:

1. Efficient Connectivity: The mouse boasts a reliable wireless connection, ensuring seamless performance without any lag. It's a joy to use, especially in fast-paced tasks or gaming scenarios.

2. Affordable Excellence: One of the standout features is its affordability without compromising quality. The Amazon Basics Wireless Optical Mouse offers top-notch performance at a price that doesn't break the bank, making it a budget-friendly yet high-quality option.

3. Compact and Handy: The mouse's compact design makes it incredibly easy to carry around, fitting comfortably in my hand. It's a perfect companion for those on the go, without sacrificing functionality'''

# testing phase

cleaned_text = clean(review)

tokenized = tokenizer.texts_to_sequences([cleaned_text])

padded = pad_sequences(tokenized , maxlen = max_length , padding = "post")

model.predict([padded])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step


array([[0.44666874]], dtype=float32)

In [38]:
# Like we have to to predictions with our model. But now our model is not fully trained.
# After training the model we will same approach to predict the reviews whether they are fake or real.