In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the review dataset from a CSV in the working directory.
# The file carries a saved index column, which pandas surfaces as 'Unnamed: 0'.
df = pd.read_csv('reviews.csv')

In [3]:
# Preview the raw frame: the review text lives in 'titletext', the target in 'label'.
df.head()

Unnamed: 0,titletext,label
0,pinnacle seems cushioning husband likes better...,0
1,great price product though sizes tend bigger b...,0
2,good fit even washing hot water force shrinkin...,0
3,great shoe ive nikes always pleased comfort pe...,0
4,looks bit nicer picture light way held box won...,0


In [None]:
# df = pd.concat([df_fashion, df_appliances, df_software], axis=0, ignore_index=True)

In [None]:
# df.head()

In [4]:
# Number of rows loaded from the CSV.
len(df.index)

7161

In [5]:
# Class balance of the target column (how many rows carry each label value).
df['label'].value_counts()

1    3581
0    3580
Name: label, dtype: int64

In [8]:
# !pip install -U nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
# Tokenizer setup: every token is a match of the regex below (a run of word characters).
import nltk
from nltk.tokenize import RegexpTokenizer

word_pattern = r'\w+'
tokenizer = RegexpTokenizer(word_pattern)

In [12]:
# Apply the tokenizer to every review, storing the resulting token list
# per row in a new 'text_tokens' column.
review_text = df['titletext']
df['text_tokens'] = review_text.apply(tokenizer.tokenize)

In [13]:
# Check that 'text_tokens' now holds a list of tokens for each review.
df.head()

Unnamed: 0,titletext,label,text_tokens
0,pinnacle seems cushioning husband likes better...,0,"[pinnacle, seems, cushioning, husband, likes, ..."
1,great price product though sizes tend bigger b...,0,"[great, price, product, though, sizes, tend, b..."
2,good fit even washing hot water force shrinkin...,0,"[good, fit, even, washing, hot, water, force, ..."
3,great shoe ive nikes always pleased comfort pe...,0,"[great, shoe, ive, nikes, always, pleased, com..."
4,looks bit nicer picture light way held box won...,0,"[looks, bit, nicer, picture, light, way, held,..."


In [14]:
# Lemmatizer setup: import the WordNet lemmatizer, fetch the corpora
# it is used with here, then build a single shared instance.
from nltk.stem import WordNetLemmatizer

for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [15]:
# Lemmatize each review's tokens and re-join them into one space-separated
# string per review; the result is a plain list with one entry per row of df.
#
# Iterating over the Series values directly replaces the original
# df['text_tokens'][i][j] lookups, which (a) were label-based and therefore
# only correct while the index is the default 0..n-1, and (b) re-fetched the
# column for every single token. ' '.join also drops the stray leading space
# that `word = word + ' ' + lemm_word` put at the front of every string.
lemmatize_words = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in df['text_tokens']
]

In [16]:
# Store the lemmatized strings in a new 'text_lemmatized' column
# (one string per row, in the same order as the rows of df).
df['text_lemmatized']=lemmatize_words

In [17]:
# Row count after preprocessing, for comparison with the count right after loading.
len(df.index)

7161

In [18]:
# Inspect the lemmatized text next to the original text and its tokens.
df.head()

Unnamed: 0,titletext,label,text_tokens,text_lemmatized
0,pinnacle seems cushioning husband likes better...,0,"[pinnacle, seems, cushioning, husband, likes, ...",pinnacle seems cushioning husband like better...
1,great price product though sizes tend bigger b...,0,"[great, price, product, though, sizes, tend, b...",great price product though size tend bigger b...
2,good fit even washing hot water force shrinkin...,0,"[good, fit, even, washing, hot, water, force, ...",good fit even washing hot water force shrinki...
3,great shoe ive nikes always pleased comfort pe...,0,"[great, shoe, ive, nikes, always, pleased, com...",great shoe ive nike always pleased comfort pe...
4,looks bit nicer picture light way held box won...,0,"[looks, bit, nicer, picture, light, way, held,...",look bit nicer picture light way held box won...


In [19]:
#imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [32]:
# Model input is the lemmatized review text; the target is the label column.
X, y = df['text_lemmatized'], df['label']

# Splitting the data into train and test: hold out 15% of the rows,
# stratified on the label, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Class balance of the training portion.
y_train.value_counts()

1    3043
0    3043
Name: label, dtype: int64

In [33]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [34]:
# Text-classification pipeline: raw counts -> TF-IDF weighting -> logistic regression.
# Step names are kept as-is because they identify the steps on the fitted pipeline.
text_clf_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', LogisticRegression()),
]
pipe = Pipeline(steps=text_clf_steps)

# Fit every step on the training split only; the fitted pipeline is the cell output.
pipe.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfid', TfidfTransformer()),
                ('model', LogisticRegression())])

In [35]:
# Predict labels for the held-out test texts with the fitted pipeline.
y_pred = pipe.predict(X_test)

In [36]:
# `metrics` is also used by the evaluation cells that follow.
from sklearn import metrics
# Fraction of test reviews whose predicted label equals the true label.
metrics.accuracy_score(y_test, y_pred)

0.9134883720930232

In [37]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels (each row sums to that class's support in the test set).
metrics.confusion_matrix(y_test, y_pred)

array([[488,  49],
       [ 44, 494]])

In [40]:
# Per-class precision/recall/F1 on the test split.
# target_names are matched to the labels in sorted order, so this labels
# 0 as 'Real reviews' and 1 as 'Fake reviews'.
# NOTE(review): assumes label 0 = real and 1 = fake — confirm against the dataset.
print(metrics.classification_report(y_test, y_pred, target_names=['Real reviews','Fake reviews']))

              precision    recall  f1-score   support

Real reviews       0.92      0.91      0.91       537
Fake reviews       0.91      0.92      0.91       538

    accuracy                           0.91      1075
   macro avg       0.91      0.91      0.91      1075
weighted avg       0.91      0.91      0.91      1075

