In [37]:
import time
import pandas as pd
import numpy as np

In [3]:
pd.set_option('max_colwidth', None)

In [5]:
fakereviewsdf = pd.read_csv('fake_reviews_dataset.csv', names=['category','rating','label','text'])
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been


In [6]:
fakereviewsdf['label'].value_counts()

CG    20216
OR    20216
Name: label, dtype: int64

In [7]:
fakereviewsdf['text'] = fakereviewsdf['text'].str.replace('\n',' ')
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been


In [8]:
fakereviewsdf['label'] = np.where(fakereviewsdf['label'] == 'CG', 1 ,0)
fakereviewsdf['label'].value_counts()

1    20216
0    20216
Name: label, dtype: int64

Create Features from Punctuation Marks

In [16]:
def punctuation_to_features(df, column):
    df[column] = df[column].str.replace('!',' exclamation ')
    df[column] = df[column].str.replace('?',' question ')
    df[column] = df[column].str.replace('\'',' quotation ')
    df[column] = df[column].str.replace('\"',' quotation ')
    
    return df[column]

In [18]:
fakereviewsdf['text'] = punctuation_to_features(fakereviewsdf, 'text')
# print(punctuation_to_features(fakereviewsdf, 'text'))
fakereviewsdf.head()

  df[column] = df[column].str.replace('?',' question ')


Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,1,"Love this exclamation Well made, sturdy, and very comfortable. I love it exclamation Very pretty"
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I quotation ve had mine for a couple of years"
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and feel of this pillow.
3,Home_and_Kitchen_5,1.0,1,"Missing information on how to use it, but it is a great product for the price exclamation I"
4,Home_and_Kitchen_5,5.0,1,Very nice set. Good quality. We have had the set for two months now and have not been


Tokenize the data

    Next we need to take our text column, which is currently stored as a string, and turn it into a Python list of words using a process called tokenization. The NLTK package includes a handy function called word_tokenize() that can be used to perform this task. 

In [19]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/jasmeen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

In [22]:
fakereviewsdf['tokenized'] = fakereviewsdf.apply(lambda x : tokenize(x['text']), axis=1)
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text,tokenized
0,Home_and_Kitchen_5,5.0,1,"Love this exclamation Well made, sturdy, and very comfortable. I love it exclamation Very pretty","[Love, this, exclamation, Well, made, sturdy, and, very, comfortable, I, love, it, exclamation, Very, pretty]"
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I quotation ve had mine for a couple of years","[love, it, a, great, upgrade, from, the, original, I, quotation, ve, had, mine, for, a, couple, of, years]"
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and feel of this pillow.,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]"
3,Home_and_Kitchen_5,1.0,1,"Missing information on how to use it, but it is a great product for the price exclamation I","[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, exclamation, I]"
4,Home_and_Kitchen_5,5.0,1,Very nice set. Good quality. We have had the set for two months now and have not been,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]"


Stopword Removal

    Next we’ll use an NLP preprocessing technique called stopword removal. Stopword removal, as the name suggests removes “stop words”. These are basically words used so commonly that they’re essentially meaningless to most models, so removing them can improve model speed and, sometimes, accuracy, though rarely by very much.

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasmeen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
from nltk.corpus import  stopwords

def remove_stopwords(tokenized_column):
    stops = set(stopwords.words('english'))
    return [word for word in tokenized_column if not word in stops]

In [25]:
fakereviewsdf['stopwords_removed'] = fakereviewsdf.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text,tokenized,stopwords_removed
0,Home_and_Kitchen_5,5.0,1,"Love this exclamation Well made, sturdy, and very comfortable. I love it exclamation Very pretty","[Love, this, exclamation, Well, made, sturdy, and, very, comfortable, I, love, it, exclamation, Very, pretty]","[Love, exclamation, Well, made, sturdy, comfortable, I, love, exclamation, Very, pretty]"
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I quotation ve had mine for a couple of years","[love, it, a, great, upgrade, from, the, original, I, quotation, ve, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, quotation, mine, couple, years]"
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and feel of this pillow.,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,1,"Missing information on how to use it, but it is a great product for the price exclamation I","[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, exclamation, I]","[Missing, information, use, great, product, price, exclamation, I]"
4,Home_and_Kitchen_5,5.0,1,Very nice set. Good quality. We have had the set for two months now and have not been,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]","[Very, nice, set, Good, quality, We, set, two, months]"


Apply Porter stemming
    
    Porter Stemming is a technique that is similar to Lemmatization and converts each word to its root or stemmed form, so “comfortable” becomes “comfort”, “information” becomes “inform”, etc.

In [26]:
from nltk.stem.porter import PorterStemmer

def apply_stemming(tokenized_column):
    stemmer = PorterStemmer()
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [29]:
fakereviewsdf['porter_stemmed'] = fakereviewsdf.apply(lambda x  : apply_stemming(x['stopwords_removed']),axis=1)
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text,tokenized,stopwords_removed,porter_stemmed
0,Home_and_Kitchen_5,5.0,1,"Love this exclamation Well made, sturdy, and very comfortable. I love it exclamation Very pretty","[Love, this, exclamation, Well, made, sturdy, and, very, comfortable, I, love, it, exclamation, Very, pretty]","[Love, exclamation, Well, made, sturdy, comfortable, I, love, exclamation, Very, pretty]","[love, exclam, well, made, sturdi, comfort, i, love, exclam, veri, pretti]"
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I quotation ve had mine for a couple of years","[love, it, a, great, upgrade, from, the, original, I, quotation, ve, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, quotation, mine, couple, years]","[love, great, upgrad, origin, i, quotat, mine, coupl, year]"
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and feel of this pillow.,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,1,"Missing information on how to use it, but it is a great product for the price exclamation I","[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, exclamation, I]","[Missing, information, use, great, product, price, exclamation, I]","[miss, inform, use, great, product, price, exclam, i]"
4,Home_and_Kitchen_5,5.0,1,Very nice set. Good quality. We have had the set for two months now and have not been,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]","[Very, nice, set, Good, quality, We, set, two, months]","[veri, nice, set, good, qualiti, we, set, two, month]"


Rejoin words
    
    Finally, we need to take our porter_stemmed data that has been preprocessed and rejoin the words back into a string.

In [30]:
def rejoin_words(tokenized_columns):
    return (' '.join(tokenized_columns))

In [31]:
fakereviewsdf['all_text'] = fakereviewsdf.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)
fakereviewsdf.head()

Unnamed: 0,category,rating,label,text,tokenized,stopwords_removed,porter_stemmed,all_text
0,Home_and_Kitchen_5,5.0,1,"Love this exclamation Well made, sturdy, and very comfortable. I love it exclamation Very pretty","[Love, this, exclamation, Well, made, sturdy, and, very, comfortable, I, love, it, exclamation, Very, pretty]","[Love, exclamation, Well, made, sturdy, comfortable, I, love, exclamation, Very, pretty]","[love, exclam, well, made, sturdi, comfort, i, love, exclam, veri, pretti]",love exclam well made sturdi comfort i love exclam veri pretti
1,Home_and_Kitchen_5,5.0,1,"love it, a great upgrade from the original. I quotation ve had mine for a couple of years","[love, it, a, great, upgrade, from, the, original, I, quotation, ve, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, quotation, mine, couple, years]","[love, great, upgrad, origin, i, quotat, mine, coupl, year]",love great upgrad origin i quotat mine coupl year
2,Home_and_Kitchen_5,5.0,1,This pillow saved my back. I love the look and feel of this pillow.,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]",thi pillow save back i love look feel pillow
3,Home_and_Kitchen_5,1.0,1,"Missing information on how to use it, but it is a great product for the price exclamation I","[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, exclamation, I]","[Missing, information, use, great, product, price, exclamation, I]","[miss, inform, use, great, product, price, exclam, i]",miss inform use great product price exclam i
4,Home_and_Kitchen_5,5.0,1,Very nice set. Good quality. We have had the set for two months now and have not been,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]","[Very, nice, set, Good, quality, We, set, two, months]","[veri, nice, set, good, qualiti, we, set, two, month]",veri nice set good qualiti we set two month


        Create training and test data

In [32]:
X = fakereviewsdf['all_text']
Y = fakereviewsdf['label']

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1, shuffle=True)

    Run the model selection process

In [39]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

classifiers = {}

classifiers.update({'XGBClassifier': XGBClassifier(eval_metric='logloss',objective='binary:logistic')})
classifiers.update({'CatBoostClassifier': CatBoostClassifier(silent=True)})
classifiers.update({'MultinomialNB': MultinomialNB()})
classifiers.update({'LinearSVC': LinearSVC()})
classifiers.update({'RandomForestClassifier': RandomForestClassifier()})
classifiers.update({'AdaBoostClassifier': AdaBoostClassifier()})
classifiers.update({'BaggingClassifier': BaggingClassifier()})
classifiers.update({'DecisionTreeClassifier': DecisionTreeClassifier()})
classifiers.update({'ExtraTreeClassifier': ExtraTreeClassifier()})
classifiers.update({'KNeighborsClassifier': KNeighborsClassifier()})
classifiers.update({'RidgeClassifier': RidgeClassifier()})
classifiers.update({'SGDClassifier': SGDClassifier()})
classifiers.update({'BernoulliNB': BernoulliNB()})

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

fakereviewsdf_models = pd.DataFrame(columns=['model', 'run_time', 'roc_auc', 'roc_auc_std'])

for key in classifiers:
    start_time = time.time()
    pipeline = Pipeline([
        ('tfidf',TfidfVectorizer()),
        ('clf', classifiers[key])
    ])
    cv = cross_val_score(pipeline,X, Y, cv=5, scoring='roc_auc')

    row = {
        'model' : key,
        'run_time': format(round((time.time() - start_time)/60,2)),
        'roc_auc': cv.mean(),
        'roc_auc_std': cv.std(),
    }

    fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)

fakereviewsdf_models = fakereviewsdf_models.sort_values(by='roc_auc', ascending=False)

fakereviewsdf_models

  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)
  fakereviewsdf_models = fakereviewsdf_models.append(row, ignore_index=True)

Unnamed: 0,model,run_time,roc_auc,roc_auc_std
11,SGDClassifier,0.05,0.927391,0.007802
1,CatBoostClassifier,6.28,0.925181,0.009282
3,LinearSVC,0.05,0.924436,0.011311
10,RidgeClassifier,0.06,0.924433,0.012062
0,XGBClassifier,0.38,0.917171,0.009368
4,RandomForestClassifier,1.55,0.914678,0.012768
2,MultinomialNB,0.05,0.903128,0.019442
6,BaggingClassifier,3.2,0.858919,0.008341
5,AdaBoostClassifier,0.34,0.852309,0.008306
12,BernoulliNB,0.05,0.827642,0.020536


In [43]:
bundled_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()), 
    ("clf", SGDClassifier())
])

bundled_pipeline.fit(X_train, y_train)
y_hat = bundled_pipeline.predict(X_test)

y_hat

array([1, 0, 1, ..., 0, 0, 1])

In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

print('Accuracy:', accuracy_score(y_test, y_hat))
print('Precision:', precision_score(y_test, y_hat))
print('Recall:', recall_score(y_test, y_hat))
print('ROC/AUC:', roc_auc_score(y_test, y_hat))

Accuracy: 0.8754328112118714
Precision: 0.8974403320650294
Recall: 0.8496806942852464
ROC/AUC: 0.8756123876539963
