In [1]:
import numpy as np
import pandas as pd
import spacy
import string
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
import os
import re
from sklearn.metrics import accuracy_score
# /kaggle/input/amazon_reviews.txt

In [2]:
# Label key, as recorded by the notebook author:
#   __label1__ -> real review
#   __label2__ -> fake review
# NOTE(review): confirm this mapping against the dataset's own documentation.
RAW_REVIEWS_PATH = "/kaggle/input/amazon_reviews.txt"

# Tab-separated file; the first column becomes the index.
data_df = pd.read_csv(RAW_REVIEWS_PATH, sep="\t", index_col=0)
print(data_df.shape)
print(data_df["LABEL"].value_counts())
data_df.head()

(21000, 8)
__label2__    10500
__label1__    10500
Name: LABEL, dtype: int64


Unnamed: 0_level_0,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [3]:
# Raw labels look like "__label1__" / "__label2__": a run of digits wrapped in
# the fixed "__label" prefix and "__" suffix.
_LABEL_PATTERN = re.compile(r"__label(\d+)__")


def get_numeric_label(label):
    """Return the integer embedded in a raw label such as ``"__label1__"``.

    The label is matched against the full ``__label<digits>__`` shape instead
    of being indexed by position, so a malformed or multi-digit label can no
    longer yield a silently wrong digit.

    Parameters
    ----------
    label : str
        Raw label value; surrounding whitespace is ignored.

    Returns
    -------
    int
        The numeric part of the label (``"__label2__"`` -> ``2``).

    Raises
    ------
    ValueError
        If ``label`` does not have the ``__label<digits>__`` shape.
    """
    match = _LABEL_PATTERN.fullmatch(str(label).strip())
    if match is None:
        raise ValueError(f"Unrecognised label format: {label!r}")
    return int(match.group(1))

# Integer code for each PRODUCT_CATEGORY value (codes run 0..29).
PRODUCT_CATEGORY_TO_INTEGER = {
    'Kitchen': 0,
    'Home': 1,
    'Grocery': 2,
    'Sports': 3,
    'Jewelry': 4,
    'Home Entertainment': 5,
    'Video DVD': 6,
    'Books': 7,
    'Shoes': 8,
    'PC': 9,
    'Furniture': 10,
    'Video Games': 11,
    'Camera': 12,
    'Watches': 13,
    'Electronics': 14,
    'Office Products': 15,
    'Health & Personal Care': 16,
    'Pet Products': 17,
    'Baby': 18,
    'Outdoors': 19,
    'Toys': 20,
    'Musical Instruments': 21,
    'Wireless': 22,
    'Luggage': 23,
    'Apparel': 24,
    'Lawn and Garden': 25,
    'Automotive': 26,
    'Tools': 27,
    'Beauty': 28,
    'Home Improvement': 29,
}

In [4]:
# Shared spaCy pipeline (small English model), loaded once for the notebook.
nlp = spacy.load("en_core_web_sm")

In [5]:
# (pattern, replacement) pairs applied, in this order, to lower-cased text.
_CONTRACTION_EXPANSIONS = (
    (r"that's", "that is"),
    (r"there's", "there is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"it's", "it is"),
    (r"who's", "who is"),
    (r"i'm", "i am"),
    (r"she's", "she is"),
    (r"he's", "he is"),
    (r"they're", "they are"),
    (r"who're", "who are"),
    (r"ain't", "am not"),
    (r"wouldn't", "would not"),
    (r"shouldn't", "should not"),
    (r"can't", "can not"),
    (r"couldn't", "could not"),
    (r"won't", "will not"),
)


def _normalize_review_text(text):
    """Lower-case ``text``, expand contractions and strip it down to words."""
    text = str(text).lower()
    for pattern, expansion in _CONTRACTION_EXPANSIONS:
        text = re.sub(pattern, expansion, text)

    text = re.sub(r"\W", " ", text)  # punctuation / symbols / whitespace -> space
    text = re.sub(r"\d", " ", text)  # digits -> space
    # Drop stand-alone single letters. A word-boundary match does not consume
    # the neighbouring spaces, so runs such as "a b c d e" are removed in full
    # (whitespace-delimited patterns skip every other letter of such a run).
    text = re.sub(r"\b[a-z]\b", " ", text)
    return re.sub(r"\s+", " ", text)  # collapse the gaps left above


def text_cleaning(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps: lower-case, expand common contractions, blank out non-word
    characters, digits and stand-alone single letters, lemmatize with the
    module-level spaCy pipeline ``nlp``, then drop stop words, punctuation
    tokens and empty tokens.

    Parameters
    ----------
    text : object
        Review text. Non-string values are converted with ``str``; ``None``
        and float NaN are treated as "no text".

    Returns
    -------
    list of str
        The surviving tokens in document order; ``[]`` for missing input.
    """
    # A missing value would otherwise be stringified and come out as the
    # bogus token "nan" (or "none").
    if text is None or (isinstance(text, float) and text != text):
        return []

    doc = nlp(_normalize_review_text(text))

    tokens = []
    for token in doc:
        # "-PRON-" is the placeholder lemma spaCy 2.x assigns to pronouns; keep
        # the lower-cased surface form for those instead of the placeholder.
        if token.lemma_ != "-PRON-":
            tokens.append(token.lemma_.lower().strip())
        else:
            tokens.append(token.lower_)

    # Per-character set membership (not a substring test against the
    # punctuation string); empty tokens, e.g. from whitespace, are dropped
    # explicitly.
    punctuation = set(string.punctuation)
    return [
        token
        for token in tokens
        if token and token not in STOP_WORDS and token not in punctuation
    ]

In [6]:
# Replace the three categorical columns with integer codes.
label_codes = data_df["LABEL"].apply(get_numeric_label)
category_codes = data_df["PRODUCT_CATEGORY"].apply(
    lambda category: PRODUCT_CATEGORY_TO_INTEGER[category]
)
verified_flags = data_df["VERIFIED_PURCHASE"].apply(
    lambda flag: int(flag == "Y")  # "Y" -> 1, anything else -> 0
)

data_df["LABEL"] = label_codes
data_df["PRODUCT_CATEGORY"] = category_codes
data_df["VERIFIED_PURCHASE"] = verified_flags

In [7]:
%%time
data_df["REVIEW_TEXT"] = data_df['REVIEW_TITLE']+" "+data_df["REVIEW_TEXT"] 
data_df.drop("REVIEW_TITLE",inplace=True,axis=1)
data_df["REVIEW_TEXT"] = data_df['REVIEW_TEXT'].apply(lambda x:text_cleaning(x))

CPU times: user 6min 45s, sys: 2.33 s, total: 6min 48s
Wall time: 6min 48s


In [8]:
# Remove the product title and product ID columns from the frame, one at a time.
for column_name in ("PRODUCT_TITLE", "PRODUCT_ID"):
    data_df.drop(columns=column_name, inplace=True)

In [9]:
# Save the processed frame as a tab-separated file in the working directory,
# without the index. REVIEW_TEXT holds the output of text_cleaning (token
# lists), which a text file can only store as their string form.
data_df.to_csv("data_df.csv", index=False, sep="\t")

In [10]:
# Reload the processed frame from the same relative path it was written to,
# so the round trip does not depend on the working directory being
# /kaggle/working.
# Note: REVIEW_TEXT comes back as the string form of each token list (see the
# preview below), not as a list; use ast.literal_eval if lists are needed.
data_df2 = pd.read_csv("data_df.csv", sep="\t")
data_df2.head(2)

Unnamed: 0,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,REVIEW_TEXT
0,1,4,0,9,"['useful', 'think', 'product', 'save', 'day', ..."
1,1,4,1,22,"['new', 'era', 'battery', 'lithium', 'battery'..."
