In [2]:
import re

import nltk
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect

This notebook assumes it is run from the `scripts` directory.

In [7]:
# Load the raw review dataset; the path is relative to the directory the
# notebook is run from.
amazon_initial = pd.read_csv(filepath_or_buffer='../data/amazon_initial.csv')

In this project we only work with the review texts and their associated veracity labels.<br />
Dataset source: https://www.kaggle.com/lievgarcia/amazon-reviews

In [3]:
# Keep only the review text and its label, expose the label under the name
# VERACITY, and translate the raw label codes into readable values.
amazon = (
    amazon_initial[['REVIEW_TEXT', 'LABEL']]
    .rename(columns={'LABEL': 'VERACITY'})
    .assign(
        VERACITY=lambda frame: frame['VERACITY'].map(
            {'__label1__': 'Fake', '__label2__': 'Real'}
        )
    )
)
amazon

Unnamed: 0,REVIEW_TEXT,VERACITY
0,"When least you think so, this product will sav...",Fake
1,Lithium batteries are something new introduced...,Fake
2,I purchased this swing for my baby. She is 6 m...,Fake
3,I was looking for an inexpensive desk calcolat...,Fake
4,I only use it twice a week and the results are...,Fake
...,...,...
20995,"I bought these for work. I have high arches, ...",Real
20996,Crocs are one of only two brands of shoes that...,Real
20997,I love moccasins This fit like it was custom ...,Real
20998,I wish these were a little more durable. I got...,Real


First, remove all non-English entries.

In [4]:
# langdetect is non-deterministic by default, so the same text can be
# classified differently between runs. Fixing the seed makes the set of rows
# kept below reproducible.
DetectorFactory.seed = 0

# Detect the language of every review and keep only the rows detected as
# English ('en'), renumbering the index from 0 afterwards.
is_english = amazon['REVIEW_TEXT'].apply(detect) == 'en'
amazon = amazon.loc[is_english].reset_index(drop=True)

In [5]:
def clean_text(text):
    """Return text with common contractions expanded and unusual characters removed.

    Processing steps, in order:

    1. Expand common English contractions ("can't", "won't", "n't", "'ve",
       "'ll", and "I'm" / "im" written as a whole word) and drop the "'s"
       suffix.
    2. Replace the literal ``<br />`` HTML tag with a space.
    3. Replace every character that is not an ASCII letter, a digit or one
       of ``. , ! ? : ( ) ;`` with a space.

    Several replacements carry padding spaces, so the result may contain
    runs of consecutive spaces.

    Args:
        text (str): Raw review text.

    Returns:
        str: The cleaned text.
    """

    # Process common contractions. The irregular forms are handled before the
    # generic "n't" rule, which would otherwise yield "ca not" / "wo not".
    text = re.sub(r"can't", 'cannot', text)
    text = re.sub(r"won't", 'will not', text)
    text = re.sub(r"\'ve", ' have ', text)
    text = re.sub(r"n't", ' not ', text)
    # Only rewrite "im" / "i'm" (either capitalisation of the "i") when it is
    # a whole word; without the word boundaries, words that merely contain
    # "im" (e.g. "time", "simple") would be corrupted.
    text = re.sub(r"\b[iI]'?m\b", 'I am ', text)
    text = re.sub(r"\'ll", ' will ', text)
    text = re.sub(r"\'s", ' ', text)

    # Remove <br> HTML tag.
    text = re.sub(r"<br />", ' ', text)

    # Replace everything except letters, digits and standard punctuation
    # with a space (digits are intentionally kept).
    text = re.sub(r'[^a-zA-Z0-9\.\,\!\?\:\)\(\;]', ' ', text)

    return text

In [6]:
# Run every review through clean_text and store the result back in the
# REVIEW_TEXT column.
cleaned_reviews = amazon['REVIEW_TEXT'].apply(clean_text)
amazon['REVIEW_TEXT'] = cleaned_reviews
amazon

Unnamed: 0,REVIEW_TEXT,VERACITY
0,"When least you think so, this product will sav...",Fake
1,Lithium batteries are something new introduced...,Fake
2,I purchased this swing for my baby. She is 6 m...,Fake
3,I was looking for an inexpensive desk calcolat...,Fake
4,I only use it twice a week and the results are...,Fake
...,...,...
20969,"I bought these for work. I have high arches, ...",Real
20970,Crocs are one of only two brands of shoes that...,Real
20971,I love moccasins This fit like it was custom ...,Real
20972,I wish these were a little more durable. I got...,Real


In [8]:
# Write the cleaned dataset next to the raw data. The notebook is run from the
# 'scripts' directory and reads its input from '../data', so the output path
# must be relative to the same location.
amazon.to_csv('../data/preprocessed/amazon.csv', index=False)