Import Libraries

In [158]:
import nltk
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer data
# used by word_tokenize and the English stop-word corpus.
# Newer NLTK releases load the tokenizer from 'punkt_tab' while older ones use
# the 'punkt' package, so both are requested to keep a fresh-kernel run working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [159]:
import pandas as pd
import re
import unicodedata

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Read Data

In [160]:
# Load the gzip-compressed, record-oriented JSON file of reviews into a DataFrame.
reviews_path = '../../data/final/reviews.json.gz'
df_reviews = pd.read_json(reviews_path, compression="gzip", orient="records")

In [161]:
# Preview the first five rows of the loaded reviews.
df_reviews.head(5)

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11


Clean Data

In [162]:
def lowercase_text(text: str):
    """Return a copy of ``text`` with every character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_html_tags(text: str):
    """Strip HTML tags and character entities from ``text``.

    Every tag (``<...>``) and entity (``&name;``, ``&#123;``, ``&#x1f;``) is
    replaced by a single space rather than deleted outright, so that words
    separated only by markup (e.g. ``great<br/>product``) are not fused into
    one token. Surplus spaces are left for a later whitespace-collapsing step.

    The entity pattern only matches lowercase letters, so ``text`` is expected
    to be lowercased first (``clean_data`` does this).

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        The string with each tag/entity replaced by one space.
    """
    markup_pattern = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(markup_pattern, ' ', text)

def remove_url(text: str):
    """Delete ``http://``/``https://`` links and ``www.``-prefixed addresses.

    A match runs up to the next whitespace character and is removed without
    leaving a replacement behind.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_accented_chars(text: str):
    """Reduce ``text`` to plain ASCII.

    The text is NFKD-normalized so accented letters decompose into a base
    letter plus combining marks; encoding to ASCII with ``ignore`` then drops
    the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_punctuation_and_number(text: str):
    """Replace every character that is not an ASCII letter with a space.

    Digits, punctuation and existing whitespace all become a single space
    each, so the length of the string is unchanged.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

def remove_extra_spaces(text: str):
    """Collapse whitespace runs to one space and trim both ends.

    Any run of whitespace (spaces, tabs, newlines) becomes a single space, and
    leading/trailing whitespace is removed.

    Args:
        text: Input string.

    Returns:
        The string with words separated by exactly one space.
    """
    # A single ``\s+`` covers leading, inner and trailing runs; the final
    # strip() removes the space left at either end.
    return re.sub(r'\s+', ' ', text).strip()

def clean_data(text: str):
    """Run the full text-cleaning pipeline on one review string.

    The steps are applied in order: lowercase, strip HTML tags/entities, strip
    URLs, drop accents / non-ASCII characters, replace non-letters with spaces,
    and finally collapse and trim whitespace.
    """
    cleaning_steps = (
        lowercase_text,
        remove_html_tags,
        remove_url,
        remove_accented_chars,
        remove_punctuation_and_number,
        remove_extra_spaces,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [163]:
# Merge the review body and its summary into one text field per row.
# Missing values are skipped with pd.notna so that None/NaN never end up in the
# text as the literal words "None"/"nan"; the string 'nan' is still filtered
# as well, to keep skipping values that were stringified upstream.
df_reviews['concatenated_review'] = df_reviews[['review_text', 'summary']].apply(
    lambda x: " ".join(str(y) for y in x if pd.notna(y) and str(y) != 'nan'),
    axis=1
)

# Work on a separate single-column frame whose column is named 'review_text'.
df_preprocessing_data = df_reviews[['concatenated_review']].rename(columns={"concatenated_review": "review_text"})

In [164]:
# Run the cleaning pipeline over every merged review, element by element.
df_preprocessing_data['first_cleaned_review'] = df_preprocessing_data['review_text'].map(clean_data)

In [165]:
# Preview the first five rows after the initial cleaning pass.
df_preprocessing_data.head(5)

Unnamed: 0,review_text,first_cleaned_review
0,I got this to run as a dual monitor. This is ...,i got this to run as a dual monitor this is my...
1,"Not as good as I had hoped, music is very low,...",not as good as i had hoped music is very low p...
2,Appreciate if product\nNeed to buy one more if...,appreciate if product need to buy one more if ...
3,You get what you pay for Three Stars,you get what you pay for three stars
4,Arrived in 2 days. working great. Recommend to...,arrived in days working great recommend to oth...


Tokenization

In [166]:
# Split each cleaned review into a list of word tokens with NLTK.
df_preprocessing_data['tokenized_review'] = df_preprocessing_data['first_cleaned_review'].map(word_tokenize)

In [167]:
# Preview the first five rows with the new token lists.
df_preprocessing_data.head(5)

Unnamed: 0,review_text,first_cleaned_review,tokenized_review
0,I got this to run as a dual monitor. This is ...,i got this to run as a dual monitor this is my...,"[i, got, this, to, run, as, a, dual, monitor, ..."
1,"Not as good as I had hoped, music is very low,...",not as good as i had hoped music is very low p...,"[not, as, good, as, i, had, hoped, music, is, ..."
2,Appreciate if product\nNeed to buy one more if...,appreciate if product need to buy one more if ...,"[appreciate, if, product, need, to, buy, one, ..."
3,You get what you pay for Three Stars,you get what you pay for three stars,"[you, get, what, you, pay, for, three, stars]"
4,Arrived in 2 days. working great. Recommend to...,arrived in days working great recommend to oth...,"[arrived, in, days, working, great, recommend,..."


Stop Word Removal

In [168]:
# Start from NLTK's English stop words, then take 'no' and 'not' out of the
# list so those two words are not filtered from the reviews.
stopword_list = stopwords.words('english')
for kept_word in ('no', 'not'):
    stopword_list.remove(kept_word)

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in ``stopword_list``.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the stop words removed; order and duplicates of the
        remaining tokens are preserved.
    """
    # Build the lookup set once per call: set membership is constant-time,
    # whereas testing against the list scans it for every token. Building it
    # here (not at import) keeps it in sync with the current stopword_list.
    stopword_set = set(stopword_list)
    return [word for word in words if word not in stopword_set]

def concatenate_words(words):
    """Join the string tokens in ``words`` into one space-separated string."""
    separator = ' '
    return separator.join(words)

In [169]:
# Drop stop words from each token list, then rebuild a plain-text version of
# the filtered tokens.
df_preprocessing_data['tokenized_filtered_review'] = df_preprocessing_data['tokenized_review'].map(remove_stopwords)
df_preprocessing_data['second_cleaned_review'] = df_preprocessing_data['tokenized_filtered_review'].map(concatenate_words)

In [170]:
# Preview the first five rows after stop-word removal.
df_preprocessing_data.head(5)

Unnamed: 0,review_text,first_cleaned_review,tokenized_review,tokenized_filtered_review,second_cleaned_review
0,I got this to run as a dual monitor. This is ...,i got this to run as a dual monitor this is my...,"[i, got, this, to, run, as, a, dual, monitor, ...","[got, run, dual, monitor, second, time, purcha...",got run dual monitor second time purchasing mo...
1,"Not as good as I had hoped, music is very low,...",not as good as i had hoped music is very low p...,"[not, as, good, as, i, had, hoped, music, is, ...","[not, good, hoped, music, low, phone, volume, ...",not good hoped music low phone volume pretty g...
2,Appreciate if product\nNeed to buy one more if...,appreciate if product need to buy one more if ...,"[appreciate, if, product, need, to, buy, one, ...","[appreciate, product, need, buy, one, promotio...",appreciate product need buy one promotion need...
3,You get what you pay for Three Stars,you get what you pay for three stars,"[you, get, what, you, pay, for, three, stars]","[get, pay, three, stars]",get pay three stars
4,Arrived in 2 days. working great. Recommend to...,arrived in days working great recommend to oth...,"[arrived, in, days, working, great, recommend,...","[arrived, days, working, great, recommend, oth...",arrived days working great recommend others wo...


Export as JSON

In [171]:
# Keep the original text alongside the final cleaned text, exposing the latter
# under the name 'cleaned_review', and write the pair as gzip-compressed,
# record-oriented JSON.
export_columns = ["review_text", "second_cleaned_review"]
cleaned_reviews = df_preprocessing_data[export_columns].rename(
    columns={'second_cleaned_review':'cleaned_review'}
)
cleaned_reviews.to_json(
    '../data/cleaned_reviews.json.gz',
    orient="records",
    compression="gzip",
    indent=2,
)