##### Library

In [1]:
import nltk
# Fetch the NLTK resources this notebook relies on: the tokenizer models,
# the stop-word lists, WordNet (used by the lemmatizer) and the POS tagger.
# Each call is a no-op when the package is already present locally.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

##### Read Data

In [3]:
# Load the labelled reviews from the gzip-compressed JSON file (list-of-records layout).
# The path is a plain string: the original f-string prefix had no placeholders to fill.
df_reviews = pd.read_json('../../data/interim/final/labelled_reviews_by_ratings.json.gz', orient="records", compression="gzip")

In [4]:
# Preview the first rows of the loaded reviews.
df_reviews.head()

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at,sentiment
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30,1
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03,1
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28,1
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02,1
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11,1


##### Clean Data

In [5]:
def lowercase_text(text: str) -> str:
    """Return ``text`` with every cased character converted to lowercase."""
    lowered = text.lower()
    return lowered

# Matches an HTML tag (shortest match between "<" and ">", within one line) or a
# named / decimal / hexadecimal character entity such as "&amp;", "&#39;", "&#x27;".
# Compiled once here instead of on every call.
_HTML_MARKUP_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def remove_html_tags(text: str) -> str:
    """Strip HTML tags and character entities from ``text``.

    Each tag or entity is replaced by a single space rather than by nothing,
    so the words on either side of the markup are not glued together
    (``"good<br/>bad"`` becomes ``"good bad"``, not ``"goodbad"``). Callers
    that need tidy spacing should collapse whitespace afterwards.

    Args:
        text: Input string. Entity matching is case-sensitive and expects
            lowercase entity names / hex digits.

    Returns:
        ``text`` with every tag and entity replaced by one space.
    """
    return _HTML_MARKUP_RE.sub(' ', text)

def remove_url(text: str) -> str:
    """Delete every ``http://``, ``https://`` or ``www.`` token from ``text``.

    A URL is taken to run up to the next whitespace character, so the
    whitespace surrounding a removed URL is left in place.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_punctuation_and_number(text: str) -> str:
    """Replace every character that is not an ASCII letter with one space.

    Digits, punctuation, whitespace and non-ASCII letters are all replaced
    one-for-one, so the length of the string is unchanged.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

def remove_extra_spaces(text: str) -> str:
    """Collapse each run of whitespace in ``text`` to one space and trim both ends.

    Args:
        text: Input string.

    Returns:
        ``text`` with tabs, newlines and repeated spaces reduced to single
        spaces and with no leading or trailing whitespace.
    """
    # A single `\s+` is enough: leading and trailing runs become one space each
    # and are then removed by strip(), so no separate start-of-string branch
    # is needed.
    return re.sub(r'\s+', ' ', text).strip()

# English stop-word set, loaded from the NLTK corpus on the first call to
# remove_stopwords() and reused afterwards.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text: str) -> str:
    """Tokenize ``text`` and drop every token that is an English stop word.

    The stop-word list is read from the NLTK corpus only once and cached;
    rebuilding it for every call is wasteful when this function is applied
    row by row to a whole DataFrame column.

    Args:
        text: Input string. Tokens are compared to the stop-word list
            exactly as they appear (the comparison is case-sensitive).

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    return ' '.join(
        token for token in word_tokenize(text) if token not in _ENGLISH_STOP_WORDS
    )

def remove_n_chars(text: str, n: int = 2):
    """Drop every word of ``text`` that is ``n`` characters long or shorter.

    Words are the pieces obtained by splitting on single spaces; the words
    that are kept (length strictly greater than ``n``) are joined back
    together with single spaces.
    """
    kept_words = []
    for word in text.split(' '):
        if len(word) > n:
            kept_words.append(word)
    return ' '.join(kept_words)

def clean_data(text: str) -> str:
    """Run the full cleaning pipeline on a single review string.

    The steps run in a fixed order: lowercase, strip HTML markup, strip
    URLs, replace non-letters with spaces, collapse whitespace, remove stop
    words, and finally drop very short words.
    """
    pipeline = (
        lowercase_text,
        remove_html_tags,
        remove_url,
        remove_punctuation_and_number,
        remove_extra_spaces,
        remove_stopwords,
        remove_n_chars,
    )
    for step in pipeline:
        text = step(text)
    return text

In [6]:
# Work on a copy holding only the two text columns and the label, so that
# later column assignments do not touch df_reviews.
df_preprocessing = df_reviews[['review_text', 'summary', 'sentiment']].copy()

In [7]:
# Merge the review body and its summary into one text field per row.
# Missing values are detected with pd.notna() instead of comparing the string
# form to 'nan': that comparison let None through as the literal text "None"
# and would also have discarded a genuine review that just says "nan".
df_preprocessing['concatenated_review'] = df_preprocessing[['review_text', 'summary']].apply(
    lambda row: " ".join(str(value) for value in row if pd.notna(value)),
    axis=1
)

# From here on only the merged text and the label are needed.
df_preprocessing = df_preprocessing[['concatenated_review', 'sentiment']]

In [8]:
# Preview the merged text column next to the label.
df_preprocessing.head()

Unnamed: 0,concatenated_review,sentiment
0,I got this to run as a dual monitor. This is ...,1
1,"Not as good as I had hoped, music is very low,...",1
2,Appreciate if product\nNeed to buy one more if...,1
3,You get what you pay for Three Stars,1
4,Arrived in 2 days. working great. Recommend to...,1


In [9]:
# Run the cleaning pipeline (clean_data) on every merged review.
df_preprocessing['cleaned_review'] = df_preprocessing['concatenated_review'].apply(clean_data)

In [10]:
# Compare the original text with its cleaned version.
df_preprocessing.head()

Unnamed: 0,concatenated_review,sentiment,cleaned_review
0,I got this to run as a dual monitor. This is ...,1,got run dual monitor second time purchasing mo...
1,"Not as good as I had hoped, music is very low,...",1,good hoped music low phone volume pretty good ...
2,Appreciate if product\nNeed to buy one more if...,1,appreciate product need buy one promotion need...
3,You get what you pay for Three Stars,1,get pay three stars
4,Arrived in 2 days. working great. Recommend to...,1,arrived days working great recommend others wo...


##### Normalize Data

Part of Speech Tagging

In [11]:
# Maps the first letter of a tag returned by nltk.pos_tag to the matching
# WordNet part-of-speech constant (adjective, verb, noun, adverb).
# Tags starting with any other letter have no entry here.
pos_dict = {
    'J': wordnet.ADJ, 
    'V': wordnet.VERB, 
    'N': wordnet.NOUN, 
    'R': wordnet.ADV
}

def tag_pos(text):
    """Tokenize ``text``, POS-tag it and return ``(word, wordnet_pos)`` tuples.

    The WordNet tag is looked up in ``pos_dict`` using the first letter of
    the tagger's tag; it is ``None`` when that letter has no entry.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return [(token, pos_dict.get(label[0])) for token, label in tagged_tokens]

In [12]:
# Tag every cleaned review; each cell becomes a list of (word, WordNet POS) tuples.
df_preprocessing['pos_tagged_review'] = df_preprocessing['cleaned_review'].apply(tag_pos)

In [13]:
# Preview the tagged tokens; a POS of None means the tag had no WordNet mapping.
df_preprocessing.head()

Unnamed: 0,concatenated_review,sentiment,cleaned_review,pos_tagged_review
0,I got this to run as a dual monitor. This is ...,1,got run dual monitor second time purchasing mo...,"[(got, v), (run, v), (dual, a), (monitor, n), ..."
1,"Not as good as I had hoped, music is very low,...",1,good hoped music low phone volume pretty good ...,"[(good, a), (hoped, v), (music, n), (low, a), ..."
2,Appreciate if product\nNeed to buy one more if...,1,appreciate product need buy one promotion need...,"[(appreciate, a), (product, n), (need, n), (bu..."
3,You get what you pay for Three Stars,1,get pay three stars,"[(get, v), (pay, n), (three, None), (stars, n)]"
4,Arrived in 2 days. working great. Recommend to...,1,arrived days working great recommend others wo...,"[(arrived, v), (days, n), (working, v), (great..."


Lemmatization

In [14]:
# One shared lemmatizer instance, reused for every review.
lemmatizer = WordNetLemmatizer()

def lemmatize_pos_words(pos_words):
    """Lemmatize ``(word, wordnet_pos)`` pairs and join the results with spaces.

    A word whose POS is ``None`` is kept exactly as it is; every other word
    is passed to the WordNet lemmatizer together with its POS.
    """
    return " ".join(
        word if wn_pos is None else lemmatizer.lemmatize(word, wn_pos)
        for word, wn_pos in pos_words
    )

In [15]:
# Turn each list of tagged tokens back into a single lemmatized string.
df_preprocessing['lemmatized_review'] = df_preprocessing['pos_tagged_review'].apply(lemmatize_pos_words)

In [16]:
# Keep only the reviews that still contain text after cleaning and lemmatization.
df_preprocessing = df_preprocessing.loc[
    lambda frame: frame['lemmatized_review'].str.len() > 0
]
df_preprocessing.head()

Unnamed: 0,concatenated_review,sentiment,cleaned_review,pos_tagged_review,lemmatized_review
0,I got this to run as a dual monitor. This is ...,1,got run dual monitor second time purchasing mo...,"[(got, v), (run, v), (dual, a), (monitor, n), ...",get run dual monitor second time purchase moni...
1,"Not as good as I had hoped, music is very low,...",1,good hoped music low phone volume pretty good ...,"[(good, a), (hoped, v), (music, n), (low, a), ...",good hop music low phone volume pretty good bl...
2,Appreciate if product\nNeed to buy one more if...,1,appreciate product need buy one promotion need...,"[(appreciate, a), (product, n), (need, n), (bu...",appreciate product need buy one promotion need...
3,You get what you pay for Three Stars,1,get pay three stars,"[(get, v), (pay, n), (three, None), (stars, n)]",get pay three star
4,Arrived in 2 days. working great. Recommend to...,1,arrived days working great recommend others wo...,"[(arrived, v), (days, n), (working, v), (great...",arrive day work great recommend others work gr...


##### Export as JSON

In [17]:
# Write the lemmatized text (exported under the name 'cleaned_review') and the
# label to a gzip-compressed JSON file, one record per row.
(
    df_preprocessing[["lemmatized_review", "sentiment"]]
    .rename(columns={'lemmatized_review':'cleaned_review'})
    .to_json(
        '../../data/processed/reviews.json.gz',
        compression="gzip",
        orient="records",
        indent=2,
    )
)