Import Libraries

In [1]:
import nltk
# Fetch the NLTK resources used later in this notebook; a resource that is
# already up to date is reported as such instead of being downloaded again.
nltk.download('punkt')  # tokenizer models (word_tokenize)
nltk.download('wordnet')  # WordNet data (WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger')  # tagger model (pos_tag)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import pandas as pd

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

Read Data

In [3]:
# Load the cleaned reviews from the gzip-compressed, records-oriented JSON file.
df_cleaned_reviews = pd.read_json(
    '../data/cleaned_reviews.json.gz',
    orient="records",
    compression="gzip",
)

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns.
df_cleaned_reviews.head()

Unnamed: 0,review_text,cleaned_review
0,I got this to run as a dual monitor. This is ...,got run dual monitor second time purchasing mo...
1,"Not as good as I had hoped, music is very low,...",not good hoped music low phone volume pretty g...
2,Appreciate if product\nNeed to buy one more if...,appreciate product need buy one promotion need...
3,You get what you pay for Three Stars,get pay three stars
4,Arrived in 2 days. working great. Recommend to...,arrived days working great recommend others wo...


Part of Speech Tagging

In [5]:
# Maps the first letter of a tagger POS tag to the matching WordNet POS constant.
# Tags starting with any other letter have no entry, so lookups return None.
pos_dict = {
    'J': wordnet.ADJ,   # adjective
    'V': wordnet.VERB,  # verb
    'N': wordnet.NOUN,  # noun
    'R': wordnet.ADV,   # adverb
}

def tag_pos(text):
    """Tokenize ``text`` and pair every token with its WordNet part of speech.

    Parameters
    ----------
    text : str
        Review text to tokenize and tag.

    Returns
    -------
    list of tuple
        ``(word, wordnet_pos)`` pairs in token order. ``wordnet_pos`` is the
        ``pos_dict`` entry for the first letter of the tagger's tag, or
        ``None`` when that letter has no entry. Non-string input (for
        example ``None`` or ``NaN`` standing in for a missing review)
        yields an empty list instead of raising.
    """
    # A missing review would reach this function as None/NaN; the tokenizer
    # only accepts strings, so treat anything else as "no tokens".
    if not isinstance(text, str):
        return []

    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
    ]

In [6]:
# Work on an independent copy: plain assignment would only bind a second name
# to the same frame, so the columns added below would also appear on
# df_cleaned_reviews and re-running earlier cells would show mutated data.
df_preprocessing_data = df_cleaned_reviews.copy()

In [7]:
# Tag every cleaned review; each cell becomes a list of (word, pos) pairs.
df_preprocessing_data['pos_tagged_review'] = (
    df_preprocessing_data['cleaned_review'].apply(tag_pos)
)

In [8]:
# Inspect the first rows, including the new pos_tagged_review column.
df_preprocessing_data.head()

Unnamed: 0,review_text,cleaned_review,pos_tagged_review
0,I got this to run as a dual monitor. This is ...,got run dual monitor second time purchasing mo...,"[(got, v), (run, v), (dual, a), (monitor, n), ..."
1,"Not as good as I had hoped, music is very low,...",not good hoped music low phone volume pretty g...,"[(not, r), (good, a), (hoped, v), (music, n), ..."
2,Appreciate if product\nNeed to buy one more if...,appreciate product need buy one promotion need...,"[(appreciate, a), (product, n), (need, n), (bu..."
3,You get what you pay for Three Stars,get pay three stars,"[(get, v), (pay, n), (three, None), (stars, n)]"
4,Arrived in 2 days. working great. Recommend to...,arrived days working great recommend others wo...,"[(arrived, v), (days, n), (working, v), (great..."


Lemmatization

In [9]:
# Single shared lemmatizer instance, used by lemmatize_pos_words below.
lemmatizer = WordNetLemmatizer()

def lemmatize_pos_words(pos_words):
    """Lemmatize ``(word, pos)`` pairs and join the results into one string.

    Parameters
    ----------
    pos_words : iterable of tuple
        ``(word, pos)`` pairs; ``pos`` is either a WordNet POS constant or
        ``None``.

    Returns
    -------
    str
        The space-separated result, in input order. Words whose ``pos`` is
        ``None`` are kept unchanged; all others are passed through
        ``lemmatizer.lemmatize(word, pos)``.
    """
    return " ".join(
        token if pos is None else lemmatizer.lemmatize(token, pos)
        for token, pos in pos_words
    )

In [10]:
# Lemmatize each review's (word, pos) pairs and store the re-joined text.
df_preprocessing_data['lemmatized_review'] = (
    df_preprocessing_data['pos_tagged_review'].apply(lemmatize_pos_words)
)

In [11]:
# Inspect the first rows, including the new lemmatized_review column.
df_preprocessing_data.head()

Unnamed: 0,review_text,cleaned_review,pos_tagged_review,lemmatized_review
0,I got this to run as a dual monitor. This is ...,got run dual monitor second time purchasing mo...,"[(got, v), (run, v), (dual, a), (monitor, n), ...",get run dual monitor second time purchase moni...
1,"Not as good as I had hoped, music is very low,...",not good hoped music low phone volume pretty g...,"[(not, r), (good, a), (hoped, v), (music, n), ...",not good hop music low phone volume pretty goo...
2,Appreciate if product\nNeed to buy one more if...,appreciate product need buy one promotion need...,"[(appreciate, a), (product, n), (need, n), (bu...",appreciate product need buy one promotion need...
3,You get what you pay for Three Stars,get pay three stars,"[(get, v), (pay, n), (three, None), (stars, n)]",get pay three star
4,Arrived in 2 days. working great. Recommend to...,arrived days working great recommend others wo...,"[(arrived, v), (days, n), (working, v), (great...",arrive day work great recommend others work gr...


Export as JSON

In [12]:
# Keep the original text plus its lemmatized form (exported under the name
# normalized_review) and write them as gzip-compressed JSON records.
(
    df_preprocessing_data[["review_text", "lemmatized_review"]]
    .rename(columns={'lemmatized_review': 'normalized_review'})
    .to_json(
        '../data/normalized_reviews.json.gz',
        compression="gzip",
        orient="records",
        indent=2,
    )
)