In [None]:
import json
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import requests
import nltk
from textblob import TextBlob
from elasticsearch import Elasticsearch


In [None]:
file = r'C:\Users\user\Documents\Python Scripts\data\Video_Games_5.json'

# Stream the JSON-lines file in 1000-row chunks and combine them with a single
# pd.concat call. Growing a frame with DataFrame.append inside the loop copies
# the accumulated rows on every iteration, and DataFrame.append was removed in
# pandas 2.0.
# NOTE(review): the encoding is pinned to UTF-8 on the assumption that the
# export is standard JSON; without it open() falls back to the platform
# default -- confirm against the data file.
with open(file, encoding='utf-8') as json_handle:
    # The reader is lazy, so it must be fully consumed while the handle is open.
    chunks = list(pd.read_json(json_handle, lines=True, chunksize=1000))

# pd.concat raises on an empty list; fall back to an empty frame, which is what
# the original loop produced when no chunks were read.
df = pd.concat(chunks) if chunks else pd.DataFrame()
del chunks

In [None]:
# Parse the raw review date strings into datetimes using the explicit
# '%m %d, %Y' layout.
df['reviewTime'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y')

# Keep only the review columns and give them readable names.
# - The rename mapping ties each new name to its source column instead of
#   relying on the two lists staying in the same positional order.
# - The explicit .copy() makes reviewdf an independent frame, so adding columns
#   to it later does not raise SettingWithCopyWarning.
source_cols = ['asin', 'overall', 'summary', 'reviewText', 'reviewTime']
cols_list = ['Item', 'Stars', 'Review_Title', 'Review', 'Date']
reviewdf = df[source_cols].rename(columns=dict(zip(source_cols, cols_list))).copy()
print(reviewdf.head())
print(reviewdf['Review'].head())
#del df commented out so I can quickly reset the dataframe to original state

In [None]:
#Take just 100 reviews to test that the code works
# .iloc states the positional intent explicitly, and .copy() detaches the
# sample from the full frame so that assigning new columns to reviewdf does not
# raise SettingWithCopyWarning.
reviewdf = reviewdf.iloc[:100].copy()
reviewdf.info()

In [None]:
#Make Reviews all lower case
def _lowercase_words(text):
    """Lower-case every whitespace-separated word and re-join with single spaces."""
    return " ".join(word.lower() for word in text.split())

reviewdf['Review_Clean'] = reviewdf['Review'].apply(_lowercase_words)

In [None]:
#Remove punctuation
# regex=True is required: in pandas >= 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string keeps \w and \s from being read as (invalid) Python
# string escapes.
reviewdf['Review_Clean'] = reviewdf['Review_Clean'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
#Remove english stop words
# The bare name `stopwords` is never imported in this notebook, so the corpus
# is reached through the already-imported nltk package (the same attribute
# style used for nltk.tokenize / nltk.stem).
# NOTE(review): presumably the NLTK 'stopwords' corpus must be downloaded once
# (nltk.download('stopwords')) before this cell runs -- confirm for the target
# environment.
stop = nltk.corpus.stopwords.words('english')

# Membership is tested once per word of every review; a set avoids scanning
# the whole list each time. `stop` itself is kept as the original list.
stop_set = set(stop)
reviewdf['Review_Clean'] = reviewdf['Review_Clean'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set)
)

In [None]:
#Remove the 10 most common words
# Count every word across all cleaned reviews; value_counts() orders the result
# from most to least frequent, so the first 10 labels are the most common words.
word_counts = pd.Series(' '.join(reviewdf['Review_Clean']).split()).value_counts()
freq = list(word_counts.index[:10])

def _without_most_common(text):
    """Return ``text`` with every word listed in ``freq`` dropped."""
    kept = [word for word in text.split() if word not in freq]
    return " ".join(kept)

reviewdf['Review_Clean'] = reviewdf['Review_Clean'].apply(_without_most_common)

In [None]:
#Remove the 10 rarest words
# Join all cleaned reviews into one string, count the words, and keep the last
# 10 entries of the descending-frequency ranking.
corpus_text = ' '.join(reviewdf['Review_Clean'])
freq = pd.Series(corpus_text.split()).value_counts().tail(10)

freq = freq.index.tolist()
reviewdf['Review_Clean'] = reviewdf['Review_Clean'].apply(
    lambda review: " ".join(filter(lambda token: token not in freq, review.split()))
)

In [None]:
#Lemmatization and Tokenization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` with ``w_tokenizer`` and return the list of lemmatized tokens.

    Each token is passed through ``lemmatizer.lemmatize``, so the result has
    one entry per token, in the original order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

reviewdf['Review_Token'] = reviewdf['Review_Clean'].apply(lemmatize_text)

In [None]:
#Get Review Length for both Original Review and Cleaned Tokens
# lemmatize_text returns exactly one lemma per whitespace token, so the word
# count of the original review equals the number of tokens. Tokenizing alone
# gives the same length without running the lemmatizer over every review.
reviewdf['Review_Length'] = reviewdf['Review'].apply(lambda text: len(w_tokenizer.tokenize(text)))
reviewdf['Review_Clean_Length'] = reviewdf['Review_Token'].apply(len)

In [214]:
#Calculate number of words removed
# Difference between the word count of the original review and the number of
# tokens left after cleaning.
original_lengths = reviewdf["Review_Length"]
cleaned_lengths = reviewdf['Review_Clean_Length']
reviewdf['WordsRemoved'] = original_lengths.sub(cleaned_lengths)

In [216]:
# Display the first rows of reviewdf; as the cell's last expression the frame
# is rendered as the cell output.
reviewdf.head()

Unnamed: 0,Item,Stars,Review_Title,Review,Date,Review_Clean,Review_Token,Review_Length,Review_Clean_Length,WordsRemoved
0,700099867,1,Pay to unlock content? I don't think so.,Installing the game was a struggle (because of...,2012-07-09,installing struggle windows live bugssome cham...,"[installing, struggle, window, live, bugssome,...",118,55,63
1,700099867,4,Good rally game,If you like rally cars get this game you will ...,2013-06-30,rally cars funit oriented 34european market34 ...,"[rally, car, funit, oriented, 34european, mark...",53,24,29
2,700099867,1,Wrong key,1st shipment received a book instead of the ga...,2014-06-28,1st shipment received book instead game2nd shi...,"[1st, shipment, received, book, instead, game2...",53,27,26
3,700099867,3,"awesome game, if it did not crash frequently !!","I got this version instead of the PS3 version,...",2011-09-14,got version instead ps3 version turned mistake...,"[got, version, instead, ps3, version, turned, ...",646,304,342
4,700099867,4,DIRT 3,I had Dirt 2 on Xbox 360 and it was an okay ga...,2011-06-14,dirt 2 xbox 360 okay started playing laptop bo...,"[dirt, 2, xbox, 360, okay, started, playing, l...",62,22,40
