## EDA & Data cleaning

In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import string
import nltk
from nltk.corpus import stopwords
from  nltk import FreqDist
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import itertools
import text_cleaning_helper_functions as clean
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = [json.loads(line) for line in open('reviews_Electronics_5.json', 'r')]

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,528881469,"[0, 0]",5.0,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131200
1,528881469,"[12, 15]",1.0,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",AMO214LNFCEI4,Amazon Customer,Very Disappointed,1290643200
2,528881469,"[43, 45]",3.0,"Well, what can I say. I've had this unit in m...","09 9, 2010",A3N7T0DY83Y4IG,C. A. Freeman,1st impression,1283990400
3,528881469,"[9, 10]",2.0,"Not going to write a long review, even thought...","11 24, 2010",A1H8PY3QHMQQA0,"Dave M. Shaw ""mack dave""","Great grafics, POOR GPS",1290556800
4,528881469,"[0, 0]",1.0,I've had mine for a year and here's what we go...,"09 29, 2011",A24EV6RXELQZ63,Wayne Smith,"Major issues, only excuses for support",1317254400


In [5]:
df.isna().sum()

asin                  0
helpful               0
overall               0
reviewText            0
reviewTime            0
reviewerID            0
reviewerName      24730
summary               0
unixReviewTime        0
dtype: int64

In [6]:
df.shape

(1689188, 9)

In [7]:
df['overall'].value_counts()

5.0    1009026
4.0     347041
3.0     142257
1.0     108725
2.0      82139
Name: overall, dtype: int64

In [10]:
# drop reviews to reduce class imabalance and reduce size of data
remove_n_5 = 900000
remove_n_4 = 247000
remove_n_3 = 40000

just5 = df.loc[df['overall'] == 5]
just4 = df.loc[df['overall'] == 4]
just3 = df.loc[df['overall'] == 3]

drop_indices_5 = np.random.choice(just5.index, remove_n_5, replace=False)
drop_indices_4 = np.random.choice(just4.index, remove_n_4, replace=False)
drop_indices_3 = np.random.choice(just3.index, remove_n_3, replace=False)

df_final = df.drop(drop_indices_5)
df_final = df_final.drop(drop_indices_4)
df_final = df_final.drop(drop_indices_3)

In [11]:
df_final['overall'].value_counts()

5.0    109026
1.0    108725
3.0    102257
4.0    100041
2.0     82139
Name: overall, dtype: int64

In [12]:
df_final.reset_index(drop=True,inplace=True)

In [13]:
df_text = df_final[['asin','reviewText','overall','reviewerID']]

In [14]:
df_text['asin_reviewer'] = df_text['asin'].astype(str)+ '_' + df_text['reviewerID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df_text.drop(columns=['asin','reviewerID'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# NLP

In [17]:
# expand list of stopwords
new_stopwords = [x.replace("'","") for x in stopwords.words('english')]
final_stopwords = set(new_stopwords+stopwords.words('english')+list(string.punctuation))

In [22]:
# take out review text from dataframe and make it a series
reviews = df_text['reviewText']

In [25]:
# perform the cleaning, tokenizing, and lemmatizing of each review and save it to corpus
corpus = clean.finalize_token(reviews)

100%|██████████| 502188/502188 [1:03:21<00:00, 132.09it/s]


In [23]:
# put the corpus in a dataframe
df_corpus = pd.DataFrame(corpus, columns=['reviews'])

NameError: name 'corpus' is not defined

In [28]:
# merge corpus into original dataframe so we can begin modeling
df_reviews_final = pd.concat([df_text,df_corpus],axis=1)

In [34]:
# save final dataframe to csv
df_reviews_final.to_csv('final_df.csv',index=None,header=True)

## Feature engineering

In [None]:
# lenght of review
# number of !'s
# number of 

In [16]:
df_text['review_length'] = df_text['reviewText'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Visuals

In [None]:
g = sns.FacetGrid(data=df_text, col='overall')
g.map(plt.hist, 'review_length', bins=30)
g.set(xlim=(0,10000))

In [None]:
sns.boxplot(x='overall', y='review_length', data=df_text)