In [1]:
import numpy as np
import pandas as pd
import gzip

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('data_capstone_2/reviews_Patio_Lawn_and_Garden_5.json.gz')

In [3]:
df.head(1)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""","[4, 4]",Good USA company that stands behind their prod...,4.0,Great Hoses,1308614400,"06 21, 2011"


### Information about the DataFrame

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13272 entries, 0 to 13271
Data columns (total 9 columns):
reviewerID        13272 non-null object
asin              13272 non-null object
reviewerName      13107 non-null object
helpful           13272 non-null object
reviewText        13272 non-null object
overall           13272 non-null float64
summary           13272 non-null object
unixReviewTime    13272 non-null int64
reviewTime        13272 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 1.0+ MB


### What we learned from the information:

    * We have the shape, 13272 observations(records or rows) and 9 columns (or variables).
    * There is no missing value.
    * There are two variables related with date but data types are not datetime, one of them is "int64" and the other one is "object". One time related variable will be enough for us, we can drop one of them.
    * We need to figure out that whether the "helpful" variable needs to be converted to numeric type in order to use it.
    * There are two different variables which identify reviewer/user, we can drop one of them.
    * In order to improve practical and readable coding, we need change some of the column names and also we need to convert column names to lowercase.
    
            - "reviewerID"    -->   "customer"
            - "asin"          -->   "product"
            - "reviewerName"  -->   column will be droped 
            - "reviewText"    -->   "review_text" (will be 
            - "helpful"       -->   will be splited in two columns; "pos_feedback" as positive feedback + "neg_feedback" as  negative feedback. 
            - "overall"       -->   "rating"
            - "summary"       -->   as is 
            - "unixReviewTime"-->   "time"    
            - "reviewTime"    -->   column will be droped

### Fixing learned issues

    * Creating the new columns. 
    * Dropping redundant columns
    * Changing some column names and making lowercase

#### Creating 3 new columns

In [5]:
# We will create two new columns from the "helpful" column in order to make computation easier

list1=[]
list2=[]
for item in df['helpful']:
    list1.append(item[0])
    list2.append(item[1]-item[0])
        
# 1st new column
df['pos_feedback'] = list1

# 2nd new column
df['neg_feedback'] = list2

# 3rd new column: will be joint text of review and summary columns.
df['review_text'] = df[['summary', 'reviewText']].apply(
    lambda x: ' '.join(str(y) for y in x if str(y) !='nan'), axis=1)

df.head(1)

# Number of columns increased to 12

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,pos_feedback,neg_feedback,review_text
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""","[4, 4]",Good USA company that stands behind their prod...,4.0,Great Hoses,1308614400,"06 21, 2011",4,0,Great Hoses Good USA company that stands behin...


#### Dropping redundant 3 columns

In [6]:
# We will drop the "reviewerName" since we have "reviewerID" as enough for identifing the reviewer/customer
# And also we will drop the "reviewTime" column as we have another date column ("unixReviewTime")

df=df.drop(['reviewerName', 'reviewText', 'reviewTime', 'summary', 'helpful'], axis=1)
df.head(1)

# Now we have 8 columns remained

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,pos_feedback,neg_feedback,review_text
0,A1JZFGZEZVWQPY,B00002N674,4.0,1308614400,4,0,Great Hoses Good USA company that stands behin...


#### Changing column names

In [7]:
df.columns = ['customer', 'product', 'rating', 'time', 'pos_feedback', 'neg_feedback', 'review_text']

df.head(1)

Unnamed: 0,customer,product,rating,time,pos_feedback,neg_feedback,review_text
0,A1JZFGZEZVWQPY,B00002N674,4.0,1308614400,4,0,Great Hoses Good USA company that stands behin...


### Descriptive statistics summary

In [8]:
# We can use ".describe()" method to get the statistics summary of numeric variables.

df.describe()

# We got statistics of 4 variables as we have 4 numeric variables.

Unnamed: 0,rating,time,pos_feedback,neg_feedback
count,13272.0,13272.0,13272.0,13272.0
mean,4.186483,1358624000.0,3.233424,0.523282
std,1.084114,47098390.0,20.279594,2.765096
min,1.0,954892800.0,0.0,0.0
25%,4.0,1341965000.0,0.0,0.0
50%,5.0,1370304000.0,0.0,0.0
75%,5.0,1393546000.0,1.0,0.0
max,5.0,1405987000.0,923.0,167.0


#### Statistics of non-numeric variables

In [9]:
# Although they are not numeric we can produce statistics from non-numeric variables

print('\nNumber of unique customers: {}\n\nNumber of unique products: {}'.
      format(len(df['customer'].unique()), len(df['product'].unique())))

print('\nReview per customer: {}\n\nReview per product: {}\n'.
         format((len(df)/len(df['customer'].unique())), (len(df)/len(df['product'].unique()))))

# We produced 4 additional statistics with non-numeric variables.


Number of unique customers: 1686

Number of unique products: 962

Review per customer: 7.871886120996441

Review per product: 13.796257796257796



### What we learned from the statistics summary

    * Rating:
      - Mean of the ratings is more than 4 out of 5. It means that people are tendentious to giving high ratings. "std" value (1.084) and percentile values show that 1 and 2 star ratings are rare. 
      - Small numbers of "ratings under 4" will decrease the predictability of these ratings. To overcome this problem we need to split the ratings in to two groups as "good" and "bad" ratings.

    * total votes (t_votes) and positive votes (p_votes):
      - Their means are more than 3.0 but percentile values shows that more than half of the reviews don't have "helpful"votes.
      - They have outliers and should be cleaned or imputed. 

    * Non-numeric variables statistics:
      - Some customers have more than one ratings and most probably we have some outliers.
      - All ratings do not belong to diffent different people

### Cleaning the Text

In [14]:
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import contractions

#nlp = spacy.load('en_core', parse=True, tag=True, entity=True)

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
                                              
def normalize_corpus(text, lowercase=True, remove_stop_words=True, remove_url=True):
    
    """[1] Lowercase the text 
       [2] Keep only words 
       [3] Find URLs 
       [4] Remove links from posts 
       [5] Expending contractions 
       [6] Removing whitespace 
       [7] Remove apostrophe sign  
       [8] Remove stopwords and Stemming"""
    
    # Creating stopwordlist and editing 
    stopword_list= stopwords.words('english')
    
    # "no" and "not" may give us information so those are removed from stop lists
    stopword_list.remove('no') 
    stopword_list.remove('not')
    
    ##[1] Lowercase the text
    if (lowercase==True):
        text = str(text).lower()
    
    ##[2] Keeping only words
    text = re.sub(r'[^a-zA-Z]',r' ', text)
    
    ##[3] Find URLs
    global URLs
    URLs = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+])+', text)
    #URLs = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    
    ##[4] Removing Links 
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', r'', text)
    
    ##[5] Expending contractions such as I'm, you're into I am, you are
    text = contractions.fix(text)
     
    ##[6] Removing whitespace
    text = re.sub(r'nbsp', r'', text)
        
    ##[7] Removing ' (apostrophe) sign
    text = re.sub(r"'", r'', text)
      
    ##[8] Removing stopwords and Lemmatization
    if (remove_stop_words==True):
        
        text = " ".join([lemmatizer.lemmatize(w) for w in text.split(' ') if w not in stopword_list])
        
    else:
        
        text = " ".join([lemmatizer.lemmatize(w) for w in text.split(' ')])
        
    return text

In [15]:
#nltk.download("wordnet", "C:\Users\Mike/nltk_data/")
df['clean_text'] = df['review_text'].map(lambda text: normalize_corpus(text))

In [17]:
# Let's put aside number of raw tokens in order to measure of cleaned tokens
from nltk.tokenize import word_tokenize

raw_tokens=len([w for t in (df["review_text"].apply(word_tokenize)) for w in t])
print('Number of raw tokens: {}'.format(raw_tokens))
clean_tokens=len([w for t in (df["clean_text"].apply(word_tokenize)) for w in t])
print('Number of clean tokens: {}\n'.format(clean_tokens))
print('Percentage of removed tokens: {0:.2f}'.format(1-(clean_tokens/raw_tokens)))

Number of raw tokens: 2438779
Number of clean tokens: 1096586

Percentage of removed tokens: 0.55


In [18]:
df.to_csv('nlp_reviews_cleaned_2.csv', sep=',', encoding='utf-8', index=False)

In [21]:
## Visualizations
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.colors as colors
%matplotlib inline
from IPython.display import Image

## Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve

import contractions
import spacy
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [22]:

df['rating_class'] = df['rating'].apply(lambda x: 0 if x <= 2 else 1)
print(df.rating_class.value_counts())
df_train = df[0:10000]
X_1 = df_train['clean_text']
y_1 = df_train['rating_class']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1, y_1, test_size=0.25, random_state=5)

cv1 = CountVectorizer(ngram_range=(1,1))
cv_train1 = cv1.fit_transform(X_train1)
cv_train1 = cv_train1.toarray()
cv_test1 = cv1.transform(X_test1)
cv_test1 = cv_test1.toarray()

1    12080
0     1192
Name: rating_class, dtype: int64


In [23]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

clf1 = LogisticRegression(random_state=1)
clf2 = SVC(kernel = 'linear')

labels = ['Logistic Regression', 'Linear SVM']
print('Initial Scores\n\n')

for clf, label in zip([clf1, clf2], labels):
    clf.fit(cv_train1, y_train1)
    y_pred_clf = clf.predict(cv_test1)
    cm = confusion_matrix(y_test1, y_pred_clf)
    
    print('********** [{}] **********\n'.format(label))
    print('1. Accuarcy: {}\n'.format(metrics.accuracy_score(y_test1, y_pred_clf)))
    print('2. The F-1 score of the model {}\n'.format(f1_score(y_test1, y_pred_clf, average='weighted')))
    print('3. The recall score of the model {}\n'.format(recall_score(y_test1, y_pred_clf, average='weighted')))
    print('4. Classification Report:\n{}\n5. Confusion matrix:\n{}\n\n\n'.format
          (classification_report(y_test1, y_pred_clf), cm))

Initial Scores


********** [Logistic Regression] **********

1. Accuarcy: 0.894

2. The F-1 score of the model 0.8866035219097399

3. The recall score of the model 0.894

4. Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.31      0.36       237
           1       0.93      0.96      0.94      2263

   micro avg       0.89      0.89      0.89      2500
   macro avg       0.67      0.63      0.65      2500
weighted avg       0.88      0.89      0.89      2500

5. Confusion matrix:
[[  73  164]
 [ 101 2162]]



********** [Linear SVM] **********

1. Accuarcy: 0.884

2. The F-1 score of the model 0.8844340509161163

3. The recall score of the model 0.884

4. Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.40      0.39       237
           1       0.94      0.94      0.94      2263

   micro avg       0.88      0.88      0.88      2500
   macro avg       0.66      0.67 