# HW1

### Import data

In [1]:
import pandas as pd
import numpy as np

In [6]:
products = pd.read_csv('amazon_baby.csv')
products.head(5)
#products.iloc[58]

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


### Fill NA with blank

In [7]:
products = products.fillna({'review':''})

In [8]:
products.iloc[58,:]

name      Our Baby Girl Memory Book
review                             
rating                            5
Name: 58, dtype: object

### Remove punctuation in text

In [9]:
def remove_punctuation(text):
    import string
    return text.translate(str.maketrans('','',string.punctuation)) #translate nothing to nothing 

In [10]:
products['review_clean'] = products['review'].apply(remove_punctuation)
products.head(5)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


### Transform values in variable rating

In [11]:
products = products[products['rating'] != 3]

In [12]:
def trans(x):
    if x>3:
        return +1
    else:
        return -1
products['sentiment'] = products['rating'].apply(trans)

In [13]:
products.iloc[3,:]  # third observation

name            Stop Pacifier Sucking without tears with Thumb...
review          All of my kids have cried non-stop when I trie...
rating                                                          5
review_clean    All of my kids have cried nonstop when I tried...
sentiment                                                       1
Name: 4, dtype: object

In [14]:
products.iloc[0:3,:] # start:end where start is always included and end is always not included;

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1


In [15]:
products.head(3)

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1


### Split into training and testing dataset

In [16]:
import json
with open('module-2-assignment-train-idx.json') as train_data_file:    
    train_data_idx = json.load(train_data_file)
with open('module-2-assignment-test-idx.json') as test_data_file:    
    test_data_idx = json.load(test_data_file)

print (train_data_idx[:3]) # entry 0,1,2; entry 3 is excluded;
print (test_data_idx[:3])  

[0, 1, 2]
[8, 9, 14]


In [17]:
train_data = products.iloc[train_data_idx]
train_data.head(2)

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1


In [18]:
test_data = products.iloc[test_data_idx]
test_data.head(2)

Unnamed: 0,name,review,rating,review_clean,sentiment
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1


### Build word count vector for each review

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words and skip some unimportant words like 'it'
# First, learn vocabulary from the training data and assign columns to words, Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])
print (vectorizer.vocabulary_)

In [20]:
print(train_matrix[0]) # print word count for first review

  (0, 57486)	3
  (0, 21721)	1
  (0, 37328)	1
  (0, 10505)	3
  (0, 116798)	1
  (0, 72510)	2
  (0, 34453)	1
  (0, 54276)	1
  (0, 63567)	1
  (0, 80500)	1
  (0, 119389)	1
  (0, 14624)	1
  (0, 72811)	1
  (0, 69878)	2
  (0, 119288)	1
  (0, 52830)	1
  (0, 59309)	1
  (0, 75845)	1
  (0, 119315)	1
  (0, 67820)	1
  (0, 35380)	1
  (0, 60973)	1
  (0, 52346)	1
  (0, 87458)	1


In [21]:
print (products.iloc[0,:]) #first review whose rating not equal to 3

name                                        Planetwise Wipe Pouch
review          it came early and was not disappointed. i love...
rating                                                          5
review_clean    it came early and was not disappointed i love ...
sentiment                                                       1
Name: 1, dtype: object


### Train a classifier on train data

In [22]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(max_iter=10000)  ### set max iteration for mle
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(max_iter=10000)

In [23]:
np.sum(sentiment_model.coef_ >= 0)

91020

In [24]:
#print(vectorizer.vocabulary_)

In [25]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']
model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':sentiment_model.coef_.flatten()})

ValueError: arrays must all be same length

In [26]:
print (sentiment_model.coef_)

[[-1.23957566e+00  4.45468981e-05  2.74831412e-02 ...  1.25315400e-02
   2.70189341e-03 -4.34431118e-05]]


### Make Predictions for a subset of test data
#### prepare dataset

In [27]:
test_data = products.iloc[test_data_idx]

In [28]:
sample_test_data = test_data.iloc[10:13]  #observation 11,12,13
print (sample_test_data)

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


#### Make predictions for sub-dataset

In [29]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean']) # sample_test_matrix = word count for sample_test_data
scores = sentiment_model.decision_function(sample_test_matrix)
print (scores)
print (sentiment_model.predict(sample_test_matrix))

[  5.59998522  -3.16970521 -10.42292433]
[ 1 -1 -1]


In [30]:
print ([1./(1+np.exp(-x)) for x in scores])
print (sentiment_model.classes_)
print (sentiment_model.predict_proba(sample_test_matrix))       # predict with probability

[0.9963157058323063, 0.040321820978886216, 2.974188988613961e-05]
[-1  1]
[[3.68429417e-03 9.96315706e-01]
 [9.59678179e-01 4.03218210e-02]
 [9.99970258e-01 2.97418899e-05]]


### Make predictions for full test dataset

In [31]:
test_scores = sentiment_model.decision_function(test_matrix) # decision function is to print score
positive_idx = np.argsort(-test_scores)[:20]      # index of 20, have the highest test score, from the beginning to position 20(excluded)
print (positive_idx)
print (test_scores[positive_idx[0]]) 

[18112 15732 24286 25554 24899  9125 21531 32782 30535 14482  9555 30634
 17558 26830 20743 11923  4140 30076 33060 26838]
53.79872198751669


In [2]:
negative_idx = np.argsort(test_scores)[:20]     
print (negative_idx)
print (test_scores[negative_idx[0]])

In [34]:
predicted_result= sentiment_model.predict(test_matrix)

#### Compute Prediction Accuracy

In [36]:
correct_num = np.sum(predicted_result == test_data['sentiment'])
total_num = len(test_data['sentiment'])
accuracy= correct_num / total_num
print(accuracy)

0.9322954163666907


### Learn another classifier with fewer words

In [38]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']    #limit the bag of words to 20 words

vectorizer = CountVectorizer(vocabulary = significant_words)
train_matrix_word_subset = vectorizer.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer.transform(test_data['review_clean'])

In [39]:
simple_model = LogisticRegression(max_iter=10000)  ### set max iteration for mle
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(max_iter=10000)

In [40]:
np.sum(simple_model.coef_ >0)

10

In [43]:
simple_model_coef_table = pd.DataFrame({'word':significant_words, 'coefficient': simple_model.coef_.flatten()})
simple_model_coef_table.sort_values(by=['coefficient'], ascending=False)

Unnamed: 0,word,coefficient
6,loves,1.673269
5,perfect,1.510263
0,love,1.363697
2,easy,1.192219
1,great,0.94395
4,little,0.520174
7,well,0.50376
8,able,0.190937
3,old,0.085424
9,car,0.058813


### Compare Models
#### Compare classification accuracy of sentiment model and simple model on training set
#### Compare classification accuracy of sentiment model and simple model on training set

In [44]:
train_predicted_y = sentiment_model.predict(train_matrix)
correct_num = np.sum(train_predicted_y == train_data['sentiment'])
total_num = len(train_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
train_accuracy = correct_num * 1./ total_num
print ("sentiment_model training accuracy: {}".format(train_accuracy))

train_predicted_y = simple_model.predict(train_matrix_word_subset)
correct_num = np.sum(train_predicted_y == train_data['sentiment'])
total_num = len(train_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
train_accuracy = correct_num * 1./ total_num
print ("simple_model training accuracy: {}".format(train_accuracy))

correct_num: 129212, total_num: 133416
sentiment_model training accuracy: 0.9684895364873778
correct_num: 115648, total_num: 133416
simple_model training accuracy: 0.8668225700065959


In [45]:
test_predicted_y = sentiment_model.predict(test_matrix)
correct_num = np.sum(test_predicted_y == test_data['sentiment'])
total_num = len(test_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
test_accuracy = correct_num * 1./ total_num
print ("sentiment_model test accuracy: {}".format(test_accuracy))

test_predicted_y = simple_model.predict(test_matrix_word_subset)
correct_num = np.sum(test_predicted_y == test_data['sentiment'])
total_num = len(test_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
test_accuracy = correct_num * 1./ total_num
print ("simple_model test accuracy: {}".format(test_accuracy))

correct_num: 31079, total_num: 33336
sentiment_model test accuracy: 0.9322954163666907
correct_num: 28981, total_num: 33336
simple_model test accuracy: 0.8693604511639069


#### Predict with majority class

In [56]:
positive_label = len(test_data[test_data['sentiment']>0])    # Equivalent to: np.sum(test_data['sentiment']>0)
print (positive_label)
negative_label = len(test_data[test_data['sentiment']<0])    # Equivalent to: np.sum(test_data['sentiment']<0)
print(negative_label)
accuracy_majority_class= positive_label/len(test_data['sentiment'])
print (accuracy_majority_class)

28095
5241
0.8427825773938085
