<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
# Natural Langurage Processing (NLP) - "Sentimental Analysis for Amazon Book Review"
# Name: Jianlei(John) Sun
# Date: March 13, 2017

# Libraries

In [3]:
%matplotlib inline

import time
import functools
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split



# Data Preprocessing

## Download Dataset

In [144]:
# Dataset from "https://snap.stanford.edu/data/web-Amazon.html"

In [12]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('./Kindle_Store_5.json.gz')

In [69]:
df = df[['overall', 'reviewText']]
df['overall'] = df[['overall']].apply(lambda x: 'pos' if x[0] >= 4.0 else 'neg', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [72]:
df.head()

Unnamed: 0,overall,reviewText
0,pos,I enjoy vintage books and movies so I enjoyed ...
1,pos,This book is a reissue of an old one; the auth...
2,pos,This was a fairly interesting read. It had ol...
3,pos,I'd never read any of the Amy Brewster mysteri...
4,pos,"If you like period pieces - clothing, lingo, y..."


In [77]:
df.to_csv('sampled_data.csv', index = False)

## Tokenization & Lemmatization

In [4]:
# Load csv file into DataFrame
kindle_data = pd.read_csv('sampled_data.csv')
type(kindle_data)

pandas.core.frame.DataFrame

In [4]:
# Print first row
# Format: data_frame.col_nam[row]
print("overall    :", kindle_data.overall[0])
print("reviewText :", kindle_data.reviewText[0])

overall    : pos
reviewText : I enjoy vintage books and movies so I enjoyed reading this book.  The plot was unusual.  Don't think killing someone in self-defense but leaving the scene and the body without notifying the police or hitting someone in the jaw to knock them out would wash today.Still it was a good read for me.


In [5]:
# Get a sample (head) of the data frame
kindle_data.head()

Unnamed: 0,overall,reviewText
0,pos,I enjoy vintage books and movies so I enjoyed ...
1,pos,This book is a reissue of an old one; the auth...
2,pos,This was a fairly interesting read. It had ol...
3,pos,I'd never read any of the Amy Brewster mysteri...
4,pos,"If you like period pieces - clothing, lingo, y..."


In [6]:
# Statics on tags
kindle_data.overall.value_counts()

pos    829277
neg    153342
Name: overall, dtype: int64

In [7]:
def splitPosNeg(data_):
    neg = data_.loc[data_.overall=='neg']
    pos = data_.loc[data_.overall=='pos']
    return [pos,neg]

[pos,neg] = splitPosNeg(kindle_data)

In [8]:
print(type(pos))
print("pos:", len(pos), ", neg:", len(neg))

# sample small dataset
pos = pos.head(50000)
neg = neg.head(50000)
print("pos:", len(pos), ", neg:", len(neg))

<class 'pandas.core.frame.DataFrame'>
pos: 829277 , neg: 153342
pos: 50000 , neg: 50000


In [9]:
lemmatizer = nltk.WordNetLemmatizer()
stop = stopwords.words('english')
translation = str.maketrans(string.punctuation,' '*len(string.punctuation))

In [85]:
# Yet a more compact way to write the code
def preprocessing(line):
    tokens=[]
    line = str(line).translate(translation)  # Replace punctuation
    line = nltk.word_tokenize(line.lower())  # Tokenize
    
    for t in line:
        # Remove stopwords
        if t not in stop:
            stemmed = lemmatizer.lemmatize(t) # deal with nouns
            stemmed = lemmatizer.lemmatize(stemmed, 'v') # deal with verbs
            tokens.append(stemmed)
    
    return ' '.join(tokens)

In [90]:
test_str = "I bought those movies yesterday and I really loved them!"
preprocessing(test_str)

'buy movie yesterday really love'

In [91]:
# Yet a more modern way to write code
start = time.time()

pos_data = list(map(preprocessing, pos['reviewText']))
neg_data = list(map(preprocessing, neg['reviewText']))

end = time.time()
print(end - start)

116.60849404335022


# Split Training Data & Test Data

In [94]:
data = pos_data + neg_data
# remember this is sampled
labels = np.concatenate((pos['overall'].values,neg['overall'].values))

In [95]:
# Split data into training set and testing set (20:80)
# stratify: make sure pos/neg remains the same in training set and testing set
train_data, test_data, train_labels, test_labels = \
train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=1234)

In [96]:
print("training size = ", len(train_data), "testing size = ", len(test_data))

training size =  80000 testing size =  20000


# Vectorizer

In [None]:
# Push all tokens and compute frequency of words
tokens = [word for line in train_data \
               for word in nltk.word_tokenize(line)]

word_features = nltk.FreqDist(tokens)

topwords = [fpair[0] for fpair in list(word_features.most_common(10000))]

## Tf–idf term weighting

- Tf: term-frequency
- idf: inverse document-frequency
- Tf-idf = $tf(t,d) \times idf(t)$

$$
idf(t) = log{\frac{1 + nd}{1 + df(d, t)}} + 1
$$

![](http://www.onemathematicalcat.org/Math/Algebra_II_obj/Graphics/log_base_gt1.gif)

In [119]:
# Since CountVectorizer and TfidTransformer are often used together
# There is a class named TfidfVectorizer that combine these two steps
tf_vec = TfidfVectorizer(min_df=1)
tf_fit = tf_vec.fit_transform([' '.join(topwords)])
tf_fit

<1x9970 sparse matrix of type '<class 'numpy.float64'>'
	with 9970 stored elements in Compressed Sparse Row format>

# Feature Extraction

In [110]:
# Extract features from training set
# Vocabulary is from topwords
train_features = tf_vec.transform(train_data)

# cnt_train_features = cnt_vec.transform(train_data)
# train_features = tf_trans.transform(cnt_train_features)

In [111]:
# Array[n_train_data * n_features]
train_features.shape

(80000, 9970)

In [112]:
# Extract features from test set
test_features = tf_vec.transform(test_data)

# cnt_test_features = cnt_vec.transform(test_data)
# test_features = tf_trans.transform(cnt_test_features)

In [114]:
# (Uni+Bi)-Gram
bg_tf_vec = TfidfVectorizer(ngram_range=(1,2))
bg_tf_vec.fit([' '.join(topwords)])
bg_train_features = bg_tf_vec.transform(train_data)

bg_train_features.shape

# Array[n_train_data * (uni_gram_features + bi_gram_features)]

(80000, 19939)

In [120]:
# Extract (uni+bi)-gram test features
bg_test_features = bg_tf_vec.transform(test_data)

# Training & Testing Multinomial NB

The multinomial Naive Bayes classifier is suitable for **classification with discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

## Using Uni-Gram features

In [121]:
from sklearn.naive_bayes import MultinomialNB

In [122]:
mnb_model = MultinomialNB()

In [123]:
# Train Model
start = time.time()
mnb_model.fit(train_features, train_labels)
end = time.time()

print("Multinomial NB model trained in %f seconds" % (end-start))

Multinomial NB model trained in 0.209869 seconds


In [124]:
# Predict
pred = mnb_model.predict(test_features)
print(pred)

['pos' 'pos' 'pos' ..., 'pos' 'neg' 'neg']


In [127]:
metrics.accuracy_score?

In [129]:
# Metrics
# mtrics.accuracy_score(y_true, y_pred)
accuracy = metrics.accuracy_score(y_true=test_labels, y_pred=pred)
print(accuracy)

0.79625


In [130]:
# Use keyword arguments to set arguments explicitly
print(metrics.classification_report(y_true=test_labels, y_pred=pred))

             precision    recall  f1-score   support

        neg       0.79      0.81      0.80     10000
        pos       0.80      0.78      0.79     10000

avg / total       0.80      0.80      0.80     20000



## Using Uni-Gram + Bi-Gram features

In [132]:
# Train & test using (uni+bi)-gram features
bg_mnb_model = MultinomialNB()
bg_mnb_model.fit(bg_train_features, train_labels)
bg_pred = bg_mnb_model.predict(bg_test_features)
print(bg_pred)

['pos' 'pos' 'pos' ..., 'pos' 'neg' 'neg']


In [133]:
# Statistics
bg_accuracy = metrics.accuracy_score(bg_pred,test_labels)
print(bg_accuracy)

0.7964


In [134]:
print(metrics.classification_report(y_true=test_labels, y_pred=bg_pred))

             precision    recall  f1-score   support

        neg       0.79      0.81      0.80     10000
        pos       0.80      0.79      0.79     10000

avg / total       0.80      0.80      0.80     20000



# Prediction

In [138]:
# Predict a new sentence
# vectorizer needs to be pre-fitted
# The project function signature should be something like:
# predict_new(sentent: str, vec, model) -> str

def predict_new(sentence: str):
    sentence = preprocessing(sentence)
    features = tf_vec.transform([sentence])
    pred = mnb_model.predict(features)
    return pred[0]

In [142]:
predict_new("I love it")

'pos'

# Save model

In [143]:
import pickle

# Save vectorizer
with open('tf_vec.pkl', 'wb') as pkl_file:
    pickle.dump(tf_vec, pkl_file)
    
# Save model
with open('mnb_model.pkl', 'wb') as pkl_file:
    pickle.dump(mnb_model, pkl_file)