# Import libraries

In [24]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
import itertools
import matplotlib.pyplot as plt
import time
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /Users/kappa/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Load dataset

In [2]:
data = "dataset/Cell_Phones_and_Accessories_5.json.gz"
df = pd.read_json(data, lines = True, compression = "gzip")
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,
1,5,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,
2,3,True,"02 8, 2014",AB6CHQWHZW4TV,7508492919,,Kai,"so the case came on time, i love the design. I...",Its okay,1391817600,,
3,2,True,"02 4, 2014",A1M117A53LEI8,7508492919,,Sharon Williams,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,CASE,1391472000,,
4,4,True,"02 3, 2014",A272DUT8M88ZS8,7508492919,,Bella Rodriguez,"I liked it because it was cute, but the studs ...",Cute!,1391385600,,


In [3]:
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

How many reviews are verified?

In [4]:
df.groupby('verified')['unixReviewTime'].count()

verified
False    141113
True     987324
Name: unixReviewTime, dtype: int64

#### Open point: how do we address it? (If we want to do it, of course :) )
# Filtering the dataset
The first operation we will perform is the removal of punctuation characters and lowercase all letters: these operation will be useful for reducing the number of features

In [5]:
df['reviewText'] = df['reviewText'].str.replace('[.,;:;!?]+', '')

In [6]:
df['reviewText'] = df['reviewText'].str.lower()

In [7]:
X = df.reviewText[df.reviewText.notnull()].values
y = df.overall[df.reviewText.notnull()].values

In [8]:
X.shape

(1127672,)

In [9]:
y.shape

(1127672,)

How are the values split?

In [10]:
star_value, counts = np.unique(y, return_counts=True)
dict(zip(star_value, counts))

{1: 81506, 2: 57166, 3: 98214, 4: 184351, 5: 706435}

In [11]:
dict(zip(star_value, counts/len(y)))

{1: 0.07227810923743784,
 2: 0.05069381876999695,
 3: 0.08709447427975511,
 4: 0.16347927411516824,
 5: 0.6264543235976419}

Imbalanced dataset!
We'd like to split the ratings as follows:
- 1,2 and 3 will be considered negative
- 4 and 5 will be considered positive

The main reason why we'd like to proceed as follows is that, on Amazon, the most restrictive filter is the "4 star +" one and is used for filtering the returned results: a given vendor would like to have her/his products shown after this phase.

In [12]:
def int2sent(n):
    if n >= 4:
        return "positive"
    if n <= 3:
        return "negative"

In [13]:
sentiment_is_positive = y > 3

In [14]:
sentiment_categories, counts = np.unique(sentiment_is_positive, return_counts=True)
dict(zip(sentiment_categories, counts))

{False: 236886, True: 890786}

In [15]:
dict(zip(sentiment_categories, counts/len(sentiment_is_positive)))

{False: 0.2100664022871899, True: 0.7899335977128101}

# Text Preprocessing

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Train-test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, sentiment_is_positive, test_size=0.33, random_state=42)

In [18]:
X_train[0]

'this case is nice well-made and fits snugly on the phone  i purchased this case along with another brand that was 100% tpu material  while i can see that this case is very scratch resistant i think i prefer a case that is 100% tpu because of improved grip and non-slip properties  the hard polycarbonate material on the back of this case will let the phone slide on a smooth surface like a table  i just prefer a case that has more of a "grippy" back so that the phone doesn\'t slide on a smooth surface  but that\'s just my preference'

In [19]:
y_train[0]

True

## Tokenization

In [20]:
from nltk.tokenize import word_tokenize

In [25]:
X_train_tokenized = [word_tokenize(sentence) for sentence in X_train]

In [26]:
X_test_tokenized = [word_tokenize(sentence) for sentence in X_test]

In [27]:
X_train[0]

'this case is nice well-made and fits snugly on the phone  i purchased this case along with another brand that was 100% tpu material  while i can see that this case is very scratch resistant i think i prefer a case that is 100% tpu because of improved grip and non-slip properties  the hard polycarbonate material on the back of this case will let the phone slide on a smooth surface like a table  i just prefer a case that has more of a "grippy" back so that the phone doesn\'t slide on a smooth surface  but that\'s just my preference'

In [28]:
X_train_tokenized[0]

['this',
 'case',
 'is',
 'nice',
 'well-made',
 'and',
 'fits',
 'snugly',
 'on',
 'the',
 'phone',
 'i',
 'purchased',
 'this',
 'case',
 'along',
 'with',
 'another',
 'brand',
 'that',
 'was',
 '100',
 '%',
 'tpu',
 'material',
 'while',
 'i',
 'can',
 'see',
 'that',
 'this',
 'case',
 'is',
 'very',
 'scratch',
 'resistant',
 'i',
 'think',
 'i',
 'prefer',
 'a',
 'case',
 'that',
 'is',
 '100',
 '%',
 'tpu',
 'because',
 'of',
 'improved',
 'grip',
 'and',
 'non-slip',
 'properties',
 'the',
 'hard',
 'polycarbonate',
 'material',
 'on',
 'the',
 'back',
 'of',
 'this',
 'case',
 'will',
 'let',
 'the',
 'phone',
 'slide',
 'on',
 'a',
 'smooth',
 'surface',
 'like',
 'a',
 'table',
 'i',
 'just',
 'prefer',
 'a',
 'case',
 'that',
 'has',
 'more',
 'of',
 'a',
 '``',
 'grippy',
 "''",
 'back',
 'so',
 'that',
 'the',
 'phone',
 'does',
 "n't",
 'slide',
 'on',
 'a',
 'smooth',
 'surface',
 'but',
 'that',
 "'s",
 'just',
 'my',
 'preference']

In [26]:
from joblib import dump, load

In [27]:
#dump(X_train_tokenized, 'X_train_tokenized.joblib')
#dump(X_test_tokenized, 'X_test_tokenized.joblib')

In [28]:
len(X_train_tokenized)

755540

## Stop words removal

In [None]:
from nltk.corpus import stopwords
stopws = stopwords.words("english")

Some words, however, have an important meaning for our task:

In [None]:
np.array(stopws[-36:])

In [None]:
stopws = stopws[:-36]

Other words with a useful meaning:

In [None]:
words2save = ["but", "while", "against", "not", "only", "very", 'don', "don't"]

In [None]:
for w in words2save:
    stopws.remove(w)

## Stemming
Because it is necessary to install Visual C++ before installing the package 'pyStemmer' via pip https://support.microsoft.com/it-it/help/2977003/the-latest-supported-visual-c-downloads I will use nltk library even though it is less efficient
### PorterStemmer

In [29]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
X_train_tokenized_stemmed_ps = []
for sentence in X_train_tokenized:
    X_train_tokenized_stemmed_ps.append([ps.stem(word) for word in sentence])

In [None]:
X_train_tokenized_stemmed_ps[0]

In [None]:
X_test_tokenized_stemmed_ps = []
for sentence in X_test_tokenized:
    X_test_tokenized_stemmed_ps.append([ps.stem(word) for word in sentence])

### Lancaster stemmer

In [None]:
from nltk.stem import LancasterStemmer
ls_stemmer = LancasterStemmer()

In [None]:
X_train_tokenized_stemmed_ls = []
for sentence in X_train_tokenized:
    X_train_tokenized_stemmed_ls.append([ls_stemmer.stem(word) for word in sentence])

In [None]:
X_test_tokenized_stemmed_ls = []
for sentence in X_test_tokenized:
    X_test_tokenized_stemmed_ls.append([ls_stemmer.stem(word) for word in sentence])

## TF-IDF
For computing TF-IDF matrix we need to rebuild the sentences. Let's do it:

### Porter stemmer

In [None]:
X_train_tokenized_ps_sent = []
for sentence in X_train_tokenized_stemmed_ps:
    X_train_tokenized_ps_sent.append(" ".join(sentence))
X_train_tokenized_ps_sent[0]

In [None]:
X_test_tokenized_ps_sent = []
for sentence in X_test_tokenized_stemmed_ps:
    X_test_tokenized_ps_sent.append(" ".join(sentence))

### LancasterStemmer

In [None]:
X_train_tokenized_ls_sent = []
for sentence in X_train_tokenized_stemmed_ls:
    X_train_tokenized_ls_sent.append(" ".join(sentence))
X_train_tokenized_ls_sent[0]

In [None]:
X_test_tokenized_ls_sent = []
for sentence in X_test_tokenized_stemmed_ls:
    X_test_tokenized_ls_sent.append(" ".join(sentence))

## Compute TF-IDF matrix
For avoiding a high number of features I will set the following two constrainst:
- A term should have a frequency >= 5 in the entire corpus
- The best 50 000 features are kept

### Porter stemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect_ps = TfidfVectorizer(min_df= 5, max_features = 50000)
X_train_tfidf_ps = tfidf_vect_ps.fit_transform(X_train_tokenized_ps_sent)

In [None]:
print(X_train_tfidf_ps)

In [None]:
dump(tfidf_vect_ps, 'tfidf_vect_ps.joblib')

In [None]:
X_test_tfidf_ps = tfidf_vect_ps.transform(X_test_tokenized_ps_sent)

In [None]:
len(tfidf_vect_ps.get_feature_names())

### Lancaster stemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect_ls = TfidfVectorizer(min_df= 5, max_features = 50000)
X_train_tfidf_ls = tfidf_vect_ls.fit_transform(X_train_tokenized_ls_sent)

In [None]:
dump(tfidf_vect_ls, 'tfidf_vect_ls.joblib')

In [None]:
X_test_tfidf_ls = tfidf_vect_ls.transform(X_test_tokenized_ls_sent)

In [None]:
len(tfidf_vect_ls.get_feature_names())

### Store stemmed dataset

In [None]:
dump(X_train_tfidf_ps, 'X_train_tfidf_ps.joblib')
dump(X_test_tfidf_ps, 'X_test_tfidf_ps.joblib')

In [None]:
dump(X_train_tfidf_ls, 'X_train_tfidf_ls.joblib')
dump(X_test_tfidf_ls, 'X_test_tfidf_ls.joblib')

# Classifiers
## Multinomial Naive-Bayes
### Porter

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf_ps, y_train)

### Evaluate performances

In [None]:
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score

In [None]:
train_score = clf.score(X_train_tfidf_ps, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ps, y_test)    # Test Accuracy

In [None]:
print(train_score)

In [None]:
print(test_score)

In [None]:
predictions = clf.predict(X_test_tfidf_ps)
prec = precision_score(y_test, predictions) # Precision
rec = recall_score(y_test, predictions) # Recall
f1 = f1_score(y_test, predictions) # F1
f2 = fbeta_score(y_test, predictions, 2) # F2
cm = confusion_matrix(y_test, predictions)

In [None]:
cm

In [None]:
proba = clf.predict_proba(X_test_tfidf_ps)
precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:,1])

In [None]:
auc_score = auc(recall, precision)

In [None]:
scores_strings = ["Train Accuracy", "Test Accuracy", "Test Precision",
                  "Test Recall", "F1", "F2", "P/R AUC"]
scores = [train_score, test_score, prec, rec, f1, f2, auc_score]
print(("{:20s} {:.5f}\n"*7)[:-1].format(*itertools.chain(*zip(scores_strings, scores))))

In [None]:
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve: AUC=%0.2f' % auc_score)
plt.legend(loc="lower left")
plt.show()

In [None]:
print(classification_report(y_test,predictions))

### Lancaster

In [None]:
clf = MultinomialNB()
clf.fit(X_train_tfidf_ls, y_train)

In [None]:
train_score = clf.score(X_train_tfidf_ls, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ls, y_test)    # Test Accuracy

In [None]:
train_score

In [None]:
test_score

In [None]:
predictions = clf.predict(X_test_tfidf_ls)
prec = precision_score(y_test, predictions) # Precision
rec = recall_score(y_test, predictions) # Recall
f1 = f1_score(y_test, predictions) # F1
f2 = fbeta_score(y_test, predictions, 2) # F2
cm = confusion_matrix(y_test, predictions)
proba = clf.predict_proba(X_test_tfidf_ls)
precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:,1])

In [None]:
auc_score = auc(recall, precision)

In [None]:
scores_strings = ["Train Accuracy", "Test Accuracy", "Test Precision",
                  "Test Recall", "F1", "F2", "P/R AUC"]
scores = [train_score, test_score, prec, rec, f1, f2, auc_score]
print(("{:20s} {:.5f}\n"*7)[:-1].format(*itertools.chain(*zip(scores_strings, scores))))

In [None]:
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve: AUC=%0.2f' % auc_score)
plt.legend(loc="lower left")
plt.show()

In [None]:
print(classification_report(y_test,predictions))

## Random forest
### Porter

In [74]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=2)
clf.fit(X_train_tfidf_ps, y_train) # it takes around 30 minutes

building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 23.6min


building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 33.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=2, warm_start=False)

In [75]:
train_score = clf.score(X_train_tfidf_ps, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ps, y_test)    # Test Accuracy

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    8.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   11.5s finished
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    5.7s finished


In [76]:
predictions = clf.predict(X_test_tfidf_ps)

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    5.7s finished


In [77]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

      False       0.83      0.49      0.62     78224
       True       0.88      0.97      0.92    293908

avg / total       0.87      0.87      0.86    372132



Results are more encouraging! The problem is that it's way slower than Multinomial NB.
## TruncatedSVD
The X_train vector has around 20k features: for speeding up the training phase it may be good to use dimensionality reduction methods. Their goal is to preserve "expressive power" while reducing dataset dimensionality.
Because the TFIDF matrix is a sparse one, one of the best method for performing dimensionality reduction is "TruncatedSVD"

In [78]:
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=500, random_state=42)
X_train_tfidf_ps_svd = tsvd.fit_transform(X_train_tfidf_ps)
X_test_tfidf_ps_svd = tsvd.transform(X_test_tfidf_ps)

In [79]:
X_train_tfidf_ps

<755540x21763 sparse matrix of type '<class 'numpy.float64'>'
	with 14537542 stored elements in Compressed Sparse Row format>

In [80]:
print("train with old features: ",np.array(X_train_tfidf_ps).shape)
print("train with new features:" ,np.array(X_train_tfidf_ps_svd).shape)

train with old features:  ()
train with new features: (755540, 500)


### Store SVD-transformed dataset

In [81]:
dump(X_train_tfidf_ps_svd, 'X_train_tfidf_ps_svd.joblib')
dump(X_test_tfidf_ps_svd, 'X_test_tfidf_ps_svd.joblib')

['X_test_tfidf_ps_pca.joblib']

### Classifiers
Multinomial NB won't be used for the following reasons: https://stackoverflow.com/questions/24169238/dealing-with-negative-values-in-sklearn-multinomialnb
#### Randomforest

In [82]:
from sklearn.naive_bayes import MultinomialNB

clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=2)
clf.fit(X_train_tfidf_ps_svd, y_train)

building tree 1 of 50building tree 2 of 50
building tree 3 of 50
building tree 4 of 50

building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.7min


building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=2, warm_start=False)

In [83]:
predictions = clf.predict(X_test_tfidf_ps_svd)
print(classification_report(y_test,predictions))

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.8s


             precision    recall  f1-score   support

      False       0.82      0.32      0.46     78224
       True       0.85      0.98      0.91    293908

avg / total       0.84      0.84      0.81    372132



[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    4.1s finished
