# Import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import itertools
import matplotlib.pyplot as plt
import time
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Khaled\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Khaled\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load dataset

In [2]:
# Gzip-compressed JSON-lines file: one review record per line.
data = "dataset/Cell_Phones_and_Accessories_5.json.gz"
df = pd.read_json(
    data,
    compression="gzip",
    lines=True,
)
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,7508492919,,5,Looks even better in person. Be careful to not...,"08 4, 2014",A24E3SXTC62LJI,Claudia Valdivia,{'Color:': ' Bling'},Can't stop won't stop looking at it,1407110400,True,
1,7508492919,,5,When you don't want to spend a whole lot of ca...,"02 12, 2014",A269FLZCB4GIPV,sarah ponce,,1,1392163200,True,
2,7508492919,,3,"so the case came on time, i love the design. I...","02 8, 2014",AB6CHQWHZW4TV,Kai,,Its okay,1391817600,True,
3,7508492919,,2,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,"02 4, 2014",A1M117A53LEI8,Sharon Williams,,CASE,1391472000,True,
4,7508492919,,4,"I liked it because it was cute, but the studs ...","02 3, 2014",A272DUT8M88ZS8,Bella Rodriguez,,Cute!,1391385600,True,


In [3]:
df.columns

Index(['asin', 'image', 'overall', 'reviewText', 'reviewTime', 'reviewerID',
       'reviewerName', 'style', 'summary', 'unixReviewTime', 'verified',
       'vote'],
      dtype='object')

How many reviews are verified?

In [4]:
# Number of reviews per `verified` flag. `unixReviewTime` is only used as the
# column whose non-null entries are counted within each group.
df.groupby('verified')['unixReviewTime'].count()

verified
False    141113
True     987324
Name: unixReviewTime, dtype: int64

#### Open point: how do we address it? (If we want to do it, of course :) )
# Filtering the dataset
The first operations we will perform are removing punctuation characters and lowercasing all letters: these operations will be useful for reducing the number of features.

In [5]:
# Strip common punctuation so that e.g. "great!" and "great" become the same
# token. `regex=True` must be explicit: since pandas 2.0 the default is a
# literal (non-regex) match, which would silently leave the text unchanged.
df['reviewText'] = df['reviewText'].str.replace('[.,;:!?]+', '', regex=True)

In [6]:
df['reviewText'] = df['reviewText'].str.lower()

In [7]:
# Keep only the rows that contain a review text, then pull out the texts
# (features) and the star ratings (targets) as NumPy arrays.
has_text = df.reviewText.notnull()
X = df.reviewText[has_text].values
y = df.overall[has_text].values

In [8]:
X.shape

(1127672,)

In [9]:
y.shape

(1127672,)

How are the values split?

In [10]:
# Distinct star ratings together with the number of reviews carrying each one.
star_value, counts = np.unique(y, return_counts=True)
{star: count for star, count in zip(star_value, counts)}

{1: 81506, 2: 57166, 3: 98214, 4: 184351, 5: 706435}

In [11]:
dict(zip(star_value, counts/len(y)))

{1: 0.07227810923743784,
 2: 0.05069381876999695,
 3: 0.08709447427975511,
 4: 0.16347927411516824,
 5: 0.6264543235976419}

Imbalanced dataset!
We'd like to split the ratings as follows:
- 1,2 and 3 will be considered negative
- 4 and 5 will be considered positive

The main reason for this choice is that, on Amazon, the most restrictive rating filter is the "4 star +" one, and it is used for filtering the returned results: a given vendor would like to have her/his products still shown after this filter is applied.

In [12]:
def int2sent(n):
    """Map a star rating to a sentiment label.

    Ratings above 3 are "positive" and ratings of 3 or below are "negative",
    i.e. the same threshold as the `y > 3` mask used for the target labels.
    Integer ratings behave exactly as before; fractional ratings between 3
    and 4 are now labelled instead of silently yielding None.

    Parameters
    ----------
    n : int or float
        Star rating.

    Returns
    -------
    str or None
        "positive" or "negative"; None when `n` satisfies neither comparison
        (e.g. NaN).
    """
    if n > 3:
        return "positive"
    if n <= 3:
        return "negative"
    # Neither comparison held (NaN): keep the original "no label" result.
    return None

In [13]:
sentiment_is_positive = y > 3

In [14]:
# Class balance of the binary target: number of negative (False) and
# positive (True) reviews.
sentiment_categories, counts = np.unique(sentiment_is_positive, return_counts=True)
{category: count for category, count in zip(sentiment_categories, counts)}

{False: 236886, True: 890786}

In [15]:
dict(zip(sentiment_categories, counts/len(sentiment_is_positive)))

{False: 0.2100664022871899, True: 0.7899335977128101}

# Text Preprocessing

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Train-test split

In [17]:
# Hold out 33% of the reviews for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is not stratified although ~79% of the labels are
# positive — consider passing stratify=sentiment_is_positive.
X_train, X_test, y_train, y_test = train_test_split(X, sentiment_is_positive, test_size=0.33, random_state=42)

In [18]:
X_train[0]

'this case is nice well-made and fits snugly on the phone  i purchased this case along with another brand that was 100% tpu material  while i can see that this case is very scratch resistant i think i prefer a case that is 100% tpu because of improved grip and non-slip properties  the hard polycarbonate material on the back of this case will let the phone slide on a smooth surface like a table  i just prefer a case that has more of a "grippy" back so that the phone doesn\'t slide on a smooth surface  but that\'s just my preference'

In [19]:
y_train[0]

True

## Tokenization

In [20]:
from nltk.tokenize import word_tokenize

In [None]:
X_train_tokenized = [word_tokenize(sentence) for sentence in X_train]

In [None]:
X_test_tokenized = [word_tokenize(sentence) for sentence in X_test]

In [None]:
X_train[0]

In [None]:
X_train_tokenized[0]

In [None]:
from joblib import dump, load

In [None]:
#dump(X_train_tokenized, 'X_train_tokenized.joblib')
#dump(X_test_tokenized, 'X_test_tokenized.joblib')

In [None]:
len(X_train_tokenized)

## Stop words removal

In [None]:
from nltk.corpus import stopwords
stopws = stopwords.words("english")

Some words, however, have an important meaning for our task:

In [None]:
np.array(stopws[-36:])

In [None]:
# Drop the last 36 entries of the NLTK list (the words shown in the previous
# cell), which are meaningful for this task. Slicing a fresh copy of the list
# keeps the cell idempotent: re-running it no longer strips another 36 words.
stopws = stopwords.words("english")[:-36]

Other words with a useful meaning:

In [None]:
words2save = ["but", "while", "against", "not", "only", "very", 'don', "don't"]

In [None]:
# Take the sentiment-bearing words out of the stop list. Filtering (instead of
# list.remove) makes the cell safe to re-run and does not raise ValueError
# when one of the words is not in the list.
stopws = [word for word in stopws if word not in words2save]

In [None]:
# Remove stop words from every training review. A set gives O(1) membership
# tests; looking each token up in the list scaled with the stop-list length.
stopws_set = set(stopws)
X_train_tokenized_stop = [
    [word for word in sentence if word not in stopws_set]
    for sentence in X_train_tokenized
]

In [None]:
X_train_tokenized_stop[0]

In [None]:
# Remove stop words from every test review. A set gives O(1) membership
# tests; looking each token up in the list scaled with the stop-list length.
stopws_set = set(stopws)
X_test_tokenized_stop = [
    [word for word in sentence if word not in stopws_set]
    for sentence in X_test_tokenized
]

In [None]:
X_test_tokenized_stop[0]

## Stemming
Installing the 'pyStemmer' package via pip requires Visual C++ to be installed first (https://support.microsoft.com/it-it/help/2977003/the-latest-supported-visual-c-downloads), so I will use the nltk library instead, even though it is less efficient.
### PorterStemmer

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
# Porter-stem every token of every training review.
X_train_tokenized_stemmed_ps = [
    [ps.stem(token) for token in review]
    for review in X_train_tokenized
]

In [None]:
X_train_tokenized_stemmed_ps[0]

In [None]:
# Porter-stem every token of every test review.
X_test_tokenized_stemmed_ps = [
    [ps.stem(token) for token in review]
    for review in X_test_tokenized
]

### Lancaster stemmer

In [None]:
from nltk.stem import LancasterStemmer
ls_stemmer = LancasterStemmer()

In [None]:
# Lancaster-stem every token of every training review.
X_train_tokenized_stemmed_ls = [
    [ls_stemmer.stem(token) for token in review]
    for review in X_train_tokenized
]

In [None]:
# Lancaster-stem every token of every test review.
X_test_tokenized_stemmed_ls = [
    [ls_stemmer.stem(token) for token in review]
    for review in X_test_tokenized
]

## TF-IDF
For computing TF-IDF matrix we need to rebuild the sentences. Let's do it:

### Porter stemmer

In [None]:
# Re-assemble each stemmed token list into one space-separated string, the
# input format expected by the TF-IDF vectorizer.
X_train_tokenized_ps_sent = [" ".join(tokens) for tokens in X_train_tokenized_stemmed_ps]
X_train_tokenized_ps_sent[0]

In [None]:
# Re-assemble each stemmed test review into one space-separated string.
X_test_tokenized_ps_sent = [" ".join(tokens) for tokens in X_test_tokenized_stemmed_ps]

### LancasterStemmer

In [None]:
# Re-assemble each Lancaster-stemmed training review into one string.
X_train_tokenized_ls_sent = [" ".join(tokens) for tokens in X_train_tokenized_stemmed_ls]
X_train_tokenized_ls_sent[0]

In [None]:
# Re-assemble each Lancaster-stemmed test review into one string.
X_test_tokenized_ls_sent = [" ".join(tokens) for tokens in X_test_tokenized_stemmed_ls]

## Compute TF-IDF matrix
To avoid a high number of features, I will set the following two constraints:
- A term must appear in at least 5 documents of the corpus (`min_df=5`)
- Only the 50 000 most frequent terms are kept (`max_features=50000`)

### Porter stemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=5 drops terms that occur in fewer than 5 documents; max_features caps
# the vocabulary at the 50,000 terms with the highest corpus frequency.
tfidf_vect_ps = TfidfVectorizer(min_df= 5, max_features = 50000)
# Vocabulary and IDF weights are learned on the training reviews only.
X_train_tfidf_ps = tfidf_vect_ps.fit_transform(X_train_tokenized_ps_sent)

In [None]:
print(X_train_tfidf_ps)

In [None]:
dump(tfidf_vect_ps, 'tfidf_vect_ps.joblib')

In [None]:
X_test_tfidf_ps = tfidf_vect_ps.transform(X_test_tokenized_ps_sent)

In [None]:
# Number of TF-IDF features. `get_feature_names()` was removed in
# scikit-learn 1.2; the fitted `vocabulary_` mapping holds one entry per
# feature and is available in every version.
len(tfidf_vect_ps.vocabulary_)

### Lancaster stemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=5 drops terms that occur in fewer than 5 documents; max_features caps
# the vocabulary at the 50,000 terms with the highest corpus frequency.
tfidf_vect_ls = TfidfVectorizer(min_df= 5, max_features = 50000)
# Vocabulary and IDF weights are learned on the training reviews only.
X_train_tfidf_ls = tfidf_vect_ls.fit_transform(X_train_tokenized_ls_sent)

In [None]:
dump(tfidf_vect_ls, 'tfidf_vect_ls.joblib')

In [None]:
X_test_tfidf_ls = tfidf_vect_ls.transform(X_test_tokenized_ls_sent)

In [None]:
# Number of TF-IDF features. `get_feature_names()` was removed in
# scikit-learn 1.2; the fitted `vocabulary_` mapping holds one entry per
# feature and is available in every version.
len(tfidf_vect_ls.vocabulary_)

### Store stemmed dataset

In [None]:
dump(X_train_tfidf_ps, 'X_train_tfidf_ps.joblib')
dump(X_test_tfidf_ps, 'X_test_tfidf_ps.joblib')

In [None]:
dump(X_train_tfidf_ls, 'X_train_tfidf_ls.joblib')
dump(X_test_tfidf_ls, 'X_test_tfidf_ls.joblib')

# Classifiers
## Multinomial Naive-Bayes
### Porter

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf_ps, y_train)

### Evaluate performances

In [None]:
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score

In [None]:
train_score = clf.score(X_train_tfidf_ps, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ps, y_test)    # Test Accuracy

In [None]:
print(train_score)

In [None]:
print(test_score)

In [None]:
# Test-set metrics for the Multinomial NB model trained on Porter-stemmed text.
predictions = clf.predict(X_test_tfidf_ps)
prec = precision_score(y_test, predictions) # Precision
rec = recall_score(y_test, predictions) # Recall
f1 = f1_score(y_test, predictions) # F1
# `beta` must be passed by keyword: scikit-learn >= 1.0 raises a TypeError
# when it is given positionally.
f2 = fbeta_score(y_test, predictions, beta=2) # F2
cm = confusion_matrix(y_test, predictions)

In [None]:
cm

In [None]:
proba = clf.predict_proba(X_test_tfidf_ps)
precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:,1])

In [None]:
auc_score = auc(recall, precision)

In [None]:
# Print one "label value" row per metric, label padded to 20 characters.
scores_strings = [
    "Train Accuracy", "Test Accuracy", "Test Precision",
    "Test Recall", "F1", "F2", "P/R AUC",
]
scores = [train_score, test_score, prec, rec, f1, f2, auc_score]
rows = ["{:20s} {:.5f}".format(label, value) for label, value in zip(scores_strings, scores)]
print("\n".join(rows))

In [None]:
# Precision-recall curve, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(recall, precision, label='Precision-Recall curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Precision-Recall Curve: AUC=%0.2f' % auc_score)
ax.legend(loc="lower left")
plt.show()

In [None]:
print(classification_report(y_test,predictions))

### Lancaster

In [None]:
clf = MultinomialNB()
clf.fit(X_train_tfidf_ls, y_train)

In [None]:
train_score = clf.score(X_train_tfidf_ls, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ls, y_test)    # Test Accuracy

In [None]:
train_score

In [None]:
test_score

In [None]:
# Test-set metrics for the Multinomial NB model trained on Lancaster-stemmed text.
predictions = clf.predict(X_test_tfidf_ls)
prec = precision_score(y_test, predictions) # Precision
rec = recall_score(y_test, predictions) # Recall
f1 = f1_score(y_test, predictions) # F1
# `beta` must be passed by keyword: scikit-learn >= 1.0 raises a TypeError
# when it is given positionally.
f2 = fbeta_score(y_test, predictions, beta=2) # F2
cm = confusion_matrix(y_test, predictions)
# Probability of the positive class (column 1) drives the PR curve.
proba = clf.predict_proba(X_test_tfidf_ls)
precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:,1])

In [None]:
auc_score = auc(recall, precision)

In [None]:
# Print one "label value" row per metric, label padded to 20 characters.
scores_strings = [
    "Train Accuracy", "Test Accuracy", "Test Precision",
    "Test Recall", "F1", "F2", "P/R AUC",
]
scores = [train_score, test_score, prec, rec, f1, f2, auc_score]
rows = ["{:20s} {:.5f}".format(label, value) for label, value in zip(scores_strings, scores)]
print("\n".join(rows))

In [None]:
# Precision-recall curve, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(recall, precision, label='Precision-Recall curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Precision-Recall Curve: AUC=%0.2f' % auc_score)
ax.legend(loc="lower left")
plt.show()

In [None]:
print(classification_report(y_test,predictions))

## Random forest
### Porter

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=2)
clf.fit(X_train_tfidf_ps, y_train) # it takes around 30 minutes

In [None]:
train_score = clf.score(X_train_tfidf_ps, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ps, y_test)    # Test Accuracy

In [None]:
predictions = clf.predict(X_test_tfidf_ps)

In [None]:
print(classification_report(y_test,predictions))

Results are more encouraging! The problem is that it's way slower than Multinomial NB.
## TruncatedSVD
The X_train matrix has around 20k features: to speed up the training phase, it may be useful to apply a dimensionality reduction method, whose goal is to preserve "expressive power" while reducing the dataset dimensionality.
Because the TF-IDF matrix is sparse, one of the best methods for performing dimensionality reduction is "TruncatedSVD".

In [None]:
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=500, random_state=42)
X_train_tfidf_ps_svd = tsvd.fit_transform(X_train_tfidf_ps)
X_test_tfidf_ps_svd = tsvd.transform(X_test_tfidf_ps)

In [None]:
X_train_tfidf_ps

In [None]:
# Read `.shape` directly: wrapping a SciPy sparse matrix in np.array() yields
# a 0-d object array, so the original call printed `()` instead of the real
# (n_samples, n_features) dimensions.
print("train with old features: ", X_train_tfidf_ps.shape)
print("train with new features:", X_train_tfidf_ps_svd.shape)

### Store SVD-transformed dataset

In [None]:
dump(X_train_tfidf_ps_svd, 'X_train_tfidf_ps_svd.joblib')
dump(X_test_tfidf_ps_svd, 'X_test_tfidf_ps_svd.joblib')

### Classifiers
Multinomial NB won't be used here because it requires non-negative feature values, while the SVD-transformed features can be negative: https://stackoverflow.com/questions/24169238/dealing-with-negative-values-in-sklearn-multinomialnb
#### Randomforest

In [None]:
# This cell trains a random forest, so import the estimator it actually uses.
from sklearn.ensemble import RandomForestClassifier
# Kept from the original cell: MultinomialNB is used again further down.
from sklearn.naive_bayes import MultinomialNB

# Same forest configuration as on the full TF-IDF matrix, now fitted on the
# SVD-reduced features.
clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, verbose=2)
clf.fit(X_train_tfidf_ps_svd, y_train)

In [None]:
predictions = clf.predict(X_test_tfidf_ps_svd)
print(classification_report(y_test,predictions))

## Use sentences without stop words

### Stemming

In [None]:
# Porter-stem the training reviews after stop-word removal.
X_train_tokenized_stemmed_ps_stop = [
    [ps.stem(token) for token in review]
    for review in X_train_tokenized_stop
]

In [None]:
# Porter-stem the test reviews after stop-word removal.
X_test_tokenized_stemmed_ps_stop = [
    [ps.stem(token) for token in review]
    for review in X_test_tokenized_stop
]

### TF-IDF

In [None]:
# Re-assemble each stop-word-free, stemmed training review into one string.
X_train_tokenized_ps_sent_stop = [" ".join(tokens) for tokens in X_train_tokenized_stemmed_ps_stop]
X_train_tokenized_ps_sent_stop[0]

In [None]:
# Re-assemble each stop-word-free, stemmed test review into one string.
X_test_tokenized_ps_sent_stop = [" ".join(tokens) for tokens in X_test_tokenized_stemmed_ps_stop]
X_test_tokenized_ps_sent_stop[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=5 drops terms that occur in fewer than 5 documents; max_features caps
# the vocabulary at the 50,000 terms with the highest corpus frequency.
tfidf_vect_ps_stop = TfidfVectorizer(min_df= 5, max_features = 50000)
# Vocabulary and IDF weights are learned on the training reviews only.
X_train_tfidf_ps_stop = tfidf_vect_ps_stop.fit_transform(X_train_tokenized_ps_sent_stop)

In [None]:
X_test_tfidf_ps_stop = tfidf_vect_ps_stop.transform(X_test_tokenized_ps_sent_stop)

### Multinomial NB

In [None]:
clf = MultinomialNB()
clf.fit(X_train_tfidf_ps_stop, y_train)

In [None]:
X_train_tfidf_ps_stop.shape

In [None]:
X_test_tfidf_ps_stop.shape

In [None]:
# Test-set metrics for the Multinomial NB model trained on stop-word-free text.
predictions = clf.predict(X_test_tfidf_ps_stop)
prec = precision_score(y_test, predictions) # Precision
rec = recall_score(y_test, predictions) # Recall
f1 = f1_score(y_test, predictions) # F1
# `beta` must be passed by keyword: scikit-learn >= 1.0 raises a TypeError
# when it is given positionally.
f2 = fbeta_score(y_test, predictions, beta=2) # F2
cm = confusion_matrix(y_test, predictions)
# Probability of the positive class (column 1) drives the PR curve.
proba = clf.predict_proba(X_test_tfidf_ps_stop)
precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:,1])

In [None]:
auc_score = auc(recall, precision)
# Recompute the accuracies for THIS model. Without this, the table would show
# the train/test scores left over from the Random Forest section, because
# they were never updated for the stop-word-free Multinomial NB.
train_score = clf.score(X_train_tfidf_ps_stop, y_train) # Train Accuracy
test_score = clf.score(X_test_tfidf_ps_stop, y_test)    # Test Accuracy
scores_strings = ["Train Accuracy", "Test Accuracy", "Test Precision",
                  "Test Recall", "F1", "F2", "P/R AUC"]
scores = [train_score, test_score, prec, rec, f1, f2, auc_score]
# One "label value" row per metric, label padded to 20 characters.
print(("{:20s} {:.5f}\n"*7)[:-1].format(*itertools.chain(*zip(scores_strings, scores))))

In [None]:
# Precision-recall curve, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(recall, precision, label='Precision-Recall curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Precision-Recall Curve: AUC=%0.2f' % auc_score)
ax.legend(loc="lower left")
plt.show()

In [None]:
print(classification_report(y_test,predictions))