In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Silence warnings for the rest of the session. With no category argument,
# simplefilter("ignore") applies to every warning class, not just "some".
# NOTE(review): this also hides deprecation notices and pandas
# chained-assignment warnings; consider narrowing it to specific categories.
import warnings
warnings.simplefilter(action = "ignore")

In [2]:
# Read the tab-separated review file and keep only the rows that have a value
# in `reviewText_new`.
df = pd.read_csv('perfectly_DigitalMusic.csv', sep="\t").dropna(subset=['reviewText_new'])

# Remove every column whose name contains "unnamed" (case-insensitive).
unnamed_columns = [column for column in df.columns if 'unnamed' in column.lower()]
df = df.drop(columns=unnamed_columns)

In [3]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,asin,overall,summary,reviewText,reviewText_new
0,5555991584,5.0,Enya's last great album,"It's hard to believe ""Memory of Trees"" came ou...","hard believe ""memory trees"" came 11 years ago;..."
1,5555991584,5.0,Enya at her most elegant,"A clasically-styled and introverted album, Mem...","clasically-styled introverted album, memory tr..."


In [4]:
# Build the modelling frame: the two text features plus a boolean `sentiment`
# label derived from the star rating in `overall`.
#
# The frame is produced in a single chain so the new column is created on a
# fresh object rather than assigned onto a slice of `df`. Assigning onto a slice
# is the pattern that raises pandas' SettingWithCopyWarning, which the blanket
# warnings filter in this notebook would otherwise hide.
data = (
    # ignore all 3* reviews
    df.loc[df["overall"] != 3, ['reviewText_new', 'summary', 'overall']]
    # positive sentiment = 4* or 5* reviews
    .assign(sentiment=lambda frame: frame["overall"] >= 4)
    # the raw rating is no longer needed once the label exists
    .drop(columns=['overall'])
)

### tf-idf vectorizer

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

reviewText_new, summary, sentiment = (data.reviewText_new, data.summary, data.sentiment)

# *------ Hold-out split ------*
# 80% train / 20% test for each text feature against the sentiment labels.
# Both calls use the same test_size and random_state.
review_x_train, review_x_test, review_y_train, review_y_test = train_test_split(
    reviewText_new, sentiment, test_size=0.2, random_state=42)
summary_x_train, summary_x_test, summary_y_train, summary_y_test = train_test_split(
    summary, sentiment, test_size=0.2, random_state=42)


def make_tfidf_vectorizer():
    """Return a new, unfitted TfidfVectorizer with this notebook's settings.

    A fresh instance is created for every fit so that no fitted vocabulary is
    overwritten by a later fit on different text.
    """
    return TfidfVectorizer(
        # max_df removes terms that appear too frequently, also known as
        # "corpus-specific stop words". For example:
        #   max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        #   max_df = 25 means "ignore terms that appear in more than 25 documents".
        max_df=0.5,
        stop_words='english',
        sublinear_tf=True,
        use_idf=True,
        lowercase=True)


# *------ Full-corpus matrices (input to k-fold cross validation) ------*
# NOTE(review): these vectorizers are fitted on ALL documents, so the vocabulary
# and IDF weights already include the rows that later act as validation folds.
# Cross-validation scores computed on these matrices are therefore slightly
# optimistic; wrapping vectorizer + classifier in a sklearn Pipeline and
# cross-validating on the raw text would avoid that leakage.
REVIEW_x_vectors = make_tfidf_vectorizer().fit_transform(reviewText_new)
SUMMARY_x_vectors = make_tfidf_vectorizer().fit_transform(summary)

# *------ Hold-out matrices ------*
# fit_transform() learns the vocabulary/IDF weights from the training text and
# returns the transformed training matrix; transform() then maps the test text
# onto that same learned vocabulary without re-fitting.
review_vectorizer = make_tfidf_vectorizer()
REVIEW_x_train_vectors = review_vectorizer.fit_transform(review_x_train)
REVIEW_x_test_vectors  = review_vectorizer.transform(review_x_test)

summary_vectorizer = make_tfidf_vectorizer()
SUMMARY_x_train_vectors = summary_vectorizer.fit_transform(summary_x_train)
SUMMARY_x_test_vectors  = summary_vectorizer.transform(summary_x_test)

# Kept for backward compatibility: `vectorizer` used to be a single instance
# that ended this cell fitted on the summary training text.
vectorizer = summary_vectorizer


# Time to classify by applying some models

## Classifying with the cleaned review text feature (`reviewText_new`)

In [None]:
from operator import itemgetter

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Compare classifiers on the review-text TF-IDF features using both a hold-out
# split and 5-fold cross validation.
names = ["MultinomialNB", "Linear SVM"]
classifiers = [
    MultinomialNB(),
    # NOTE(review): the saved output of this cell stops after the Linear SVM
    # hold-out message, so the SVC fit appears not to have finished.
    # LinearSVC may be a faster alternative on a corpus this size, but it is
    # a different model, so it is not swapped in here - confirm before changing.
    SVC(kernel="linear", C=0.025)
    ]

# classifier name -> accuracy
results = {}          # hold-out accuracy on the 20% test split
k_fold_results = {}   # mean accuracy over the 5 cross-validation folds

for name, clf in zip(names, classifiers):
    print ("Training " + name + " classifier...")
    print("Doing Houd-out...")
    clf.fit(REVIEW_x_train_vectors, review_y_train)
    score = clf.score(REVIEW_x_test_vectors, review_y_test)
    results[name] = score
    print("Doing K-fold cross validation...")
    value_cross_val_score = cross_val_score(clf, REVIEW_x_vectors, sentiment, cv=5).mean()
    k_fold_results[name]= value_cross_val_score

print ("---------------------------")
print ("Evaluation results")
print ("---------------------------")

# Print results ordered by hold-out accuracy (ascending). The sorted pairs are
# iterated directly: sorted() returns a new list and leaves `results` untouched,
# so calling it without using the return value has no effect on print order.
for name, holdout_accuracy in sorted(results.items(), key=itemgetter(1)):
    print ("Hold-out: ",name + " accuracy: %0.3f" % holdout_accuracy)
    print ("Cross validation 5-fold: ",name + " accuracy: %0.3f" % k_fold_results[name])

Training MultinomialNB classifier...
Doing Houd-out...
Doing K-fold cross validation...
Training Linear SVM classifier...
Doing Houd-out...


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the review-text features.
# (Despite "gauss" in the variable name, the estimator is MultinomialNB.)
REVIEW_gauss_clf = MultinomialNB().fit(REVIEW_x_train_vectors, review_y_train)

# Accuracy on the held-out test split.
review_holdout_accuracy = REVIEW_gauss_clf.score(REVIEW_x_test_vectors, review_y_test)
print('Value after using hold-out: ', review_holdout_accuracy)

# Mean accuracy across 10 cross-validation folds on the full-corpus matrix.
review_fold_accuracies = cross_val_score(REVIEW_gauss_clf, REVIEW_x_vectors, sentiment, cv=10)
print('Value after using cross validation with 10-fold:',
      review_fold_accuracies.mean())

Value after using hold-out:  0.904005524862
Value after using cross validation with 10-fold: 0.899321460302


## Classifying with the `summary` feature

In [None]:
from operator import itemgetter

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Compare classifiers on the summary TF-IDF features using both a hold-out
# split and 5-fold cross validation.
names = ["MultinomialNB", "Linear SVM"]
classifiers = [
    MultinomialNB(),
    SVC(kernel="linear", C=0.025)
    ]

# classifier name -> accuracy
results = {}          # hold-out accuracy on the 20% test split
k_fold_results = {}   # mean accuracy over the 5 cross-validation folds

for name, clf in zip(names, classifiers):
    print ("Training " + name + " classifier...")
    print("Doing Houd-out...")
    clf.fit(SUMMARY_x_train_vectors, summary_y_train)
    score = clf.score(SUMMARY_x_test_vectors, summary_y_test)
    results[name] = score
    print("Doing K-fold cross validation...")
    # Cross-validate the classifier of the current iteration. Passing a fixed,
    # separately defined estimator here would record the same model's score
    # under every name (or fail if that estimator does not exist yet).
    value_cross_val_score = cross_val_score(clf, SUMMARY_x_vectors, sentiment, cv=5).mean()
    k_fold_results[name]= value_cross_val_score

print ("---------------------------")
print ("Evaluation results")
print ("---------------------------")

# Print results ordered by hold-out accuracy (ascending). The sorted pairs are
# iterated directly: sorted() returns a new list and leaves `results` untouched,
# so calling it without using the return value has no effect on print order.
for name, holdout_accuracy in sorted(results.items(), key=itemgetter(1)):
    print ("Hold-out: ",name + " accuracy: %0.3f" % holdout_accuracy)
    print ("Cross validation 5-fold: ",name + " accuracy: %0.3f" % k_fold_results[name])

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the summary features.
# (Despite "gauss" in the variable name, the estimator is MultinomialNB.)
SUMMARY_gauss_clf = MultinomialNB().fit(SUMMARY_x_train_vectors, summary_y_train)

# Accuracy on the held-out test split.
summary_holdout_accuracy = SUMMARY_gauss_clf.score(SUMMARY_x_test_vectors, summary_y_test)
print('Value after using hold-out: ', summary_holdout_accuracy)

# Mean accuracy across 10 cross-validation folds on the full-corpus matrix.
summary_fold_accuracies = cross_val_score(SUMMARY_gauss_clf, SUMMARY_x_vectors, sentiment, cv=10)
print('Value after using cross validation with 10-fold:',
      summary_fold_accuracies.mean())

Value after using hold-out:  0.913328729282
Value after using cross validation with 10-fold: 0.907591890467
