In [31]:
import nltk, re, csv,time
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.corpus import stopwords

from gensim import corpora

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import pickle
import pandas as pd
import re
import statistics
import random

In [2]:
main_df = pd.read_csv('data/preprocessed_reviewinfo.csv')
main_df['processed_content'] = ''
main_df.drop(['Title', 'Author', 'ReviewID', 'Overall', 'Date', 'polarity'], axis=1, inplace=True)

## Choosing and building classification models

In [None]:
### DO NOT RUN THIS CODE IF YOU HAVE NO TIME ### REFER TO THE CODE BELOW TO GET THE PROCESSED REVIEWS ###
# Processsing for stopwords, alphabetic words, Stemming 
start = time.time()

# lower case all the words in the doc
main_df['Content'] = [doc.lower() for doc in main_df['Content']]

#tokenize the words
main_df['Content']= [word_tokenize(doc) for doc in main_df['Content']]

# Remove all non-words tokens
main_df['Content'] = [[w for w in doc if re.search('^[a-z]+$',w)] for doc in main_df['Content']]

# Removing stop words
stop_list = nltk.corpus.stopwords.words('english')
main_df['Content'] = [[w for w in doc if w not in stop_list] for doc in main_df['Content']]

# Stemming on words
stemmer = PorterStemmer()
stemmed_doc = [[stemmer.stem(w) for w in doc] for doc in main_df['Content']]

#store the processed doc
index = 0
for doc in stemmed_doc:
    main_df.loc[index,'processed_content'] = str(doc)
    index = index + 1

end = time.time()
print("time taken: " + str((end - start)) + " secs")

# Export out the processed reviews to avoid processing again
main_df.to_csv(r'data/processed_reviews.csv', index=False)

In [32]:
# Splitting the dataset into train and validate (via reload of data from processed reviews)
main_df = pd.read_csv('data/processed_reviews.csv')

In [50]:
print(main_df.columns)

Index(['category', 'Content', 'processed_content'], dtype='object')


In [35]:
# Splitting the dataset into train and validate
df, validate_set = train_test_split(main_df, test_size=0.20, random_state=0)

# Conduct oversampling of data to counter imbalance dataset. Only oversample trainingset
df.sort_values(by=['category'], inplace=True)
df.reset_index(drop=True, inplace=True)
counts = df.groupby('category').size()

Oversample cameras
num_to_add_cameras = counts[2] - counts[0]
df_train_cameras = df[df['category']=='cameras']
df_train_cameras = df_train_cameras.append(df_train_cameras.loc[0:num_to_add_cameras-1])

# Oversample laptops
df_train_laptops = df[df['category']=='laptops']
df_train_laptops = df_train_laptops.append(df_train_laptops)
num_to_add_laptops = counts[2] - len(df_train_laptops)
df_train_laptops.reset_index(drop=True, inplace=True)
df_train_laptops = df_train_laptops.append(df_train_laptops.loc[0:(num_to_add_laptops-1)])

# There is no need to oversample mobile phone since it has the largest amount
df_train_mobile_phone = df[df['category']=='mobile phone']

# Append them to make a new training dataset
df = df_train_cameras.append(df_train_laptops.append(df_train_mobile_phone))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [34]:
# Instantiate encoder
Encoder = LabelEncoder()

# Create the dictionary in TFIDF
# There are too many unique words. Set max features to 5000
TFIDF_vect = TfidfVectorizer(max_features=5000)
TFIDF_vect.fit(main_df['processed_content'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# Export the feature extration to avoid training again
### ONLY RUN THIS TO SAVE feature extration ###

save_FE = open("model_classification/TFIDF_Reviews_Category.pickle","wb")
pickle.dump(TFIDF_vect, save_FE)
save_FE.close()

In [18]:
#Load the saved feature extration
FE_saved = open("model_classification/TFIDF_Reviews_Category.pickle", "rb")
TFIDF_vect = pickle.load(FE_saved)
FE_saved.close()

In [36]:
#Conduct Cross Validation kfold=5 and find the average score of each model

# Store the scores 
NB_Bernoulli_scores = []
NB_Multinomial_scores = []
LogR_scores = []
SVM_scores = []
count = 1

# Store the time taken to build and predict
NB_Bernoulli_build = []
NB_Multinomial_build = []
LogR_build = []
SVM_build = []
NB_Bernoulli_predict = []
NB_Multinomial_predict = []
LogR_predict = []
SVM_predict = []

# Instantiate cross validation folds
ss = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for train_index, test_index in ss.split(df):
    x_train = df.iloc[train_index, 2] #the 4 partitions, processed_content column
    y_train = df.iloc[train_index, 0] #the 4 partitions, category column
    
    x_test = df.iloc[test_index, 2] #the 1 partitions to test, processed_content column
    y_test = df.iloc[test_index, 0] #the 1 partition to test, category column
    
    # Convert categories to cameras:0, laptops:1, mobile phone:2
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.fit_transform(y_test)
    
    # Transform reviews into TDIFD
    x_train_TFIDF = TFIDF_vect.transform(x_train)
    x_test_TFIDF = TFIDF_vect.transform(x_test)
    
    # Build the classifiers
    
    start = time.time()
    NB_bernoulli_clf = naive_bayes.BernoulliNB()
    NB_bernoulli_clf.fit(x_train_TFIDF, y_train)
    NB_Bernoulli_build.append(time.time() - start)
    
    start = time.time()
    NB_Multinomial_clf = naive_bayes.MultinomialNB()
    NB_Multinomial_clf.fit(x_train_TFIDF, y_train)
    NB_Multinomial_build.append(time.time() - start)
    
    start = time.time()
    SVM_clf = svm.SVC(kernel='linear')
    SVM_clf.fit(x_train_TFIDF, y_train)
    SVM_build.append(time.time() - start)
    
    start = time.time()
    LogR_clf = LogisticRegression()
    LogR_clf.fit(x_train_TFIDF, y_train)
    LogR_build.append(time.time() - start)
    
    # Predict and generate score
    start = time.time()
    y_pred_1 = NB_bernoulli_clf.predict(x_test_TFIDF)
    NB_bernoulli_score = accuracy_score(y_pred_1, y_test)
    NB_Bernoulli_predict.append(time.time() - start)
    
    start = time.time()
    y_pred_2 = NB_Multinomial_clf.predict(x_test_TFIDF)
    NB_Multinomial_score = accuracy_score(y_pred_2, y_test)
    NB_Multinomial_predict.append(time.time() - start)
    
    start = time.time()
    y_pred_3 = SVM_clf.predict(x_test_TFIDF)
    SVM_score = accuracy_score(y_pred_3, y_test)
    SVM_predict.append(time.time() - start)
    
    start = time.time()
    y_pred_4 = LogR_clf.predict(x_test_TFIDF)
    LogR_score = accuracy_score(y_pred_4, y_test)
    LogR_predict.append(time.time() - start)
    
    # Store the score
    NB_Bernoulli_scores.append(NB_bernoulli_score)
    NB_Multinomial_scores.append(NB_Multinomial_score)
    SVM_scores.append(SVM_score)
    LogR_scores.append(LogR_score)
    
    print("Iteration " + str(count) + " (NB Bernoulli): " + str(NB_bernoulli_score))
    print("Iteration " + str(count) + " (NB Multinomial): " + str(NB_Multinomial_score))
    print("Iteration " + str(count) + " (SVM): " + str(SVM_score))
    print("Iteration " + str(count) + " (LogR): " + str(LogR_score))
    
    count = count + 1



Iteration 1 (NB Bernoulli): 0.8550087873462214
Iteration 1 (NB Multinomial): 0.9404217926186291
Iteration 1 (SVM): 0.9674868189806678
Iteration 1 (LogR): 0.9689806678383128




Iteration 2 (NB Bernoulli): 0.8566783831282953
Iteration 2 (NB Multinomial): 0.9366432337434095
Iteration 2 (SVM): 0.9695957820738137
Iteration 2 (LogR): 0.969859402460457




Iteration 3 (NB Bernoulli): 0.8616871704745167
Iteration 3 (NB Multinomial): 0.9408611599297012
Iteration 3 (SVM): 0.9672231985940246
Iteration 3 (LogR): 0.968804920913884




Iteration 4 (NB Bernoulli): 0.8565026362038665
Iteration 4 (NB Multinomial): 0.9375219683655536
Iteration 4 (SVM): 0.9666080843585237
Iteration 4 (LogR): 0.9669595782073813




Iteration 5 (NB Bernoulli): 0.8560632688927944
Iteration 5 (NB Multinomial): 0.9369068541300527
Iteration 5 (SVM): 0.9661687170474517
Iteration 5 (LogR): 0.9676625659050967


In [37]:
print("NB Bernoulli Accuracy from Cross Validation: " + str(statistics.mean(NB_Bernoulli_scores)))
print("NB Multinomial Accuracy from Cross Validation: " + str(statistics.mean(NB_Multinomial_scores)))
print("LogR Accuracy from Cross Validation: " + str(statistics.mean(LogR_scores)))
print("SVM Accuracy from Cross Validation: " + str(statistics.mean(SVM_scores)) + "\n")

print("NB Bernoulli time taken to build: " + str(sum(NB_Bernoulli_build)) + " secs")
print("NB Multinomial time taken to build: " + str(sum(NB_Multinomial_build)) + " secs")
print("LogR Accuracy time taken to build: " + str(sum(LogR_build)) + " secs")
print("SVM Accuracy time taken to build: " + str(sum(SVM_build)) + " secs\n")

print("NB Bernoulli time taken to predict: " + str(sum(NB_Bernoulli_predict)) + " secs")
print("NB Multinomial time taken to predict: " + str(sum(NB_Multinomial_predict)) + " secs")
print("LogR Accuracy time taken to predict: " + str(sum(LogR_predict)) + " secs")
print("SVM Accuracy time taken to predict: " + str(sum(SVM_predict)) + " secs\n")

NB Bernoulli Accuracy from Cross Validation: 0.8571880492091388
NB Multinomial Accuracy from Cross Validation: 0.9384710017574692
LogR Accuracy from Cross Validation: 0.9684534270650264
SVM Accuracy from Cross Validation: 0.9674165202108963

NB Bernoulli time taken to build: 0.2270498275756836 secs
NB Multinomial time taken to build: 0.1380290985107422 secs
LogR Accuracy time taken to build: 9.730051517486572 secs
SVM Accuracy time taken to build: 446.7202858924866 secs

NB Bernoulli time taken to predict: 0.042009830474853516 secs
NB Multinomial time taken to predict: 0.017003297805786133 secs
LogR Accuracy time taken to predict: 0.015002727508544922 secs
SVM Accuracy time taken to predict: 66.76188111305237 secs



In [40]:
# Splitting the dataset into train and validate (via reload of data from processed reviews)
main_df = pd.read_csv('data/processed_reviews.csv')
df, validate_set = train_test_split(main_df, test_size=0.20, random_state=0)

In [41]:
#BASED ON CROSS VALIDATION, LogisticRegression HAS THE BEST EFFICIENCY FOR RESULT

# Use the model with the best score from cross validation of models
# Then, test on validate_set and find the final score

# Transform reviews into TDIFD
x_train = df['processed_content']
x_train_TFIDF = TFIDF_vect.transform(x_train)
x_validate = validate_set['processed_content']
x_validate_TFIDF = TFIDF_vect.transform(x_validate)

# Convert categories to cameras:0, laptops:1, mobile phone:2
y_train = df['category']
y_train = Encoder.fit_transform(y_train)
y_validate = validate_set['category']
y_validate = Encoder.fit_transform(y_validate)

# Build the classifier
classifier = LogisticRegression()
classifier.fit(x_train_TFIDF, y_train)

# Predict
y_pred = classifier.predict(x_validate_TFIDF)
NB_Multinomial_score = accuracy_score(y_pred, y_validate)
print("Final Accuracy: " + str(NB_Multinomial_score))

Final Accuracy: 0.9701933216168717


In [42]:
# Show how well the classifier performs in predicting each of the category
matrix = confusion_matrix(y_pred, y_validate)

cameras_true = matrix[0][0]
cameras_wrong_as_laptops = matrix[1][0]
cameras_wrong_as_mobilephone = matrix[2][0]
accuracy_for_cameras = cameras_true / (cameras_true + cameras_wrong_as_laptops + cameras_wrong_as_mobilephone)

laptops_true = matrix[1][1]
laptops_wrong_as_cameras = matrix[0][1]
laptops_wrong_as_mobilephone = matrix[2][1]
accuracy_for_laptops = laptops_true / (laptops_true + laptops_wrong_as_cameras + laptops_wrong_as_mobilephone)

mobilephone_true = matrix[2][2]
mobilephone_wrong_as_laptops = matrix[1][2]
mobilephone_wrong_as_cameras = matrix[0][2]
accuracy_for_mobilephone = mobilephone_true / (mobilephone_true + mobilephone_wrong_as_laptops + mobilephone_wrong_as_cameras)

In [44]:
print("Accuracy for predicting camera: " + str(accuracy_for_cameras))
print("Accuracy for predicting laptop: " + str(accuracy_for_laptops))
print("Accuracy for predicting mobile phone: " + str(accuracy_for_mobilephone))

Accuracy for predicting camera: 0.9693500738552437
Accuracy for predicting laptop: 0.9299332697807435
Accuracy for predicting mobile phone: 0.9834599910594546


In [12]:
# The following steps are to generate the predicted labels for each reviews

new_df = pd.read_csv('data/preprocessed_reviewinfo.csv')
not_used_set, validate_set_1 = train_test_split(new_df, test_size=0.20, random_state=0)
validate_set_1.reset_index(drop=True, inplace=True)

# convert predicted reviews into dataframe
df_predicted = pd.DataFrame(y_pred, columns=["predicted_category"])

# map the predicted values back to original categories
category_mapper = {
    0: 'cameras',
    1: 'laptops',
    2: 'mobile phone'
}
df_predicted['predicted_category'] = df_predicted['predicted_category'].map(category_mapper)

# dataframe that contains the predicted and true category for each review
df_results = validate_set_1.join(df_predicted)

# Export out the reviews with predicted categories to avoid processing again
df_results.to_csv(r'data/reviews_with_predicted_categories.csv', index=False)

### Save and Load (Recommend to load instead of running code to get classifier)

In [13]:
# Export the naive bayes classifier to avoid training again
### ONLY RUN THIS TO SAVE CLASSIFIER ###

save_classifier = open("model_classification/CategoryClassifier.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [14]:
#Load the saved classifier
classifier_saved = open("model_classification/CategoryClassifier.pickle", "rb")
classifier = pickle.load(classifier_saved)
classifier_saved.close()

## Feature Extraction via Naive Bayes

In [15]:
# Re-read the file
main_df = pd.read_csv('data/preprocessed_reviewinfo.csv')

# Splitting the dataset into train and validate
df, validate_set = train_test_split(main_df, test_size=0.20, random_state=0)
stop_list = nltk.corpus.stopwords.words('english')

In [16]:
# Label each reviews with respective category for train set

cameras_docs = []
laptops_docs = []
mobile_phone_docs = []

df.sort_values(by=['category'], inplace=True)
for index, row in df.iterrows():
    if row['category'] == 'cameras':
        holder = [word_tokenize(row['Content']), 'cameras']        
        cameras_docs.append(holder)
        
    elif row['category'] == 'laptops':
        holder = [word_tokenize(row['Content']), 'laptops'] 
        laptops_docs.append(holder)
        
    elif row['category'] == 'mobile phone':
        holder = [word_tokenize(row['Content']), 'mobile phone'] 
        mobile_phone_docs.append(holder)

# Combine the documents of each categories into a list
all_docs = cameras_docs + laptops_docs + mobile_phone_docs
random.shuffle(all_docs)

# Take 2000 random reviews and use their words for feature extraction
all_words = []
for i in range(0, 2000):
    all_words = all_words + all_docs[i][0]

# Processsing for stopwords, alphabetic words.
# We do not stem the word because we want readibility in feature extraction
all_words_1 = [w.lower() for w in all_words]
all_words_2 = [w for w in all_words_1 if re.search('^[a-z]+$',w)]
all_words_3 = [w for w in all_words_2 if w not in stop_list]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [17]:
# Build the naive bayes feature extraction model

start = time.time()

all_words_freq = nltk.FreqDist(w.lower() for w in all_words_3)
word_features = list(all_words_freq)[:1000]

def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

featuresets_train = [(document_features(d), c) for (d,c) in all_docs]
classifier = nltk.NaiveBayesClassifier.train(featuresets_train)

end = time.time()
print("time taken: " + str((end - start)) + " secs")

time taken: 83.79626893997192 secs


In [4]:
classifier.show_most_informative_features(10)

Most Informative Features
          contains(lens) = True           camera : laptop =   1115.1 : 1.0
      contains(keyboard) = True           laptop : camera =    670.4 : 1.0
    contains(chromebook) = True           laptop : mobile =    341.9 : 1.0
         contains(canon) = True           camera : mobile =    280.4 : 1.0
     contains(telephoto) = True           camera : mobile =    251.7 : 1.0
           contains(sim) = True           mobile : camera =    245.3 : 1.0
         contains(mouse) = True           laptop : camera =    224.3 : 1.0
      contains(contract) = True           mobile : camera =    198.5 : 1.0
         contains(nikon) = True           camera : mobile =    180.2 : 1.0
       contains(prepaid) = True           mobile : camera =    170.5 : 1.0


In [47]:
# Extract the key features and put into dataframe

list_1 = classifier.most_informative_features(20)
df_important_features = pd.DataFrame(columns=['Feature','Category_1',
                                              'Category_0','Cat1_Cat0','Ratio','Ratio_1'])

for (fname, fval) in list_1:
    cpdist = classifier._feature_probdist
    
    def labelprob(l):
        return cpdist[l, fname].prob(fval)

    labels = sorted(
        [l for l in classifier._labels if fval in cpdist[l, fname].samples()],
        key=labelprob
    )
    
    if len(labels) == 1:
        continue
    l0 = labels[0]
    l1 = labels[-1]
    if cpdist[l0, fname].prob(fval) == 0:
        ratio = 'INF'
    else:
        ratio = round(cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval), 1)
        fname = fname.replace('contains(','')
        fname = fname.replace(')','')        
        df_important_features.loc[len(df_important_features)] = [fname, l1, l0, l1+" : "+l0, 
                                                ratio, str(ratio)+" : 1.0"]
        

In [49]:
# Export into csv if needed
df_important_features.to_csv(r'data/important_features.csv', index=False)

### Save and Load (Recommend to load instead of running code to get feature extractor)

In [19]:
# Export the feature extractor to avoid training again
### ONLY RUN THIS TO SAVE FEATURE EXTRACTOR ###

save_classifier = open("model_classification/FeatureExtraction.pickle","wb") #binary write
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [46]:
#Load the saved classifier 
classifier_saved = open("model_classification/FeatureExtraction.pickle", "rb") #binary read
classifier = pickle.load(classifier_saved)
classifier_saved.close()