In [1]:
# import all necessary modules

import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import ShuffleSplit
from sklearn import svm
from sklearn.svm import LinearSVC

In [2]:
# read all individual text reviews to a list of lists then convert to a dataframe

# location of folder with reviews
filepath = 'C:/Users/jho/Desktop/data_science/op_spam_v1.4'
# empty list
list_reviews = []

# first level
negative_or_positive = 'negative_polarity'

# 1600 to unpack (1 folder)
for polarity in os.listdir(filepath):
    # second level
    deceptive_or_truthful = 'deceptive_from_MTurk'
    # 800 to unpack (2 folders)
    for classification in os.listdir(filepath+'/{}'.format(negative_or_positive)):
        i = 1
        # 400 to unpack (4 folders, or 2 in each subfolder) (third level)
        for fold in os.listdir(filepath+'/{}'.format(negative_or_positive)+'/{}'.format(deceptive_or_truthful)):
            #80 to unpack (there are 80 reviews in each fold, and there are 5 folds with each of the 4 subfolders) (fourth level)
            for file in os.listdir(filepath+'/{}'.format(negative_or_positive)+'/{}'.format(deceptive_or_truthful)+'/fold{}'.format(i)):
                # unpack and read (read each file as a review as you unpack all the folders)
                with open(filepath+'/{}'.format(negative_or_positive)+'/{}'.format(deceptive_or_truthful)+'/fold{}/'.format(i)+file) as f:
                    list_reviews.append([file, f.readline()])
                    
            i += 1
        
        # for the folders that have different names than deceptive_from_MTurk in the deceptive/truthful folders
        if len(list_reviews) < 401:
            deceptive_or_truthful = 'truthful_from_Web'
        elif len(list_reviews) > 402:
            deceptive_or_truthful = 'truthful_from_TripAdvisor'
            
    # change the folder for polarity
    negative_or_positive = 'positive_polarity'
    

# create all_reviews_df for list_reviews
all_reviews_df = pd.DataFrame(list_reviews, columns=['index', 'review'])


# add columns for the classification to the all_reviews_df dataframe. 1 is deceptive/negative and 0 is truthful/positive
all_reviews_df.insert(2, 'negative', np.nan)
all_reviews_df.insert(3, 'deceptive', np.nan)

#print(all_reviews_df)

# counter i to iterate over all reviews in all_reviews_df
for i in range(0, len(all_reviews_df)):
    
    # indices 0-799 are negative polarity
    if i < 800:
        all_reviews_df['negative'].loc[i] = '1'
    else:
        all_reviews_df['negative'].loc[i] = '0'
        
    # indices 400-799 are truthful
    if 400 <= i < 800:
        all_reviews_df['deceptive'].loc[i] = '0'
    # indices 1200-1599 are truthful
    elif i >= 1200:
        all_reviews_df['deceptive'].loc[i] = '0'
    # the rest are deceptive
    else:
        all_reviews_df['deceptive'].loc[i] = '1'
        
#print(all_reviews_df)

# create a dataframe with only the reviews
text_only = all_reviews_df['review']

#print(text_only.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
#### for tfidf vectorizer ####

# creates tfidf vector / matrix for X and the target y, using original tfidf vector

def make_xy(all_reviews_df, tfidf=None):
    #Your code here    
    if tfidf is None:
        tfidf = TfidfVectorizer(min_df=0, ngram_range=(1,2))
    X = tfidf.fit_transform(all_reviews_df['review'])
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (all_reviews_df.deceptive == '1').values.astype(np.int)
    return X, y, tfidf

In [4]:
# TFIDF Vectorizer for MultinomialNB and Linear SVM

# creates:  x_train: Xtfidf, y_train: ytfidf
#           x_test: Xtfidf_test aka the transformed reviews located at the test_index locations, y_test = X_test labels

# shuffle split the reviews and test on 400 reviews
ss = ShuffleSplit(n_splits=1, test_size=0.25)

# splitting the reviews and returning their index from the dataframe
for train_index, test_index in ss.split(all_reviews_df['review']):
    
    #print("%s %s" % (train_index, test_index))
    #print("%s" % (test_index))
    
    # create tfidf vectors from make_xy function using the training indices
    Xtfidf, ytfidf, tfidf = make_xy(all_reviews_df.loc[train_index])
    
    # select classifier. choosing linear svc
    clf = LinearSVC()
    #clf = svm.SVC(kernel='linear')
    #clf = MultinomialNB()
    clf_fit = clf.fit(Xtfidf, ytfidf)
    # transform the tfidf vector and use as x_test for predictions
    Xtfidf_test = tfidf.transform(all_reviews_df['review'][test_index])
    prediction = clf_fit.predict(Xtfidf_test)
    
    #print(prediction)
    
    #print(all_reviews_df['deceptive'][test_index])
    #print(ytfidf[0])

    # get the accuracy scores from training and testing sets
    training_accuracy = clf.score(Xtfidf, ytfidf)
    testing_accuracy = clf.score(Xtfidf_test, all_reviews_df['deceptive'][test_index].astype(np.int))

    print("Accuracy on training data: {:2f}".format(training_accuracy))
    print("Accuracy on test data:     {:2f}".format(testing_accuracy))
    print('')
    
    ## this prints all of the incorrectly classified reviews

    # empty list
    wrong_predictions = []

    # search where the predicted label is not the same as the label in all_reviews_df
    for index in range(len(prediction)):
        if all_reviews_df['deceptive'].astype(np.int)[test_index[index]] != prediction[index]:
            wrong_predictions.append([all_reviews_df['index'].loc[test_index[index]], all_reviews_df['review'].loc[test_index[index]], all_reviews_df['deceptive'].loc[test_index[index]]])

    # convert to dataframe
    wrong_predictions = pd.DataFrame(wrong_predictions)
    print(len(wrong_predictions))
    print('')
    
    ## this prints all of the correctly classified reviews

    # empty list
    right_predictions = []

    # search where the predicted label is not the same as the label in all_reviews_df
    for index in range(len(prediction)):
        if all_reviews_df['deceptive'].astype(np.int)[test_index[index]] == prediction[index]:
            right_predictions.append([all_reviews_df['index'].loc[test_index[index]], all_reviews_df['review'].loc[test_index[index]], all_reviews_df['deceptive'].loc[test_index[index]]])

    # convert to dataframe
    right_predictions = pd.DataFrame(right_predictions)
    print(len(right_predictions))
    print('')   

Accuracy on training data: 1.000000
Accuracy on test data:     0.890000

44

356



In [5]:
# truthful reviews classified as deceptive
false_positives = wrong_predictions[wrong_predictions[2] == '0']
#false_positives

# deceptive reviews classified as deceptive
true_positives = right_predictions[right_predictions[2] == '1']
#true_positives

In [6]:
# get the true positive and false positive features from the same tfidf vector

# use THE SAME tfidf vector (bi-grams) from make_xy to the true and false positives
# the words have the same column indices for tp and fp

# create the compressed sparse matrices for true and false positives
csr_mat_tp = tfidf.transform(true_positives[1])
csr_mat_fp = tfidf.transform(false_positives[1])

# average importance for features
tp_averages = csr_mat_tp.todense().mean(axis=0)
fp_averages = csr_mat_fp.todense().mean(axis=0)

# creates dict of all features from the tfidf vector, convert to list
index_words = tfidf.vocabulary_
index_words_list = list(index_words)

In [7]:
# true positive and false positive features and their importance

tp_feat_avg = []
fp_feat_avg = []

for word, col in index_words.items():
    tp_feat_avg.append((tp_averages[0, col]))#, index_words[word]))
    fp_feat_avg.append((fp_averages[0, col]))#, index_words[word]))
    
tp_feat_avg_word = []
fp_feat_avg_word = []

for i in range(len(tp_feat_avg)):
    tp_feat_avg_word.append(list((index_words_list[i], tp_feat_avg[i])))
    fp_feat_avg_word.append(list((index_words_list[i], fp_feat_avg[i])))
    
# sort because this matches the features by index since they are the same vector
tp_feat_avg_word.sort()
fp_feat_avg_word.sort()


# calculate the differences for each feature's importance

# empty list
feature_differences = []

for i in range(len(tp_feat_avg_word)):
    difference = fp_feat_avg_word[i][1] - tp_feat_avg_word[i][1]
    feature_differences.append((difference, fp_feat_avg_word[i][0]))
    
# sort descending by feature importance
feature_differences.sort(reverse=True)

for i in range(100):
    print(feature_differences[i])

# two resulting series and take the difference of the features. 
# explain what you did and why you did it
# how you could improve it further
# how you maintain your model, there is no good answer because language changes

(0.016934888571398698, 'neighborhood')
(0.015952437608524717, 'we')
(0.015022161677194676, 'nicer hotel')
(0.014649130457456987, 'what can')
(0.014503761681617076, 'bottle')
(0.013987166479346929, 'lobby')
(0.013616447068412182, 'ordinary')
(0.013155305374475108, 'premium')
(0.013124904898563697, 'the lobby')
(0.012757858932937523, 'impolite')
(0.012625957983958914, 'your')
(0.012577414344585241, 'of my')
(0.012573843222988405, 'september')
(0.012445173809542499, 'broken')
(0.012424031752740154, 'you want')
(0.012169451521540883, 'couldn')
(0.012157495904193429, 'of soap')
(0.012157495904193429, 'bar of')
(0.012104981402490431, 'they should')
(0.012061211061307319, 'we couldn')
(0.01203356638288209, 'my wedding')
(0.011443943735576159, 'painted')
(0.011422265014394946, 'was slow')
(0.011119567514274621, 'he')
(0.011017069429409394, 'downtown')
(0.01093332330294457, 'shampoo')
(0.010687034124596465, 'phone and')
(0.010578955646528571, 'want to')
(0.010493358468372441, 'two people')
(0.0