# Movie Review Analysis Outline
## import necessary python libraries


In [1]:
# "transformData.py" is this script which is intended to be run once on our data
# transform the raw data into a rdd readable format first

# import all necessary libraries
import re
import string
from operator import add
import os
import sys

import pyspark.mllib.regression as mllib_reg
import pyspark.mllib.linalg as mllib_lalg
import pyspark.mllib.classification as mllib_class
import pyspark.mllib.tree as mllib_tree


from pyspark.mllib.classification import LogisticRegressionWithLBFGS , LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD, SVMModel

from pyspark.mllib.feature import HashingTF, IDF




print (pyspark) # test to see that pyspark is up and running okay



<module 'pyspark' from '/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/__init__.py'>


# CLEAN DATA

In [2]:
# get rid of all html tags in the data
def strip_html_tags(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data.lower())

#parse and take care of funky symbols in summary and text <- normalize to root words maybe?
def cleanData (data):
    data = strip_html_tags(data)
    data =  re.sub("[\t\,\:;\(\)\"\'\~\-\!\?\`]", "",data, 0, 0)
    #data =  re.sub("[\.0`]", "",data, 0, 0) # special case to get rid of ".0" of scores -- 
    return data

#re.sub("[\.\t\,\:;\(\)\.]", " ", strip_html_tags(data.lower()), 0, 0)

# clean the data for each element of each sub list
def prepData (list_str):
    L= []
    for x in list_str:
        #print x
        L.append(cleanData(x).strip())
    return L
#####
# Notes 
# headers for the project
# [u'productId', u'userId', u'profileName', u'helpfulness', u'score', u'time', u'summary', u'text']



# PARSE THE FILE WE WANT TO USE AS OUR DATA

In [3]:

# transform Data uses /// as a separator for each eleent of data for each movie review
# the original file for the current output.txt is a truncated version of all our data, so the end elements will be funky

# output < smaller_parsed

movies_txt2 = sc.textFile("100k_parsed.txt").map(lambda x: (cleanData(x).split('///')))


## Parsing out relevant fields - score and text of review

In [4]:
# prep our data

#####
# Notes 
# headers for the project
# [u'productId', u'userId', u'profileName', u'helpfulness', u'score', u'time', u'summary', u'text']
# [u'productId', _ , _ , _ , u'score' [0] to get rid of .0, _  u'summary', u'text']

# take the data that we care most about
movies_new = movies_txt2.map(lambda L: (L[0], L[4][0], L[6], L[7]) if len(L) == 8 else L )
movies_new1 = movies_new.filter(lambda L: len(L) == 4) # lets just take all the data that has been parsed correctly

#print (movies_new.top(20))
print (movies_new1.count(), movies_new.count())
print ("Number of datapoints in dataset to train: ", movies_new1.count())
print (movies_new.count() == movies_new1.count())

removeHTMLTags = movies_new1.map(prepData) # remove html tags

(99991, 100000)
('Number of datapoints in dataset to train: ', 99991)
False


## REVISED STOP WORDS REMOVAL 

In [5]:
#REVISED STOP WORDS REMOVAL Approach

#used the stopwords file in the virtual box instead... taking too long
baseDir = os.path.join('../data')
inputPath = os.path.join('cs100', 'lab3')
STOPWORDS_PATH = 'stopwords.txt'
split_regex = r'\W+'

stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())


def tokenize(string):
    #reusing assignment code
    """ An implementation of input string tokenization to exclude stopwords
    Args:
        string (str): input string
    Returns:
        list: a list of tokens without stopwords
    """
    splitList = re.split(split_regex, string.lower().strip())
    
    return [x for x in splitList if x!="" and x not in stopwords]  



removeSW_output= removeHTMLTags.map(lambda x: (  int(x[1]), tokenize(x[3]))  )
print removeSW_output.top(1)

[(5, [u'zulu', u'superlative', u'depiction', u'19th', u'century', u'battle', u'rourkes', u'drift', u'british', u'outpost', u'heavily', u'outnumbered', u'seige', u'zulu', u'warriors', u'fought', u'desperately', u'almost', u'last', u'man', u'somehow', u'prevailed', u'terrifying', u'exciting', u'war', u'film', u'unlike', u'splendid', u'cast', u'word', u'diamond', u'entertainment', u'tapesmy', u'experience', u'slp', u'speed', u'vhs', u'videos', u'tracks', u'remarkably', u'well', u'usually', u'recorded', u'hifi', u'sound', u'well', u'standard', u'linear', u'format', u'zulu', u'also', u'available', u'dvd', u'also', u'recommendedzulu', u'dawn', u'1979', u'tells', u'complete', u'story', u'struggles', u'tribesmen', u'british', u'soldiers', u'commenced', u'1879', u'burt', u'lancaster', u'vhs', u'edition', u'dvd', u'editionparenthetical', u'number', u'preceding', u'title', u'1', u'10', u'viewer', u'poll', u'rating', u'found', u'film', u'resource', u'website', u'8', u'0', u'zulu', u'uk1964', u'sta

## Vectorizing - making the vector of features...

In [6]:
# tokenize our training set and testing set
removeSW_output= removeHTMLTags.map(lambda x: (  int(x[1]), tokenize(x[3]))  )
print removeSW_output.top(1)

[(5, [u'zulu', u'superlative', u'depiction', u'19th', u'century', u'battle', u'rourkes', u'drift', u'british', u'outpost', u'heavily', u'outnumbered', u'seige', u'zulu', u'warriors', u'fought', u'desperately', u'almost', u'last', u'man', u'somehow', u'prevailed', u'terrifying', u'exciting', u'war', u'film', u'unlike', u'splendid', u'cast', u'word', u'diamond', u'entertainment', u'tapesmy', u'experience', u'slp', u'speed', u'vhs', u'videos', u'tracks', u'remarkably', u'well', u'usually', u'recorded', u'hifi', u'sound', u'well', u'standard', u'linear', u'format', u'zulu', u'also', u'available', u'dvd', u'also', u'recommendedzulu', u'dawn', u'1979', u'tells', u'complete', u'story', u'struggles', u'tribesmen', u'british', u'soldiers', u'commenced', u'1879', u'burt', u'lancaster', u'vhs', u'edition', u'dvd', u'editionparenthetical', u'number', u'preceding', u'title', u'1', u'10', u'viewer', u'poll', u'rating', u'found', u'film', u'resource', u'website', u'8', u'0', u'zulu', u'uk1964', u'sta

In [7]:
# We will use the spark ml library for this (took a bit long to figure out)
# prepare hashing for the data 
# HashingTF calculates a number given a  number for features for a list of words in a document
#htf = HashingTF(numFeatures=2) # features need to match dimensions of output

htf = HashingTF(numFeatures=200) 

hashedElements = removeSW_output.map(lambda x : htf.transform(x[1]))

labels = removeSW_output.map(lambda x : 1.0 if float(x[0]) >= 2.5 else 0.0)
#print hashedElements.top(2)
'''
[SparseVector(200, {1: 1.0, 32: 1.0, 45: 1.0, 90: 2.0, 94: 1.0, 124: 1.0, 126: 1.0, 135: 1.0, 138: 1.0, 185: 1.0, 199: 1.0}), SparseVector(200, {1: 2.0, 3: 1.0, 12: 2.0, 17: 1.0, 24: 1.0, 34: 1.0, 38: 1.0, 39: 1.0, 47: 2.0, 64: 1.0, 68: 2.0, 70: 1.0, 72: 1.0, 75: 1.0, 80: 1.0, 81: 1.0, 87: 1.0, 90: 1.0, 99: 1.0, 102: 1.0, 108: 1.0, 110: 1.0, 112: 1.0, 115: 1.0, 119: 1.0, 122: 1.0, 124: 1.0, 130: 1.0, 135: 1.0, 137: 1.0, 143: 1.0, 147: 1.0, 151: 1.0, 155: 1.0, 157: 1.0, 159: 2.0, 162: 1.0, 166: 1.0, 171: 1.0, 179: 1.0, 186: 1.0, 191: 1.0, 196: 1.0, 198: 1.0})]

'''

hashedElements.cache()
idf = IDF().fit(hashedElements)
tfidf = idf.transform(hashedElements)
"""
IDF(minDocFreq=2).fit"""


#merge transformed data with original
merged=  labels.zip(tfidf)
print merged.top(2)
print 
feature_vector  = merged.map(lambda q: LabeledPoint(q[0], q[1])   )
print feature_vector.top(1)


#feature_vector = removeSW3.map(lambda elements: LabeledPoint(1, [2]))
print feature_vector.filter(lambda x: x.label == 1.0).count()

[(1.0, SparseVector(200, {2: 2.3367, 5: 1.432, 7: 1.2508, 8: 1.4712, 9: 4.5551, 14: 1.3862, 15: 4.3128, 16: 3.5716, 17: 2.5615, 20: 4.4453, 21: 1.2526, 22: 1.1598, 25: 2.8899, 28: 4.1005, 35: 1.5192, 36: 1.746, 37: 1.8737, 41: 2.7183, 42: 1.3485, 45: 2.0726, 46: 1.8449, 47: 1.9309, 49: 1.5994, 51: 1.4061, 54: 1.5251, 59: 1.4766, 60: 1.06, 63: 1.3453, 64: 2.4572, 66: 2.5467, 67: 1.8286, 69: 1.309, 72: 5.3366, 75: 1.3871, 76: 1.3512, 77: 4.8183, 79: 1.0238, 80: 4.3744, 83: 4.8297, 87: 3.4432, 88: 1.3486, 92: 1.5477, 96: 1.8405, 97: 1.712, 100: 1.4397, 101: 3.231, 102: 1.55, 103: 5.1782, 104: 1.0877, 107: 1.1457, 109: 1.5145, 110: 2.233, 111: 4.678, 112: 4.1476, 113: 1.3248, 115: 1.4954, 116: 3.1415, 117: 2.3327, 119: 1.4568, 120: 1.3024, 121: 3.3429, 122: 1.8031, 123: 1.3322, 124: 1.1648, 125: 4.3824, 127: 4.7447, 131: 1.1393, 132: 1.7233, 133: 0.8136, 137: 2.6335, 139: 1.93, 143: 3.5102, 145: 2.3686, 149: 3.768, 151: 3.892, 154: 1.5942, 157: 1.5548, 158: 3.0743, 161: 1.4814, 162: 1.3929

##  Training a SVM classifier

In [8]:
# train a SVM classifier using 70% training data and 30% testing data from our file
svm_training_data, svm_testing_data= feature_vector.randomSplit([0.7, 0.3])

#LogisticRegressionWithLBFGS 
# Build the Support Vector Machine model
svm_model = SVMWithSGD.train(svm_training_data, iterations=500) # lower iterations so we arent here forever

# Evaluate the model on training data
svm_labelsAndPreds = svm_training_data.map(lambda p: (p.label, svm_model.predict(p.features)))
svm_trainErr = 100* svm_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(svm_training_data.count())
print("training error of the SVM model: " + "%.3f" %(svm_trainErr))
print("training accuracy of the SVM model: " + "%.3f" %(100-svm_trainErr))
print

#Evaluate on testing data.. the correct way lolz

svm_labelsAndPredsTest = svm_testing_data.map(lambda p: (p.label, svm_model.predict(p.features)))
svm_testErr = 100* svm_labelsAndPredsTest.filter(lambda (v, p): v != p).count() / float(svm_testing_data.count())
print("testing error of the SVM model: " + "%.3f" %(svm_testErr))
print("testing accuracy of the SVM model: " + "%.3f" %(100-svm_testErr))


training error of the SVM model: 14.843
training accuracy of the SVM model: 85.157

testing error of the SVM model: 14.565
testing accuracy of the SVM model: 85.435


## Train a Logistic Regression Classifier

In [9]:
# train a log_regression classifier using 70% training data and 30% testing data from our file

log_training_data, log_testing_data= feature_vector.randomSplit([0.7, 0.3])

#LogisticRegressionWithLBFGS 
# Build the Support Vector Machine model
log_model = LogisticRegressionWithLBFGS.train(log_training_data, iterations=500) # lower iterations so we arent here forever

# Evaluate the model on training data
log_labelsAndPreds = log_training_data.map(lambda p: (p.label, log_model.predict(p.features)))
log_trainErr = 100* log_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(log_training_data.count())
print "Log Training Accuracy: " + "%.3f" %(100-log_trainErr) + "%" 
print("training error of the LogisticRegressionWithLBFGS model= " + "%.3f" % (log_trainErr)) + "%" 

#Evaluate on testing data.. the correct way lolz

log_labelsAndPredsTest = log_testing_data.map(lambda p: (p.label, log_model.predict(p.features)))
log_testErr = 100* log_labelsAndPredsTest.filter(lambda (v, p): v != p).count() / float(log_testing_data.count())
print 
print "Log Testing Accuracy: " + "%.3f" %(100-log_testErr) + "%" 
print("testing error of the LogisticRegressionWithLBFGS model: " + "%.3f" %(log_testErr)) + "%" 


Log Training Accuracy: 84.823%
training error of the LogisticRegressionWithLBFGS model= 15.177%

Log Testing Accuracy: 84.772%
testing error of the LogisticRegressionWithLBFGS model: 15.228%


## Train a Naive Bayes Classifier

In [10]:
### THIS USES NAIVE BAYES CLASSIFIER


# would be nice to figure out which division is the best
#training_data, testing_data= feature_vector.randomSplit([0.6, 0.4])
bayes_training_data, bayes_testing_data= feature_vector.randomSplit([0.7, 0.3])

# parameters:
lamda = 1.0

# build Naive Bayes Classifier
nbay = mllib_class.NaiveBayes.train(bayes_training_data, lamda)

# Make prediction and test accuracy.




bayes_labelsAndPreds = bayes_training_data.map(lambda p : (nbay.predict(p.features), p.label))
#bayes_trainErr = bayes_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(bayes_training_data.count())
bayes_trainErr = 100.0 * bayes_labelsAndPreds.filter(lambda (x, v): x == v).count() / bayes_training_data.count()

print "Naive Bayes Training Accuracy: " + "%.3f" %(bayes_trainErr) + "%" 
print "Bayes Training error: " + "%.3f" % (100-bayes_trainErr) + "%" 

print 
# Testing Data Accuracy

bayes_labelsAndPreds = bayes_testing_data.map(lambda p : (nbay.predict(p.features), p.label))
#bayes_testErr = bayes_labelsAndPreds.filter(lambda (v, p): v != p).count() / float(bayes_testing_data.count())
bayes_testErr = 100.0 * bayes_labelsAndPreds.filter(lambda (x, v): x == v).count() / bayes_testing_data.count()


print "Naive Bayes Testing Accuracy: " + "%.3f" %(bayes_testErr) + "%" 
print "Bayes Testing error: " + "%.3f" % (100-bayes_testErr) + "%" 

Naive Bayes Training Accuracy: 84.638%
Bayes Training error: 15.362%

Naive Bayes Testing Accuracy: 84.251%
Bayes Testing error: 15.749%


# Prepare a new data set for using our models to predict the labels

In [12]:
# PREDICTION - LETS USE OUR MODELS to predict what label we think it is (and compare it to actual values we have)
# notes these data points are distinct from those used to train and test our models


# first apply tokenization
#smaller_predictData
newdata = sc.textFile("smaller_predictData.txt").map(lambda x: (cleanData(x).split('///')))

# parse data for the params we want
strippedData = newdata.map(lambda L: (L[0], float(L[4]), L[6], L[7]) if len(L) == 8 else L )
completeData = strippedData.filter(lambda L: len(L) == 4)
print (strippedData.count())
print ("Number of datapoints in dataset to predict: ",completeData.count())
print (strippedData.count() == completeData.count())


movieData = completeData.map(lambda x: (  x[1], tokenize(x[3]))   )

#.filter(lambda x: float (x) >= 2.5).collect())
#print movieData.filter(lambda x: float(x[1]) >= 2.5).top(10)
#print movieData.filter(lambda x: float(x[1]) >= 2.5).count()


10002
('Number of datapoints in dataset to predict: ', 10000)
False


## Hash new tokenized data into numeric values for our models

In [13]:
# HASH OUR DATA TO PREDICT

htf = HashingTF(numFeatures=200) 
predict_hashedElements = movieData.map(lambda x : htf.transform(x[1]))

predict_labels = movieData.map(lambda x : 1.0 if float(x[0]) >= 2.5 else 0.0)
#print hashedElements.top(2)
'''
[SparseVector(200, {1: 1.0, 32: 1.0, 45: 1.0, 90: 2.0, 94: 1.0, 124: 1.0, 126: 1.0, 135: 1.0, 138: 1.0, 185: 1.0, 199: 1.0}), SparseVector(200, {1: 2.0, 3: 1.0, 12: 2.0, 17: 1.0, 24: 1.0, 34: 1.0, 38: 1.0, 39: 1.0, 47: 2.0, 64: 1.0, 68: 2.0, 70: 1.0, 72: 1.0, 75: 1.0, 80: 1.0, 81: 1.0, 87: 1.0, 90: 1.0, 99: 1.0, 102: 1.0, 108: 1.0, 110: 1.0, 112: 1.0, 115: 1.0, 119: 1.0, 122: 1.0, 124: 1.0, 130: 1.0, 135: 1.0, 137: 1.0, 143: 1.0, 147: 1.0, 151: 1.0, 155: 1.0, 157: 1.0, 159: 2.0, 162: 1.0, 166: 1.0, 171: 1.0, 179: 1.0, 186: 1.0, 191: 1.0, 196: 1.0, 198: 1.0})]

'''

predict_hashedElements.cache()
pidf = IDF().fit(predict_hashedElements)
ptfidf = pidf.transform(predict_hashedElements)


"""
IDF(minDocFreq=2).fit"""


#merge transformed data with original
predict_merged=  predict_labels.zip(ptfidf)
#print predict_merged.top(2)
print 
featurePV  = predict_merged.map(lambda q: LabeledPoint(q[0], q[1])   )
print featurePV.top(1)


#feature_vector = removeSW3.map(lambda elements: LabeledPoint(1, [2]))
#print featurePV.filter(lambda x: x.label == 1.0).count()

# 0.8588372954


[LabeledPoint(1.0, (200,[2,12,20,43,45,48,54,63,69,72,84,91,92,98,101,102,104,110,111,114,117,120,125,141,147,152,154,159,169,179,191,192,196],[0.755122579278,0.628583662235,1.41232773815,0.954352233162,0.995704533594,1.51014516356,1.44190706605,1.30242092234,1.25221317465,1.33989252247,1.13641415085,1.44190706605,1.36424155148,1.35955770217,1.62566728936,1.56217733607,1.07333697734,1.09432193537,1.54562458169,1.62312986095,1.1136228964,1.2773605951,1.36855443799,1.14109800016,0.768401849161,2.95176304524,2.94390448896,1.50878800433,1.5669574599,1.50202969972,1.23521952832,3.18771971116,1.13423583974]))]


## PREDICTING

### SVM Classifier

In [14]:
# SVM PREDICTIONS

#print (svm_model) 
"""
(weights=[0.0368997186104,0.0592059564322,0.0511764523985,0.0944362393672,-0.1234655326,-0.0213073956774,0.0598834383627,0.0301296198158,0.0071253205336,-0.0773313907846,0.135198560493,0.0150622838296,0.252044696973,-0.0485074421553,0.0472279727127,-0.0602539313766,0.0113035109407,0.0384187101225,-0.188469809917,0.00267308133624,-0.0642051128667,0.0501615460127,0.185930140409,-0.0366052969578,0.056863287584,0.0493490204659,0.0300440006011,-0.0113687899148,-0.0502989129382,-0.00495112968516,0.0705056106545,0.0723744391356,-0.00780960280399,0.0648763892711,-0.0142119736707,-0.0101463323161,-0.0950565271662,-0.025407627537,0.0165633108803,0.0726917340991,-0.0543413768729,-0.11493914306,0.048542911942,-0.00490814080509,0.00183017415496,-0.0154270882776,-0.0205600111133,0.190822611363,-0.0085785906532,0.00292457229289,0.0908497832715,-0.0306247814764,-0.0272281652519,-0.00461787021553,-0.0155235065268,0.0296160453275,-0.139970211245,0.0709340007922,0.0141226375075,0.0767565407429,0.0923194359137,-0.0793072158897,-0.162151121072,-0.00582485572825,-0.027916288075,-0.0210472483295,0.00345838293048,0.013724710527,0.101103869025,-0.0297336253554,0.0279691564116,0.0689764908622,-0.13494363691,-0.0668335425458,-0.060322739572,-0.0956377298681,0.0769876814833,0.0117181091065,0.0126856313014,0.109426784166,0.0861160290819,-0.0244508540554,0.0161106487066,0.0400982188816,-0.00548897974707,-0.019190100889,0.00461539237948,0.149560544582,0.0121178683009,-0.0639882247701,0.529805130452,-0.0128682615876,0.00480636850342,-0.00410932632816,0.0351692687322,-0.0198583418872,-0.0229548568145,-0.00340860804861,0.0333486117234,-0.0330106365069,0.0412798754816,0.0537416796427,0.0374752509553,0.0180206899498,-0.00407131571665,-0.0479947909314,-0.0435645227459,-0.00546051146944,-0.0673355134481,-0.0122398316702,-0.0386454914434,0.0319355800185,-0.0641061556095,0.285542644392,-0.0755651792838,-0.0206629484963,0.023072234171,0.0700045307581,-0.0248450979802,0.0345836657542,0.043560324962,0.189977075545,-0.0267604623301,-0.134677417127,0.0549936914647,0.0470644285613,0.0516906946357,0.0294428387543,0.0647815568887,0.0564883632182,-0.105579244454,0.00852526947916,0.00525727645833,-0.0596530787731,-0.0415343964404,0.11638812774,0.0968259524311,0.208324958134,-0.00117460116655,-0.0505781712885,-0.0323338532613,-0.132213597077,-0.0048034943035,0.125680763706,0.00720316225617,0.0482061313933,-0.0819678955725,0.225878458733,-0.0487156611568,0.0450780570356,0.113355924008,-0.0311896686759,-0.0509124818306,0.036520188844,-0.0269550841676,-0.0107445622353,0.0824359871593,-0.0148080705468,0.0813362368022,0.00962483814534,0.0986313594561,0.0438001312644,0.0281565279201,0.023460809598,-0.0669994037871,-0.0285583255943,-0.0715752292781,-0.0355364452118,0.0191930116858,0.0161009483418,-0.0956563973472,0.141158315143,0.0294657105506,-0.0151626852378,0.0865943223035,-0.0415571028551,-0.0342841059177,-0.0237316978741,-0.0302115070006,-0.00182684799626,0.00168597665351,-0.0041243305443,0.0217597781446,-0.0846252980175,-0.013768106553,0.12940121048,0.0437548868609,0.0418497207055,0.123918612503,-0.0535942910674,0.0279919178596,0.0752968777908,-0.0366714491301,-0.00508041992326,0.0590876803126,-0.116625416506,-0.0194474547958,-0.0545133996421,0.023332139766,-0.014695114347], intercept=0.0)
"""
#print (log_model) 
#print (nbay)




svm_predictions = featurePV.map(lambda p: (p.label, svm_model.predict(p.features)))
svm_PErr = 100* svm_predictions.filter(lambda (v, p): v != p).count() / float(featurePV.count())
print "SVM Prediction Error: " + "%.3f" %(svm_PErr) + "%" 
print "SVM  Prediction Accuracy: " + "%.3f" % (100-svm_PErr) + "%" 



SVM Prediction Error: 12.950%
SVM  Prediction Accuracy: 87.050%


In [15]:
### Logistic Regression Classifier

In [16]:
# Log

log_predictions = featurePV.map(lambda p: (p.label, log_model.predict(p.features)))
log_PErr = 100* log_predictions.filter(lambda (v, p): v != p).count() / float(featurePV.count())
print "Log Regression Prediction Error: " + "%.3f" %(log_PErr) + "%" 
print "Log Regression  Prediction Accuracy: " + "%.3f" % (100-log_PErr) + "%"


Log Regression Prediction Error: 13.600%
Log Regression  Prediction Accuracy: 86.400%


In [17]:
### Naive Bayes Classifier

In [18]:
# Bayes
bayes_predictions = featurePV.map(lambda p : (nbay.predict(p.features), p.label))
bayes_PErr = 100.0 * bayes_predictions.filter(lambda (x, v): x != v).count() / featurePV.count()
print "Naive Bayes Prediction Error: " + "%.3f" %(bayes_PErr) + "%" 
print "Bayes Prediction Accuracy: " + "%.3f" % (100-bayes_PErr) + "%" 


Naive Bayes Prediction Error: 13.630%
Bayes Prediction Accuracy: 86.370%


## Summary of some results

In [19]:
"""
num Features = 200, models trained with 49994 *.87 datapoints

Training Accuracies: 

SVM: 
Log: 
Bayes:

SVM Prediction Accuracy: 12.960%
SVM  Prediction error: 87.040%


Log Regression Prediction Accuracy: 13.380%
Log Regression  Prediction error: 86.620%


Naive Bayes Prediction Accuracy: 86.490%
Bayes Prediction error: 13.510%



6k - num Features:200

Training Accuracies: 

SVM: SVM model: 84.997 (with testing sets)
Log: Log Testing Accuracy: 84.292%
Bayes: Naive Bayes Testing Accuracy: 85.145%


SVM  Prediction Accuracy: 84.290%
Log Regression  Prediction Accuracy: 84.280%
Bayes Prediction Accuracy: 85.880%



50k - num Features 200
SVM: testing accuracy of the SVM model: 85.708
Log:Log Testing Accuracy: 84.816%
Bayes: Naive Bayes Testing Accuracy: 84.779%

SVM  Prediction Accuracy: 87.100%
Log Regression  Prediction Accuracy: 86.490%
Bayes Prediction Accuracy: 86.490%




100k - num Features 200
SVM: testing accuracy of the SVM model: 85.435
Log: Log Testing Accuracy: 84.772%
Bayes: Naive Bayes Testing Accuracy: 84.251%


SVM  Prediction Accuracy: 87.050%
Log Regression  Prediction Accuracy: 86.400%
Bayes Prediction Accuracy: 86.370%
"""

'\nnum Features = 200, models trained with 49994 *.87 datapoints\n\nTraining Accuracies: \n\nSVM: \nLog: \nBayes:\n\nSVM Prediction Accuracy: 12.960%\nSVM  Prediction error: 87.040%\n\n\nLog Regression Prediction Accuracy: 13.380%\nLog Regression  Prediction error: 86.620%\n\n\nNaive Bayes Prediction Accuracy: 86.490%\nBayes Prediction error: 13.510%\n\n\n\n6k - num Features:200\n\nTraining Accuracies: \n\nSVM: SVM model: 84.997 (with testing sets)\nLog: Log Testing Accuracy: 84.292%\nBayes: Naive Bayes Testing Accuracy: 85.145%\n\n\nSVM  Prediction Accuracy: 84.290%\nLog Regression  Prediction Accuracy: 84.280%\nBayes Prediction Accuracy: 85.880%\n\n\n\n50k - num Features 200\nSVM: testing accuracy of the SVM model: 85.708\nLog:Log Testing Accuracy: 84.816%\nBayes: Naive Bayes Testing Accuracy: 84.779%\n\nSVM  Prediction Accuracy: 87.100%\nLog Regression  Prediction Accuracy: 86.490%\nBayes Prediction Accuracy: 86.490%\n'