In [1]:
import sqlite3
from pyspark.context import SparkContext
from os import path
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [2]:
sc = SparkContext.getOrCreate()

In [3]:
alltweets = []

## Connecting to the database

In [4]:
def create_connection(database):
    try:
        conn = sqlite3.connect(database)
        return conn
    except Exception as e:
        print(e)

    return None

In [5]:
def getTweets(username): 
    sql = '''select tweet_text from tweets where user_screen_name=?'''
    database = "../twitterApp/twitter.sqlite"
    conn = create_connection(database)
    
    cur = conn.cursor()
    cur.execute(sql,(username,))
    
    tweets = cur.fetchall()
    
    usertweets = []
    
    for tweet in tweets: 
        
        usertweets.append((tweet[0]))
    
    #alltweets.extend(' '.join(usertweets))
    
    return usertweets

In [6]:
def getTrueNegatives(database):
    conn = create_connection(database)
    sql = '''select user_screen_name from search_results where isDepressed=0 ORDER BY ROWID DESC LIMIT 322'''
    cur = conn.cursor()
    cur.execute(sql)
    users = cur.fetchall()
    
    return users
    #for user in users: 
    #   user_tweets = getTweets(user)

In [7]:
def getTruePositives(database):    
    conn = create_connection(database)
    sql = '''select user_screen_name from search_results where isDepressed="True"'''
    cur = conn.cursor()
    cur.execute(sql)
    users = cur.fetchall()
    
    return users
    #for user in users: 
    #    user_tweets = getTweets(str(user))

In [8]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

def gettfidf(usertweets): 
    temp_df = spark.createDataFrame(usertweets,['tweet'])
    tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
    wordsData = tokenizer.transform(temp_df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    return rescaledData.select("features")

In [9]:
test = getTruePositives("../twitterApp/twitter.sqlite")

In [10]:
alltweets = []

In [11]:
def remove_nonascii(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

In [12]:
for user in test: 
    t = getTweets(user[0])    
    alltweets.append(remove_nonascii(' '.join(t)))

## Generating the RDDs for users

In [13]:
trueNegatives = getTrueNegatives("../twitterApp/twitter.sqlite")
truePositives = getTruePositives("../twitterApp/twitter.sqlite")

In [14]:
alltweets = []

In [15]:
for user in trueNegatives: 
    t = getTweets(user[0]) 
    alltweets.append([user[0], 0, remove_nonascii(' '.join(t))])

for user in truePositives:
    t = getTweets(user[0])
    alltweets.append([user[0], 1, remove_nonascii(' '.join(t))]) 

In [16]:
tweetRDD = sc.parallelize(alltweets).cache()

In [17]:
tweetDF = tweetRDD.toDF(["username","label","tweet"])

In [18]:
tweetDF.printSchema()

root
 |-- username: string (nullable = true)
 |-- label: long (nullable = true)
 |-- tweet: string (nullable = true)



## Computing TF-IDF

In [19]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(tweetDF)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
tfidftweets = idfModel.transform(featurizedData)

In [20]:
tfidftweets.show()

+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|       username|label|               tweet|               words|         rawFeatures|            features|
+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|   PopoffSierra|    0|RT @jacksfilms: "...|[rt, @jacksfilms:...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|      dhenwikan|    0|Carnival and He -...|[carnival, and, h...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
| JosephineAlice|    0|Guess I'll just r...|[guess, i'll, jus...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|   JoseJaimes95|    0|Being able to mut...|[being, able, to,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|      lyriczbot|    0|I miss that happy...|[i, miss, that, h...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|  grillmeeting1|    0|@1HkipUrdKrXXzdk ...|[@1hkipurdkrxxzdk...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|bustercalamity1|    0|@marc

## Normalizing the vectors

In [21]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(tfidftweets)
print("Normalized using L^1 norm")
l1NormData.select("normFeatures").show(truncate=False)

Normalized using L^1 norm
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|normFeatures                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----------------------------

## Helper function

In [22]:
from __future__ import division
def measures(tp,tn,fp,fn):
    try: 
        precision = tp/(tp+fp)
    except ZeroDivisionError as e: 
        precision = -1
    
    try:
        recall = tp/(tp+fn)
    except ZeroDivisionError as e: 
        recall = -1   
    
    try:
        specificity = tn/(tn+fp)
    except ZeroDivisionError as e: 
        specificity = -1
    
    try:
        accuracy = (tp+tn)/(tp+tn+fp+fn)
    except ZeroDivisionError as e: 
        accuracy = -1
    
    try:
        f1_score = (2*tp)/((2*tp)+fp+fn)
    except ZeroDivisionError as e: 
        f1_score = -1
    return precision, recall, specificity, accuracy, f1_score

def howgoodisit(result):
    true_positives = result.filter(lambda line: line[1]==0.0 and line[0]==0.0).count()
    true_negatives = result.filter(lambda line: line[1]==1.0 and line[0]==1.0).count()
    false_positives = result.filter(lambda line: line[1]==0.0 and line[0]==1.0).count()
    false_negatives = result.filter(lambda line: line[1]==1.0 and line[0]==0.0).count()
    return measures(true_positives, true_negatives, false_positives, false_negatives)

## Evaluating NormFeatures and TF-IDF features

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [24]:
splits = l1NormData.randomSplit([0.6, 0.4])
training_df = splits[0]
test_df = splits[1]

## Logistic Regression

In [25]:
lr = (LogisticRegression()).setMaxIter(1000).setRegParam(0.01)

lr_pipeline = Pipeline(stages=[lr])

lr_paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 0.5,1.0, 2.0]).build()

lr_cv = CrossValidator(estimator=lr_pipeline, 
                    estimatorParamMaps=lr_paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF features

In [26]:
trainData = training_df.select("label", "features")
testData = test_df.select("label", "features")

In [27]:
lr_cvModel1 = lr_cv.fit(trainData)
lr_result1 = lr_cvModel1.transform(testData)

In [28]:
lr_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator.evaluate(lr_result1, {lr_evaluator.metricName: "accuracy"})

0.6395348837209303

In [29]:
result = lr_result1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.65811965812
recall:  0.592307692308
specificity:  0.6875
accuracy:  0.639534883721
f1_score:  0.623481781377


## Evaluating L1 Norm features

In [30]:
L1trainData = training_df.selectExpr("label", "normFeatures as features")
L1testData = test_df.selectExpr("label", "normFeatures as features")

In [31]:
lr_cvModel2 = lr_cv.fit(L1trainData)
lr_result2 = lr_cvModel2.transform(L1testData)

In [32]:
lr_evaluator_norm = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_norm.evaluate(lr_result2, {lr_evaluator_norm.metricName: "accuracy"})

0.6124031007751938

In [33]:
result = lr_result2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.644230769231
recall:  0.515384615385
specificity:  0.7109375
accuracy:  0.612403100775
f1_score:  0.57264957265


## Naive Bayes

In [34]:
nb = NaiveBayes()
pipeline = Pipeline(stages=[nb])
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()


cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF Features

In [35]:
nb_model1 = cv.fit(trainData)
nb_result = nb_model1.transform(testData)

In [36]:
nb_evaluator_1 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_1.evaluate(nb_result, {nb_evaluator_1.metricName: "accuracy"})

0.5038759689922481

In [37]:
result = nb_result.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.50495049505
recall:  0.784615384615
specificity:  0.21875
accuracy:  0.503875968992
f1_score:  0.614457831325


## Evaluating L1-Norm Features

In [38]:
nb_model2 = cv.fit(L1trainData)
nb_result_2 = nb_model2.transform(L1testData)

In [39]:
nb_evaluator_2 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_2.evaluate(nb_result_2, {nb_evaluator_2.metricName: "accuracy"})

0.562015503875969

In [40]:
result = nb_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  1.0
recall:  0.130769230769
specificity:  1.0
accuracy:  0.562015503876
f1_score:  0.231292517007


## Decision Tree 

In [41]:
dt = DecisionTreeClassifier()


dt_pipeline = Pipeline(stages=[dt])
dt_paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1,2,6,10])
             .addGrid(dt.maxBins, [20,40,80])
             .build())

dt_cv = CrossValidator(estimator=dt_pipeline, 
                    estimatorParamMaps=dt_paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF Features

In [42]:
dt_model_1 = dt_cv.fit(trainData)
dt_result_1 = dt_model_1.transform(testData)

In [43]:
dt_evaluator_1 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_1.evaluate(dt_result_1, {dt_evaluator_1.metricName: "accuracy"})

0.6162790697674418

In [44]:
result = dt_result_1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.701298701299
recall:  0.415384615385
specificity:  0.8203125
accuracy:  0.616279069767
f1_score:  0.521739130435


## Evaluating L1-Norm features

In [45]:
dt_model_2 = dt_cv.fit(L1trainData)
dt_result_2 = dt_model_2.transform(L1testData)

In [46]:
dt_evaluator_2 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_2.evaluate(dt_result_2, {dt_evaluator_2.metricName: "accuracy"})

0.6124031007751938

In [47]:
result = dt_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.678571428571
recall:  0.438461538462
specificity:  0.7890625
accuracy:  0.612403100775
f1_score:  0.532710280374


## MLP 

In [48]:
layers = [20, 5, 4, 2]

mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

mlp_pipeline = Pipeline(stages=[mlp])

## Evaluating TF-IDF Features

In [49]:
mlp_model_1 = mlp_pipeline.fit(trainData)
mlp_result_1 = mlp_model_1.transform(testData)

In [50]:
mlp_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator.evaluate(mlp_result_1, {mlp_evaluator.metricName: "accuracy"})

0.6434108527131783

In [51]:
result = mlp_result_1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.88
recall:  0.338461538462
specificity:  0.953125
accuracy:  0.643410852713
f1_score:  0.488888888889


## Evaluating L1-Norm Features

In [52]:
mlp_model_2 = mlp_pipeline.fit(L1trainData)
mlp_result_2 = mlp_model_2.transform(L1testData)

In [53]:
mlp_evaluator_2 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_2.evaluate(mlp_result_2, {mlp_evaluator_2.metricName: "accuracy"})

0.627906976744186

In [54]:
result = mlp_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.72972972973
recall:  0.415384615385
specificity:  0.84375
accuracy:  0.627906976744
f1_score:  0.529411764706


In [55]:
from sgd import learn_coefficients
import numpy as np

## Converting the different classifier outputs to a numpy array

In [56]:
lr_result1.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|[-0.8437330355307...|[0.30074914654175...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|[0.93868944850744...|[0.71883485612439...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.32067754762303...|[0.57948936681598...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.13720130338945...|[0.53424662059814...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-0.9003932513255...|[0.28896969081117...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|[0.22174876564397...|[0.55521113739957...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1.2276253652736...|[0.22659731256865...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|[0.12983556453539...|[0.53241337049971...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[1.74955748884830...|[0.85189697968659...|       0.0|
|    0|(20,[0,1,

In [57]:
nb_result.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|[-1113.6828096638...|[0.89742907694073...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1238.7627047954...|[0.99882790974169...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1000.4753511161...|[0.97429750768652...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1386.7957326956...|[0.99971642792676...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1635.2350209786...|[0.99235526976524...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-672.62129385577...|[0.96186675165198...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1593.0010655531...|[0.87088902658618...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-993.18881245899...|[0.99447778678775...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1169.9255560422...|[0.95191204764108...|       0.0|
|    0|(20,[0,1,

In [58]:
dt_result_1.show()

+-----+--------------------+-------------+--------------------+----------+
|label|            features|rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|    [2.0,3.0]|           [0.4,0.6]|       1.0|
|    0|(20,[0,1,2,3,4,5,...|   [11.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|   [21.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|   [0.0,20.0]|           [0.0,1.0]|       1.0|
|    0|(20,[0,1,2,3,4,5,...|    [5.0,6.0]|[0.45454545454545...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|  [70.0,89.0]|[0.44025157232704...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|   [12.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|  [11.0,12.0]|[0.47826086956521...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|   [21.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|   [21.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,.

In [59]:
mlp_result_1.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,4,5,6,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



## Linear Combination of Classifiers

In [60]:
label_column = lr_result1.select("label").collect()
lr_column = lr_result1.select("prediction").collect()
mlp_column = mlp_result_1.select("prediction").collect()
nb_column = nb_result.select("prediction").collect()
dt_column = dt_result_1.select("prediction").collect()

In [61]:
temp_label = [i[0] for i in label_column]
temp_lr = [i[0] for i in lr_column]
temp_mlp = [i[0] for i in mlp_column]
temp_nb = [i[0] for i in nb_column]
temp_dt = [i[0] for i in dt_column]

In [62]:
X = np.array([temp_lr,temp_mlp, temp_nb, temp_dt]).transpose()
y = np.array([temp_label]).transpose()

In [63]:
learned_weights = learn_coefficients(X, y, 0.001, 1000)

In [64]:
print "Learned Weights for(without using normalized features):"
print "Logistic Regression", learned_weights[0]/sum(learned_weights)
print "Multilayer Perceptron", learned_weights[1]/sum(learned_weights)
print "Naive Bayes", learned_weights[2]/sum(learned_weights)
print "Decision Tree", learned_weights[3]/sum(learned_weights)

Learned Weights for(without using normalized features):
Logistic Regression -0.34352507371
Multilayer Perceptron 0.531763045663
Naive Bayes -0.00615466342636
Decision Tree 0.817916691474


## Combining the classifier outputs with the help of another classifier

In [65]:
#Using sklearn SVM here as Spark doesn't support SVM with RBF
from sklearn import svm

In [66]:
svm_rbf = svm.SVC()
svm_rbf.fit(X,np.ravel(y))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [67]:
svm_rbf_predictions = svm_rbf.predict(X)

In [68]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y, svm_rbf_predictions).ravel()

precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.64238410596
recall:  0.7578125
specificity:  0.584615384615
accuracy:  0.670542635659
f1_score:  0.695340501792


## combination of classifiers with logistic regression classifier

In [69]:
from sklearn import linear_model

In [70]:
meta_lr = linear_model.LogisticRegression(solver='lbfgs')
meta_lr.fit(X, np.ravel(y))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [71]:
meta_lr_predictions = meta_lr.predict(X)

In [72]:
tn, fp, fn, tp = confusion_matrix(y, meta_lr_predictions).ravel()

precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.596590909091
recall:  0.8203125
specificity:  0.453846153846
accuracy:  0.635658914729
f1_score:  0.690789473684


In [73]:
meta_learned_weights = meta_lr.coef_

print "Learned Weights for classifiers using logistic regression as the meta-classifier:"
print "Logistic Regression", meta_learned_weights[0][0]/sum(meta_learned_weights[0])
print "Multilayer Perceptron", meta_learned_weights[0][1]/sum(meta_learned_weights[0])
print "Naive Bayes", meta_learned_weights[0][2]/sum(meta_learned_weights[0])
print "Decision Tree", meta_learned_weights[0][3]/sum(meta_learned_weights[0])

Learned Weights for classifiers using logistic regression as the meta-classifier:
Logistic Regression 0.260974306538
Multilayer Perceptron 0.595458210065
Naive Bayes -0.0721486597383
Decision Tree 0.215716143136


## Using User Specific Features with the classifiers

In [74]:
def fetch_user_data(username): 
    conn = create_connection("../twitterApp/twitter.sqlite")
    sql = '''select num_status, num_friends, num_followers, isDepressed from users where user_screen_name=?'''
    cur = conn.cursor()
    cur.execute(sql,(username,))
    
    a = cur.fetchall()
    
    return [a[0][0], a[0][1], a[0][2], a[0][3]]

In [75]:
all_users = (trueNegatives + truePositives)

In [76]:
userRDD = sc.parallelize(all_users).map(lambda line: line[0])

In [77]:
userRDD.take(5)

[u'PopoffSierra',
 u'dhenwikan',
 u'JosephineAlice',
 u'JoseJaimes95',
 u'lyriczbot']

In [78]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler

In [79]:
userDataRDD = userRDD.map(lambda line : [str(line), fetch_user_data(str(line))])\
                     .map(lambda line: [line[0], line[1][:3], line[1][-1]])\
                     .map(lambda line: [line[0], Vectors.dense(line[1]), line[2]])
                    

In [80]:
userDataRDD.take(10)

[['PopoffSierra', DenseVector([602.0, 100.0, 22.0]), 0],
 ['dhenwikan', DenseVector([935.0, 241.0, 234.0]), 0],
 ['JosephineAlice', DenseVector([50516.0, 828.0, 2066.0]), 0],
 ['JoseJaimes95', DenseVector([11049.0, 279.0, 360.0]), 0],
 ['lyriczbot', DenseVector([89519.0, 5.0, 20.0]), 0],
 ['grillmeeting1', DenseVector([99.0, 90.0, 4.0]), 0],
 ['bustercalamity1', DenseVector([99.0, 104.0, 5.0]), 0],
 ['_Coopavelli', DenseVector([223653.0, 2309.0, 3683.0]), 0],
 ['LeahEbooks', DenseVector([10578.0, 1.0, 28.0]), 0],
 ['KeyOfConceit', DenseVector([90.0, 206.0, 1571.0]), 0]]

In [81]:
userDataDF = spark.createDataFrame(userDataRDD, ["user","features", "label"])
userDataDF.printSchema()

root
 |-- user: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: long (nullable = true)



In [82]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scalerModel = scaler.fit(userDataDF)
scaledData = scalerModel.transform(userDataDF)
scaledData.show()

+---------------+--------------------+-----+--------------------+
|           user|            features|label|      scaledFeatures|
+---------------+--------------------+-----+--------------------+
|   PopoffSierra|  [602.0,100.0,22.0]|    0|[-0.5511433475228...|
|      dhenwikan| [935.0,241.0,234.0]|    0|[-0.5434584724786...|
| JosephineAlice|[50516.0,828.0,20...|    0|[0.60075711180434...|
|   JoseJaimes95|[11049.0,279.0,36...|    0|[-0.3100505860012...|
|      lyriczbot|  [89519.0,5.0,20.0]|    0|[1.50085675549429...|
|  grillmeeting1|     [99.0,90.0,4.0]|    0|[-0.5627514320490...|
|bustercalamity1|    [99.0,104.0,5.0]|    0|[-0.5627514320490...|
|    _Coopavelli|[223653.0,2309.0,...|    0|[4.59636134762072...|
|     LeahEbooks|  [10578.0,1.0,28.0]|    0|[-0.3209201840367...|
|   KeyOfConceit| [90.0,206.0,1571.0]|    0|[-0.5629591313746...|
| goofy_goober33|      [1.0,25.0,2.0]|    0|[-0.5650130469269...|
|      kalenstar|[28756.0,500.0,79...|    0|[0.09858629810557...|
|        V

## Logistic Regression

In [83]:
userSplits = scaledData.randomSplit([0.6, 0.4])
userTraindf = userSplits[0]
userTestdf = userSplits[1]

In [84]:
userTrainData = userTraindf.select("label", "features")
userTestData = userTestdf.select("label", "features")
userTrainDataScaled = userTraindf.selectExpr("label", "scaledFeatures as features")
userTestDataScaled = userTestdf.selectExpr("label", "scaledFeatures as features")

## Evaluating unscaled features

In [85]:
lr_cvModel_3 = lr_cv.fit(userTrainData)
lr_result_3 = lr_cvModel_3.transform(userTestData)

In [86]:
lr_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_3.evaluate(lr_result_3, {lr_evaluator_3.metricName: "accuracy"})

0.5

In [87]:
result = lr_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.490476190476
recall:  0.792307692308
specificity:  0.224637681159
accuracy:  0.5
f1_score:  0.605882352941


## Evaluating Scaled features

In [88]:
lr_cvModel_4 = lr_cv.fit(userTrainDataScaled)
lr_result_4 = lr_cvModel_4.transform(userTestDataScaled)

In [89]:
lr_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_4.evaluate(lr_result_4, {lr_evaluator_4.metricName: "accuracy"})

0.5

In [90]:
result = lr_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.490476190476
recall:  0.792307692308
specificity:  0.224637681159
accuracy:  0.5
f1_score:  0.605882352941


## Naive Bayes

## Evaluating unscaled features

In [91]:
nb_model_3 = cv.fit(userTrainData)
nb_result_3 = nb_model_3.transform(userTestData)

In [92]:
nb_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_3.evaluate(nb_result_3, {nb_evaluator_3.metricName: "accuracy"})

0.5111940298507462

In [93]:
result = nb_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.497942386831
recall:  0.930769230769
specificity:  0.115942028986
accuracy:  0.511194029851
f1_score:  0.648793565684


## Evaluating Scaled features

In [94]:
nb_model_4 = cv.fit(userTrainDataScaled)
nb_result_4 = nb_model_4.transform(userTestDataScaled)

In [95]:
nb_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_4.evaluate(nb_result_4, {nb_evaluator_4.metricName: "accuracy"})

0.5111940298507462

In [96]:
result = nb_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.497942386831
recall:  0.930769230769
specificity:  0.115942028986
accuracy:  0.511194029851
f1_score:  0.648793565684


## MLP

In [97]:
layers_2 = [3, 5, 4, 2]

mlp_2 = MultilayerPerceptronClassifier(maxIter=100, layers=layers_2, blockSize=128, seed=1234)

mlp_pipeline_2 = Pipeline(stages=[mlp_2])

## Evaluating unscaled features

In [98]:
mlp_model_3 = mlp_pipeline_2.fit(userTrainData)
mlp_result_3 = mlp_model_3.transform(userTestData)

In [99]:
mlp_evaluator_3 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_3.evaluate(mlp_result_3, {mlp_evaluator_3.metricName: "accuracy"})

0.6119402985074627

In [100]:
result = mlp_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.691176470588
recall:  0.361538461538
specificity:  0.847826086957
accuracy:  0.611940298507
f1_score:  0.474747474747


## Evaluating Scaled features

In [101]:
mlp_model_4 = mlp_pipeline_2.fit(userTrainDataScaled)
mlp_result_4 = mlp_model_4.transform(userTestDataScaled)

In [102]:
mlp_evaluator_4 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_4.evaluate(mlp_result_4, {mlp_evaluator_4.metricName: "accuracy"})

0.585820895522388

In [103]:
result = mlp_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.569343065693
recall:  0.6
specificity:  0.572463768116
accuracy:  0.585820895522
f1_score:  0.584269662921


## Decision Tree

## Evaluating unscaled features

In [104]:
dt_model_3 = dt_cv.fit(userTrainData)
dt_result_3 = dt_model_3.transform(userTestData)

In [105]:
dt_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_3.evaluate(dt_result_3, {dt_evaluator_3.metricName: "accuracy"})

0.6492537313432836

In [106]:
result = dt_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.645161290323
recall:  0.615384615385
specificity:  0.68115942029
accuracy:  0.649253731343
f1_score:  0.629921259843


## Evaluating scaled features

In [107]:
dt_model_4 = dt_cv.fit(userTrainDataScaled)
dt_result_4 = dt_model_4.transform(userTestDataScaled)

In [108]:
dt_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_4.evaluate(dt_result_4, {dt_evaluator_4.metricName: "accuracy"})

0.6492537313432836

In [109]:
result = dt_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.645161290323
recall:  0.615384615385
specificity:  0.68115942029
accuracy:  0.649253731343
f1_score:  0.629921259843


## Linear Combination of classifiers through SGD

In [110]:
label_column = lr_result_3.select("label").collect()
lr_column = lr_result_3.select("prediction").collect()
mlp_column = mlp_result_3.select("prediction").collect()
nb_column = nb_result_3.select("prediction").collect()
dt_column = dt_result_3.select("prediction").collect()

In [111]:
temp_label = [i[0] for i in label_column] 
temp_lr = [i[0] for i in lr_column]
temp_mlp = [i[0] for i in mlp_column] 
temp_nb = [i[0] for i in nb_column] 
temp_dt = [i[0] for i in dt_column]

In [112]:
X = np.array([temp_lr,temp_mlp, temp_nb, temp_dt]).transpose()
y = np.array([temp_label]).transpose()

In [113]:
learned_weights = learn_coefficients(X, y, 0.001, 1000)

In [114]:
print "Learned Weights for(without using normalized features):"
print "Logistic Regression", learned_weights[0]/sum(learned_weights) 
print "Multilayer Perceptron", learned_weights[1]/sum(learned_weights) 
print "Naive Bayes", learned_weights[2]/sum(learned_weights)
print "Decision Tree", learned_weights[3]/sum(learned_weights)

Learned Weights for(without using normalized features):
Logistic Regression -1.46949953889
Multilayer Perceptron 0.306251618409
Naive Bayes 0.387278623288
Decision Tree 1.77596929719


## Using SVM Classifier

In [115]:
svm_rbf.fit(X, np.ravel(y))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [116]:
svm_rbf_predictions_2 = svm_rbf.predict(X)

In [117]:
tn, fp, fn, tp = confusion_matrix(y, svm_rbf_predictions_2).ravel()
precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.652777777778
recall:  0.68115942029
specificity:  0.615384615385
accuracy:  0.649253731343
f1_score:  0.666666666667


## Using Logistic Regression

In [118]:
meta_lr_2 = linear_model.LogisticRegression(solver='lbfgs')
meta_lr_2.fit(X, y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [119]:
meta_lr_predictions_2 = meta_lr_2.predict(X)

In [120]:
tn, fp, fn, tp = confusion_matrix(y, meta_lr_predictions_2).ravel()
precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.654411764706
recall:  0.644927536232
specificity:  0.638461538462
accuracy:  0.641791044776
f1_score:  0.649635036496


In [121]:
meta_learned_weights_2 = meta_lr_2.coef_

In [122]:
print "Learned Weights for classifiers using logistic regression as the meta-classifier:"
print "Logistic Regression", meta_learned_weights[0][0]/sum(meta_learned_weights[0])
print "Multilayer Perceptron", meta_learned_weights[0][1]/sum(meta_learned_weights[0])
print "Naive Bayes", meta_learned_weights[0][2]/sum(meta_learned_weights[0])
print "Decision Tree", meta_learned_weights[0][3]/sum(meta_learned_weights[0])

Learned Weights for classifiers using logistic regression as the meta-classifier:
Logistic Regression 0.260974306538
Multilayer Perceptron 0.595458210065
Naive Bayes -0.0721486597383
Decision Tree 0.215716143136


## Combining the user-specific and TF-IDF features

In [123]:
from pyspark.ml.feature import VectorAssembler

In [124]:
l1NormData.printSchema()

root
 |-- username: string (nullable = true)
 |-- label: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- normFeatures: vector (nullable = true)



In [126]:
scaledData.printSchema()

root
 |-- user: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [128]:
l1NormData.createOrReplaceTempView("tweetfeatures")
scaledData.createOrReplaceTempView("userfeatures")

In [129]:
dataset = spark.sql("select a.username, a.label, a.features as tweetfeatures, a.normFeatures, b.features as userfeatures, b.scaledFeatures from tweetfeatures as a, userfeatures as b where a.username=b.user")

In [130]:
dataset.show()

+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|       username|label|       tweetfeatures|        normFeatures|        userfeatures|      scaledFeatures|
+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|    TeamGeekFam|    0|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|   [336.0,17.0,33.0]|[-0.5572820164771...|
| ajrandomtweets|    0|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[10418.0,203.0,25.0]|[-0.3246126164903...|
|       seoulazy|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[32655.0,1542.0,5...|[0.18856626146088...|
|     uV_Fanatic|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[14838.0,237.0,65...|[-0.2226091699578...|
|       Dvnyells|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[20134.0,242.0,36...|[-0.1003896557414...|
|     AxelLove21|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|    [4587.0,0.0,0.0]|[-0.4591787017237...|
|   polnareffbot|    0|(20,[

In [218]:
from pyspark.ml.feature import VectorAssembler
assembler_1 = VectorAssembler(
    inputCols=["tweetfeatures", "userfeatures"], outputCol="features")
assembler_2 = VectorAssembler(
    inputCols=["normFeatures", "scaledFeatures"], outputCol="standardizedfeatures")

assembling_pipeline = Pipeline(stages=[assembler_1, assembler_2])

In [219]:
combinedDataDF = assembler_2.transform(assembler_1.transform(dataset))

In [220]:
combinedDataDF.show()

+---------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       username|label|       tweetfeatures|        normFeatures|        userfeatures|      scaledFeatures|            features|standardizedfeatures|
+---------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    TeamGeekFam|    0|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|   [336.0,17.0,33.0]|[-0.5572820164771...|[3.22958828345280...|[0.05115337139413...|
| ajrandomtweets|    0|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[10418.0,203.0,25.0]|[-0.3246126164903...|[38.3270416771206...|[0.05686493046418...|
|       seoulazy|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[32655.0,1542.0,5...|[0.18856626146088...|[10.7627002313619...|[0.04181716542838...|
|     uV_Fanatic|    1|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[14838.0,237.0,65...|[-0.2226091699

In [168]:
#csvDatasetRDD = datasetRDD.map(lambda line: [line[0], line[1], line[2].values.tolist() + line[4].values.tolist(), line[3].values.tolist() + line[5].values.tolist()])

In [145]:
#csvDatasetRDD.saveAsTextFile("dataset")

In [150]:
#def toCSVLine(data):
#    return ','.join(str(d) for d in data)

#lines = csvDatasetRDD.map(toCSVLine).collect()

In [152]:
#with open("dataset.csv", "a+") as f: 
#    for line in lines: 
#        f.write(line+"\n")

In [214]:
#from pyspark.ml.linalg import DenseVector, SparseVector
#from pyspark.ml.linalg import Vectors
#from pyspark.ml.feature import VectorAssembler

In [215]:
#combinedDataRDD = csvDatasetRDD.map(lambda line: [line[0], line[1], Vectors.sparse(line[2]), Vectors.sparse(line[3])])

In [216]:
#combinedDataDF = spark.createDataFrame(combinedDataRDD, ["username", "label", "features", "normFeatures"])

## Evaluating Combined Features

In [249]:
combined_splits = combinedDataDF.randomSplit([0.6, 0.4])

combinedTrainData = combined_splits[0].select("label", "features")
combinedTestData = combined_splits[1].select("label", "features")
combinedTrainDataScaled = combined_splits[0].selectExpr("label", "standardizedfeatures as features")
combinedTestDataScaled = combined_splits[1].selectExpr("label", "standardizedfeatures as features")

In [222]:
combinedTrainData.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[3.22958828345280...|
|    1|[10.7627002313619...|
|    1|[0.14007852795698...|
|    0|[12.2490890557944...|
|    1|[0.07003926397849...|
|    1|[17.5876373990441...|
|    1|[19.2685797345280...|
|    0|[11.9378034381122...|
|    0|[0.40467130298685...|
|    1|[12.1323569491636...|
|    1|[11.2841036409796...|
|    0|[8.80938298040620...|
|    1|[1.70428875681003...|
|    0|[5.43193402855435...|
|    0|[1.48638882443249...|
|    1|[21.3386290921146...|
|    1|[2.38911711571086...|
|    0|[0.44358200519713...|
|    1|[16.6070877033452...|
|    1|[1.57199236929509...|
+-----+--------------------+
only showing top 20 rows



In [223]:
combinedTestData.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[38.3270416771206...|
|    1|[11.8288534719235...|
|    1|[11.5953892586618...|
|    1|[10.5370181585424...|
|    0|[18.6226620778374...|
|    0|[8.70821515465948...|
|    1|[14.7238097163679...|
|    1|[5.13621269175626...|
|    1|[20.1090509022699...|
|    0|[12.0311891234169...|
|    1|[14.6615525928315...|
|    1|[14.1712777449820...|
|    1|[0.70039263978494...|
|    1|[22.5370787201911...|
|    0|[3.80546667616487...|
|    0|[10.3113360857228...|
|    1|[11.6265178204300...|
|    0|[15.4086380752687...|
|    0|[6.58369081397848...|
|    1|[11.1128965512544...|
+-----+--------------------+
only showing top 20 rows



In [248]:
combinedTrainDataScaled.show(truncate=False)

+-----+-----------------------------------------------------------------+
|label|features                                                         |
+-----+-----------------------------------------------------------------+
|0    |[-0.557282016477104,-0.20716859902950865,-0.131217318563865]     |
|1    |[0.1885662614608838,0.05202307605378059,0.07912775883712071]     |
|1    |[-0.4913720971791401,-0.20920814007934438,-0.1323081639223465]   |
|0    |[-0.3116429474968774,-0.15855953734175735,-0.1262520914149147]   |
|1    |[-0.5648515030071222,-0.20631879025874378,-0.13223293320796847]  |
|1    |[-0.33268981248278184,-0.1162390605576662,-0.11989509604997076]  |
|1    |[-0.4997262256055627,0.6399207836689266,-0.04646991881700864]    |
|0    |[-0.29089609264783345,-0.052503402750299986,-0.12497316927048811]|
|0    |[-0.562382188803732,-0.18609334151453957,-0.13200724106483439]   |
|1    |[0.38624986394910443,0.7981551767853476,0.3085438223329383]      |
|1    |[-0.06452690553518389,-0.108760

In [225]:
combinedTestDataScaled.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[-0.3246126164903...|
|    1|[-0.2226091699578...|
|    1|[-0.1003896557414...|
|    1|[-0.4591787017237...|
|    0|[-0.4043691574896...|
|    0|[-0.3304051199020...|
|    1|[-0.0912739631214...|
|    1|[-0.5300964825369...|
|    1|[-0.3505750321802...|
|    0|[0.03805348356826...|
|    1|[3.87857555633060...|
|    1|[-0.3564829241061...|
|    1|[-0.5620360232612...|
|    1|[1.29518826782501...|
|    0|[-0.5474970704748...|
|    0|[-0.3544520862565...|
|    1|[0.25950711997696...|
|    0|[-0.3902225256514...|
|    0|[-0.5250655433188...|
|    1|[0.21448252174506...|
+-----+--------------------+
only showing top 20 rows



## Logistic Regression

### Evaluating unscaled features

In [231]:
lr_cvModel_5 = lr_cv.fit(combinedTrainData)
lr_result_5 = lr_cvModel_5.transform(combinedTestData)

In [232]:
lr_evaluator_5 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_5.evaluate(lr_result_5, {lr_evaluator_5.metricName: "accuracy"})

0.6498054474708171

In [233]:
result = lr_result_5.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.63503649635
recall:  0.685039370079
specificity:  0.615384615385
accuracy:  0.649805447471
f1_score:  0.659090909091


### Evaluating Scaled features

In [250]:
lr_cvModel_6 = lr_cv.fit(combinedTrainDataScaled)
lr_result_6 = lr_cvModel_6.transform(combinedTestDataScaled)

In [251]:
lr_evaluator_6 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_6.evaluate(lr_result_6, {lr_evaluator_6.metricName: "accuracy"})

0.6206896551724138

In [252]:
result = lr_result_6.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.603773584906
recall:  0.581818181818
specificity:  0.655737704918
accuracy:  0.620689655172
f1_score:  0.592592592593


## MLP

In [238]:
layers_3 = [23, 5, 4, 2]

mlp_3 = MultilayerPerceptronClassifier(maxIter=100, layers=layers_3, blockSize=128, seed=1234)

mlp_pipeline_3 = Pipeline(stages=[mlp_3])

### Evaluating unscaled features

In [240]:
mlp_model_5 = mlp_pipeline_3.fit(combinedTrainData)
mlp_result_5 = mlp_model_5.transform(combinedTestData)

In [241]:
mlp_evaluator_5 = MulticlassClassificationEvaluator(predictionCol="prediction")
mlp_evaluator_5.evaluate(mlp_result_5, {mlp_evaluator_5.metricName: "accuracy"})

0.5408560311284046

In [242]:
result = mlp_result_5.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.8
recall:  0.0944881889764
specificity:  0.976923076923
accuracy:  0.540856031128
f1_score:  0.169014084507


### Evaluating Scaled features

In [253]:
mlp_model_6 = mlp_pipeline_3.fit(combinedTrainDataScaled)
mlp_result_6 = mlp_model_6.transform(combinedTestDataScaled)

In [254]:
mlp_evaluator_6 = MulticlassClassificationEvaluator(predictionCol="prediction")
mlp_evaluator_6.evaluate(mlp_result_6, {mlp_evaluator_6.metricName: "accuracy"})

0.6120689655172413

In [255]:
result = mlp_result_6.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.590909090909
recall:  0.590909090909
specificity:  0.631147540984
accuracy:  0.612068965517
f1_score:  0.590909090909


## Naive Bayes

### Evaluating unscaled features

In [244]:
nb_model_5 = cv.fit(combinedTrainData)
nb_result_5 = nb_model_5.transform(combinedTestData)

In [245]:
nb_evaluator_5 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_5.evaluate(nb_result_5, {nb_evaluator_5.metricName: "accuracy"})

0.5642023346303502

In [246]:
result = nb_result_5.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.549019607843
recall:  0.661417322835
specificity:  0.469230769231
accuracy:  0.56420233463
f1_score:  0.6


### Evaluating Scaled features

In [None]:
nb_model_6 = cv.fit(combinedTrainDataScaled)
nb_result_6 = nb_model_5.transform(combinedTestDataScaled)

In [None]:
nb_evaluator_6 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_6.evaluate(nb_result_6, {nb_evaluator_6.metricName: "accuracy"})

In [None]:
result = nb_result_6.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

## Decision Tree

### Evaluating unscaled features

In [263]:
dt_model_5 = dt_cv.fit(combinedTrainData)
dt_result_5 = dt_model_5.transform(combinedTestData)

In [264]:
dt_evaluator_5 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_5.evaluate(dt_result_5, {dt_evaluator_5.metricName: "accuracy"})

0.5991379310344828

In [265]:
result = dt_result_5.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.571428571429
recall:  0.618181818182
specificity:  0.581967213115
accuracy:  0.599137931034
f1_score:  0.593886462882


### Evaluating Scaled features

In [266]:
dt_model_6 = dt_cv.fit(combinedTrainDataScaled)
dt_result_6 = dt_model_6.transform(combinedTestDataScaled)

In [267]:
dt_evaluator_6 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_6.evaluate(dt_result_6, {dt_evaluator_6.metricName: "accuracy"})

0.6293103448275862

In [268]:
result = dt_result_5.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.571428571429
recall:  0.618181818182
specificity:  0.581967213115
accuracy:  0.599137931034
f1_score:  0.593886462882
