In [1]:
import sqlite3
from pyspark.context import SparkContext
from os import path
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [2]:
sc = SparkContext.getOrCreate()

In [3]:
alltweets = []

## Connecting to the database

In [4]:
def create_connection(database):
    try:
        conn = sqlite3.connect(database)
        return conn
    except Exception as e:
        print(e)

    return None

In [5]:
def getTweets(username): 
    sql = '''select tweet_text from tweets where user_screen_name=?'''
    database = "../twitterApp/twitter.sqlite"
    conn = create_connection(database)
    
    cur = conn.cursor()
    cur.execute(sql,(username,))
    
    tweets = cur.fetchall()
    
    usertweets = []
    
    for tweet in tweets: 
        
        usertweets.append((tweet[0]))
    
    #alltweets.extend(' '.join(usertweets))
    
    return usertweets

In [6]:
def getTrueNegatives(database):
    conn = create_connection(database)
    sql = '''select user_screen_name from search_results where isDepressed=0 ORDER BY ROWID DESC LIMIT 322'''
    cur = conn.cursor()
    cur.execute(sql)
    users = cur.fetchall()
    
    return users
    #for user in users: 
    #   user_tweets = getTweets(user)

In [7]:
def getTruePositives(database):    
    conn = create_connection(database)
    sql = '''select user_screen_name from search_results where isDepressed="True"'''
    cur = conn.cursor()
    cur.execute(sql)
    users = cur.fetchall()
    
    return users
    #for user in users: 
    #    user_tweets = getTweets(str(user))

In [8]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

def gettfidf(usertweets): 
    temp_df = spark.createDataFrame(usertweets,['tweet'])
    tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
    wordsData = tokenizer.transform(temp_df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    return rescaledData.select("features")

In [9]:
test = getTruePositives("../twitterApp/twitter.sqlite")

In [10]:
alltweets = []

In [11]:
def remove_nonascii(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

In [12]:
for user in test: 
    t = getTweets(user[0])    
    alltweets.append(remove_nonascii(' '.join(t)))

## Generating the RDDs for users

In [13]:
trueNegatives = getTrueNegatives("../twitterApp/twitter.sqlite")
truePositives = getTruePositives("../twitterApp/twitter.sqlite")

In [14]:
alltweets = []

In [15]:
for user in trueNegatives: 
    t = getTweets(user[0]) 
    alltweets.append([user[0], 0, remove_nonascii(' '.join(t))])

for user in truePositives:
    t = getTweets(user[0])
    alltweets.append([user[0], 1, remove_nonascii(' '.join(t))]) 

In [16]:
tweetRDD = sc.parallelize(alltweets).cache()

In [17]:
tweetDF = tweetRDD.toDF(["username","label","tweet"])

In [18]:
tweetDF.printSchema()

root
 |-- username: string (nullable = true)
 |-- label: long (nullable = true)
 |-- tweet: string (nullable = true)



## Computing TF-IDF

In [19]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(tweetDF)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
tfidftweets = idfModel.transform(featurizedData)

In [20]:
tfidftweets.show()

+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|       username|label|               tweet|               words|         rawFeatures|            features|
+---------------+-----+--------------------+--------------------+--------------------+--------------------+
|   PopoffSierra|    0|RT @jacksfilms: "...|[rt, @jacksfilms:...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|      dhenwikan|    0|Carnival and He -...|[carnival, and, h...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
| JosephineAlice|    0|Guess I'll just r...|[guess, i'll, jus...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|   JoseJaimes95|    0|Being able to mut...|[being, able, to,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|      lyriczbot|    0|I miss that happy...|[i, miss, that, h...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|  grillmeeting1|    0|@1HkipUrdKrXXzdk ...|[@1hkipurdkrxxzdk...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|bustercalamity1|    0|@marc

## Normalizing the vectors

In [21]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(tfidftweets)
print("Normalized using L^1 norm")
l1NormData.select("normFeatures").show(truncate=False)

Normalized using L^1 norm
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|normFeatures                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+-----------------------------

## Helper function

In [22]:
from __future__ import division
def measures(tp,tn,fp,fn):
    try: 
        precision = tp/(tp+fp)
    except ZeroDivisionError as e: 
        precision = -1
    
    try:
        recall = tp/(tp+fn)
    except ZeroDivisionError as e: 
        recall = -1   
    
    try:
        specificity = tn/(tn+fp)
    except ZeroDivisionError as e: 
        specificity = -1
    
    try:
        accuracy = (tp+tn)/(tp+tn+fp+fn)
    except ZeroDivisionError as e: 
        accuracy = -1
    
    try:
        f1_score = (2*tp)/((2*tp)+fp+fn)
    except ZeroDivisionError as e: 
        f1_score = -1
    return precision, recall, specificity, accuracy, f1_score

def howgoodisit(result):
    true_positives = result.filter(lambda line: line[1]==0.0 and line[0]==0.0).count()
    true_negatives = result.filter(lambda line: line[1]==1.0 and line[0]==1.0).count()
    false_positives = result.filter(lambda line: line[1]==0.0 and line[0]==1.0).count()
    false_negatives = result.filter(lambda line: line[1]==1.0 and line[0]==0.0).count()
    return measures(true_positives, true_negatives, false_positives, false_negatives)

## Evaluating NormFeatures and TF-IDF features

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [24]:
splits = l1NormData.randomSplit([0.6, 0.4])
training_df = splits[0]
test_df = splits[1]

## Logistic Regression

In [25]:
lr = (LogisticRegression()).setMaxIter(1000).setRegParam(0.01)

lr_pipeline = Pipeline(stages=[lr])

lr_paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 0.5,1.0, 2.0]).build()

lr_cv = CrossValidator(estimator=lr_pipeline, 
                    estimatorParamMaps=lr_paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF features

In [26]:
trainData = training_df.select("label", "features")
testData = test_df.select("label", "features")

In [27]:
lr_cvModel1 = lr_cv.fit(trainData)
lr_result1 = lr_cvModel1.transform(testData)

In [28]:
lr_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator.evaluate(lr_result1, {lr_evaluator.metricName: "accuracy"})

0.6731517509727627

In [29]:
result = lr_result1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.650793650794
recall:  0.672131147541
specificity:  0.674074074074
accuracy:  0.673151750973
f1_score:  0.661290322581


## Evaluating L1 Norm features

In [30]:
L1trainData = training_df.selectExpr("label", "normFeatures as features")
L1testData = test_df.selectExpr("label", "normFeatures as features")

In [31]:
lr_cvModel2 = lr_cv.fit(L1trainData)
lr_result2 = lr_cvModel2.transform(L1testData)

In [32]:
lr_evaluator_norm = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_norm.evaluate(lr_result2, {lr_evaluator_norm.metricName: "accuracy"})

0.6809338521400778

In [33]:
result = lr_result2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.675438596491
recall:  0.631147540984
specificity:  0.725925925926
accuracy:  0.68093385214
f1_score:  0.652542372881


## Naive Bayes

In [34]:
nb = NaiveBayes()
pipeline = Pipeline(stages=[nb])
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()


cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF Features

In [35]:
nb_model1 = cv.fit(trainData)
nb_result = nb_model1.transform(testData)

In [36]:
nb_evaluator_1 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_1.evaluate(nb_result, {nb_evaluator_1.metricName: "accuracy"})

0.48249027237354086

In [37]:
result = nb_result.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.475982532751
recall:  0.893442622951
specificity:  0.111111111111
accuracy:  0.482490272374
f1_score:  0.621082621083


## Evaluating L1-Norm Features

In [38]:
nb_model2 = cv.fit(L1trainData)
nb_result_2 = nb_model2.transform(L1testData)

In [39]:
nb_evaluator_2 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_2.evaluate(nb_result_2, {nb_evaluator_2.metricName: "accuracy"})

0.47470817120622566

In [40]:
result = nb_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.474708171206
recall:  1.0
specificity:  0.0
accuracy:  0.474708171206
f1_score:  0.643799472296


## Decision Tree 

In [41]:
dt = DecisionTreeClassifier()


dt_pipeline = Pipeline(stages=[dt])
dt_paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1,2,6,10])
             .addGrid(dt.maxBins, [20,40,80])
             .build())

dt_cv = CrossValidator(estimator=dt_pipeline, 
                    estimatorParamMaps=dt_paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

## Evaluating TF-IDF Features

In [42]:
dt_model_1 = dt_cv.fit(trainData)
dt_result_1 = dt_model_1.transform(testData)

In [43]:
dt_evaluator_1 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_1.evaluate(dt_result_1, {dt_evaluator_1.metricName: "accuracy"})

0.6108949416342413

In [44]:
result = dt_result_1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.677419354839
recall:  0.344262295082
specificity:  0.851851851852
accuracy:  0.610894941634
f1_score:  0.45652173913


## Evaluating L1-Norm features

In [45]:
dt_model_2 = dt_cv.fit(L1trainData)
dt_result_2 = dt_model_2.transform(L1testData)

In [46]:
dt_evaluator_2 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_2.evaluate(dt_result_2, {dt_evaluator_2.metricName: "accuracy"})

0.6303501945525292

In [47]:
result = dt_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.619469026549
recall:  0.573770491803
specificity:  0.681481481481
accuracy:  0.630350194553
f1_score:  0.595744680851


## MLP 

In [48]:
layers = [20, 5, 4, 2]

mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

mlp_pipeline = Pipeline(stages=[mlp])

## Evaluating TF-IDF Features

In [49]:
mlp_model_1 = mlp_pipeline.fit(trainData)
mlp_result_1 = mlp_model_1.transform(testData)

In [50]:
mlp_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator.evaluate(mlp_result_1, {mlp_evaluator.metricName: "accuracy"})

0.669260700389105

In [51]:
result = mlp_result_1.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.803278688525
recall:  0.401639344262
specificity:  0.911111111111
accuracy:  0.669260700389
f1_score:  0.535519125683


## Evaluating L1-Norm Features

In [52]:
mlp_model_2 = mlp_pipeline.fit(L1trainData)
mlp_result_2 = mlp_model_2.transform(L1testData)

In [53]:
mlp_evaluator_2 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_2.evaluate(mlp_result_2, {mlp_evaluator_2.metricName: "accuracy"})

0.6264591439688716

In [54]:
result = mlp_result_2.select("label", "prediction").rdd.map(list)

precision, recall, specificity, accuracy, f1_score = howgoodisit(result)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.647727272727
recall:  0.467213114754
specificity:  0.77037037037
accuracy:  0.626459143969
f1_score:  0.542857142857


In [55]:
from sgd import learn_coefficients
import numpy as np

## Converting the different classifier outputs to a numpy array

In [56]:
lr_result1.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|[0.06515744216991...|[0.51628359995859...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.54663345557991...|[0.63335417484533...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.14942615375561...|[0.53728718467711...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.33136720281626...|[0.58209200111472...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[1.70106693253995...|[0.84567403107136...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[3.90337710723941...|[0.98022526136651...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[0.42617525181579...|[0.60495998500858...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-0.2387149948753...|[0.44060304430268...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|[0.96388016388587...|[0.72389800560939...|       0.0|
|    0|(20,[0,1,

In [57]:
nb_result.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|[-715.95208727958...|[0.17736133564222...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|[-1391.4741301063...|[0.99998174046100...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1592.6972640193...|[0.99651028125347...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-996.23488885688...|[0.99927959250167...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1170.2822817213...|[0.99500349050029...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-955.26887608418...|[0.99999797299938...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-227.82468383076...|[0.89728979880548...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-921.53963071971...|[0.97025640607397...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|[-1074.1477689233...|[0.96677171461823...|       0.0|
|    0|(20,[0,1,

In [58]:
dt_result_1.show()

+-----+--------------------+-------------+--------------------+----------+
|label|            features|rawPrediction|         probability|prediction|
+-----+--------------------+-------------+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|  [66.0,80.0]|[0.45205479452054...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|    [5.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|   [13.0,0.0]|           [1.0,0.0]|       0.0|
|    0|(20,[0,1,2,3,4,5,...|   [22.0,6.0]|[0.78571428571428...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|  [66.0,80.0]|[0.45205479452054...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|  [66.0,80.0]|[0.45205479452054...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|  [66.0,80.0]|[0.45205479452054...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|  [16.0,48.0]|         [0.25,0.75]|       1.0|
|    0|(20,[0,1,2,3,4,5,...|   [22.0,6.0]|[0.78571428571428...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|  [66.0,80.0]|[0.45205479452054...|       1.0|
|    0|(20,[0,1,2,4,5,6,.

In [59]:
mlp_result_1.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,4,5,6,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       0.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
|    0|(20,[0,1,2,3,4,5,...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



## Linear Combination of Classifiers

In [60]:
label_column = lr_result1.select("label").collect()
lr_column = lr_result1.select("prediction").collect()
mlp_column = mlp_result_1.select("prediction").collect()
nb_column = nb_result.select("prediction").collect()
dt_column = dt_result_1.select("prediction").collect()

In [61]:
temp_label = [i[0] for i in label_column]
temp_lr = [i[0] for i in lr_column]
temp_mlp = [i[0] for i in mlp_column]
temp_nb = [i[0] for i in nb_column]
temp_dt = [i[0] for i in dt_column]

In [62]:
X = np.array([temp_lr,temp_mlp, temp_nb, temp_dt]).transpose()
y = np.array([temp_label]).transpose()

In [63]:
learned_weights = learn_coefficients(X, y, 0.001, 1000)

In [64]:
print "Learned Weights for(without using normalized features):"
print "Logistic Regression", learned_weights[0]/sum(learned_weights)
print "Multilayer Perceptron", learned_weights[1]/sum(learned_weights)
print "Naive Bayes", learned_weights[2]/sum(learned_weights)
print "Decision Tree", learned_weights[3]/sum(learned_weights)

Learned Weights for(without using normalized features):
Logistic Regression -0.992288006554
Multilayer Perceptron 1.36996061338
Naive Bayes 0.0815590240696
Decision Tree 0.540768369103


## Combining the classifier outputs with the help of another classifier

In [65]:
#Using sklearn SVM here as Spark doesn't support SVM with RBF
from sklearn import svm

In [66]:
svm_rbf = svm.SVC()
svm_rbf.fit(X,np.ravel(y))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [67]:
svm_rbf_predictions = svm_rbf.predict(X)

In [68]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y, svm_rbf_predictions).ravel()

precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.655555555556
recall:  0.874074074074
specificity:  0.491803278689
accuracy:  0.692607003891
f1_score:  0.749206349206


## combination of classifiers with logistic regression classifier

In [69]:
from sklearn import linear_model

In [74]:
meta_lr = linear_model.LogisticRegression(solver='lbfgs')
meta_lr.fit(X, np.ravel(y))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [75]:
meta_lr_predictions = meta_lr.predict(X)

In [76]:
tn, fp, fn, tp = confusion_matrix(y, meta_lr_predictions).ravel()

precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)

print "precision: ", precision
print "recall: ", recall
print "specificity: ", specificity
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.655172413793
recall:  0.844444444444
specificity:  0.508196721311
accuracy:  0.684824902724
f1_score:  0.73786407767


In [77]:
meta_learned_weights = meta_lr.coef_

print "Learned Weights for classifiers using logistic regression as the meta-classifier:"
print "Logistic Regression", meta_learned_weights[0][0]/sum(meta_learned_weights[0])
print "Multilayer Perceptron", meta_learned_weights[0][1]/sum(meta_learned_weights[0])
print "Naive Bayes", meta_learned_weights[0][2]/sum(meta_learned_weights[0])
print "Decision Tree", meta_learned_weights[0][3]/sum(meta_learned_weights[0])

Learned Weights for classifiers using logistic regression as the meta-classifier:
Logistic Regression 0.370843180805
Multilayer Perceptron 0.532148078738
Naive Bayes -0.134881083998
Decision Tree 0.231889824455


## Using User Specific Features with the classifiers

In [78]:
def fetch_user_data(username): 
    conn = create_connection("../twitterApp/twitter.sqlite")
    sql = '''select num_status, num_friends, num_followers, isDepressed from users where user_screen_name=?'''
    cur = conn.cursor()
    cur.execute(sql,(username,))
    
    a = cur.fetchall()
    
    return [a[0][0], a[0][1], a[0][2], a[0][3]]

In [83]:
all_users = (trueNegatives + truePositives)

In [84]:
userRDD = sc.parallelize(all_users).map(lambda line: line[0])

In [85]:
userRDD.take(5)

[u'PopoffSierra',
 u'dhenwikan',
 u'JosephineAlice',
 u'JoseJaimes95',
 u'lyriczbot']

In [91]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler

In [89]:
userDataRDD = userRDD.map(lambda line : [str(line), fetch_user_data(str(line))])\
                     .map(lambda line: [line[0], line[1][:3], line[1][-1]])\
                     .map(lambda line: [line[0], Vectors.dense(line[1]), line[2]])
                    

In [90]:
userDataRDD.take(10)

[['PopoffSierra', DenseVector([602.0, 100.0, 22.0]), 0],
 ['dhenwikan', DenseVector([935.0, 241.0, 234.0]), 0],
 ['JosephineAlice', DenseVector([50516.0, 828.0, 2066.0]), 0],
 ['JoseJaimes95', DenseVector([11049.0, 279.0, 360.0]), 0],
 ['lyriczbot', DenseVector([89519.0, 5.0, 20.0]), 0],
 ['grillmeeting1', DenseVector([99.0, 90.0, 4.0]), 0],
 ['bustercalamity1', DenseVector([99.0, 104.0, 5.0]), 0],
 ['_Coopavelli', DenseVector([223653.0, 2309.0, 3683.0]), 0],
 ['LeahEbooks', DenseVector([10578.0, 1.0, 28.0]), 0],
 ['KeyOfConceit', DenseVector([90.0, 206.0, 1571.0]), 0]]

In [92]:
userDataDF = spark.createDataFrame(userDataRDD, ["user","features", "label"])
userDataDF.printSchema()

root
 |-- user: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: long (nullable = true)



In [93]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scalerModel = scaler.fit(userDataDF)
scaledData = scalerModel.transform(userDataDF)
scaledData.show()

+---------------+--------------------+-----+--------------------+
|           user|            features|label|      scaledFeatures|
+---------------+--------------------+-----+--------------------+
|   PopoffSierra|  [602.0,100.0,22.0]|    0|[-0.5511433475228...|
|      dhenwikan| [935.0,241.0,234.0]|    0|[-0.5434584724786...|
| JosephineAlice|[50516.0,828.0,20...|    0|[0.60075711180434...|
|   JoseJaimes95|[11049.0,279.0,36...|    0|[-0.3100505860012...|
|      lyriczbot|  [89519.0,5.0,20.0]|    0|[1.50085675549429...|
|  grillmeeting1|     [99.0,90.0,4.0]|    0|[-0.5627514320490...|
|bustercalamity1|    [99.0,104.0,5.0]|    0|[-0.5627514320490...|
|    _Coopavelli|[223653.0,2309.0,...|    0|[4.59636134762072...|
|     LeahEbooks|  [10578.0,1.0,28.0]|    0|[-0.3209201840367...|
|   KeyOfConceit| [90.0,206.0,1571.0]|    0|[-0.5629591313746...|
| goofy_goober33|      [1.0,25.0,2.0]|    0|[-0.5650130469269...|
|      kalenstar|[28756.0,500.0,79...|    0|[0.09858629810557...|
|        V

## Logistic Regression

In [94]:
userSplits = scaledData.randomSplit([0.6, 0.4])
userTraindf = userSplits[0]
userTestdf = userSplits[1]

In [95]:
userTrainData = userTraindf.select("label", "features")
userTestData = userTestdf.select("label", "features")
userTrainDataScaled = userTraindf.selectExpr("label", "scaledFeatures as features")
userTestDataScaled = userTestdf.selectExpr("label", "scaledFeatures as features")

## Evaluating unscaled features

In [96]:
lr_cvModel_3 = lr_cv.fit(userTrainData)
lr_result_3 = lr_cvModel_3.transform(userTestData)

In [97]:
lr_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_3.evaluate(lr_result_3, {lr_evaluator_3.metricName: "accuracy"})

0.5524193548387096

In [98]:
result = lr_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.519607843137
recall:  0.890756302521
specificity:  0.240310077519
accuracy:  0.552419354839
f1_score:  0.656346749226


## Evaluating Scaled features

In [99]:
lr_cvModel_4 = lr_cv.fit(userTrainDataScaled)
lr_result_4 = lr_cvModel_4.transform(userTestDataScaled)

In [100]:
lr_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_evaluator_4.evaluate(lr_result_4, {lr_evaluator_4.metricName: "accuracy"})

0.5524193548387096

In [101]:
result = lr_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.519607843137
recall:  0.890756302521
specificity:  0.240310077519
accuracy:  0.552419354839
f1_score:  0.656346749226


## Naive Bayes

## Evaluating unscaled features

In [102]:
nb_model_3 = cv.fit(userTrainData)
nb_result_3 = nb_model_3.transform(userTestData)

In [103]:
nb_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_3.evaluate(nb_result_3, {nb_evaluator_3.metricName: "accuracy"})

0.6169354838709677

In [104]:
result = nb_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.592307692308
recall:  0.647058823529
specificity:  0.589147286822
accuracy:  0.616935483871
f1_score:  0.618473895582


## Evaluating Scaled features

In [105]:
nb_model_4 = cv.fit(userTrainData)
nb_result_4 = nb_model_4.transform(userTestData)

In [106]:
nb_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_evaluator_4.evaluate(nb_result_4, {nb_evaluator_4.metricName: "accuracy"})

0.6169354838709677

In [107]:
result = nb_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.592307692308
recall:  0.647058823529
specificity:  0.589147286822
accuracy:  0.616935483871
f1_score:  0.618473895582


## MLP

In [110]:
layers_2 = [3, 5, 4, 2]

mlp_2 = MultilayerPerceptronClassifier(maxIter=100, layers=layers_2, blockSize=128, seed=1234)

mlp_pipeline_2 = Pipeline(stages=[mlp_2])

## Evaluating unscaled features

In [111]:
mlp_model_3 = mlp_pipeline_2.fit(userTrainData)
mlp_result_3 = mlp_model_3.transform(userTestData)

In [112]:
mlp_evaluator_3 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_3.evaluate(mlp_result_3, {mlp_evaluator_3.metricName: "accuracy"})

0.6370967741935484

In [113]:
result = mlp_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.698630136986
recall:  0.428571428571
specificity:  0.829457364341
accuracy:  0.637096774194
f1_score:  0.53125


## Evaluating Scaled features

In [115]:
mlp_model_4 = mlp_pipeline_2.fit(userTrainDataScaled)
mlp_result_4 = mlp_model_4.transform(userTestDataScaled)

In [116]:
mlp_evaluator_4 = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_evaluator_4.evaluate(mlp_result_4, {mlp_evaluator_4.metricName: "accuracy"})

0.6008064516129032

In [117]:
result = mlp_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.578125
recall:  0.621848739496
specificity:  0.581395348837
accuracy:  0.600806451613
f1_score:  0.599190283401


## Decision Tree

## Evaluating unscaled features

In [118]:
dt_model_3 = dt_cv.fit(userTrainData)
dt_result_3 = dt_model_3.transform(userTestData)

In [119]:
dt_evaluator_3 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_3.evaluate(dt_result_3, {dt_evaluator_3.metricName: "accuracy"})

0.6008064516129032

In [120]:
result = dt_result_3.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.583333333333
recall:  0.588235294118
specificity:  0.612403100775
accuracy:  0.600806451613
f1_score:  0.585774058577


## Evaluating scaled features

In [121]:
dt_model_4 = dt_cv.fit(userTrainDataScaled)
dt_result_4 = dt_model_4.transform(userTestDataScaled)

In [122]:
dt_evaluator_4 = MulticlassClassificationEvaluator(predictionCol="prediction")
dt_evaluator_4.evaluate(dt_result_4, {dt_evaluator_4.metricName: "accuracy"})

0.6008064516129032

In [123]:
result = dt_result_4.select("label", "prediction").rdd.map(list)
precision, recall, specificity, accuracy, f1_score = howgoodisit(result)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.583333333333
recall:  0.588235294118
specificity:  0.612403100775
accuracy:  0.600806451613
f1_score:  0.585774058577


## Linear Combination of classifiers through SGD

In [124]:
label_column = lr_result_3.select("label").collect()
lr_column = lr_result_3.select("prediction").collect()
mlp_column = mlp_result_3.select("prediction").collect()
nb_column = nb_result_3.select("prediction").collect()
dt_column = dt_result_3.select("prediction").collect()

In [125]:
temp_label = [i[0] for i in label_column] 
temp_lr = [i[0] for i in lr_column]
temp_mlp = [i[0] for i in mlp_column] 
temp_nb = [i[0] for i in nb_column] 
temp_dt = [i[0] for i in dt_column]

In [126]:
X = np.array([temp_lr,temp_mlp, temp_nb, temp_dt]).transpose()
y = np.array([temp_label]).transpose()

In [130]:
learned_weights = learn_coefficients(X, y, 0.001, 1000)

In [131]:
print "Learned Weights for(without using normalized features):"
print "Logistic Regression", learned_weights[0]/sum(learned_weights) 
print "Multilayer Perceptron", learned_weights[1]/sum(learned_weights) 
print "Naive Bayes", learned_weights[2]/sum(learned_weights)
print "Decision Tree", learned_weights[3]/sum(learned_weights)

Learned Weights for(without using normalized features):
Logistic Regression -2.12687131776
Multilayer Perceptron 1.10054567575
Naive Bayes 1.28698299116
Decision Tree 0.739342650854


## Using SVM Classifier

In [132]:
svm_rbf.fit(X, np.ravel(y))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [133]:
svm_rbf_predictions_2 = svm_rbf.predict(X)

In [138]:
tn, fp, fn, tp = confusion_matrix(y, svm_rbf_predictions_2).ravel()
precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.633720930233
recall:  0.84496124031
specificity:  0.470588235294
accuracy:  0.665322580645
f1_score:  0.724252491694


## Using Logistic Regression

In [139]:
meta_lr_2 = linear_model.LogisticRegression(solver='lbfgs')
meta_lr_2.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [140]:
meta_lr_predictions_2 = meta_lr_2.predict(X)

In [142]:
tn, fp, fn, tp = confusion_matrix(y, meta_lr_predictions_2).ravel()
precision, recall, specificity, accuracy, f1_score = measures(tp,tn,fp,fn)
print "precision: ", precision 
print "recall: ", recall
print "specificity: ", specificity 
print "accuracy: ", accuracy
print "f1_score: ", f1_score

precision:  0.655405405405
recall:  0.751937984496
specificity:  0.571428571429
accuracy:  0.665322580645
f1_score:  0.70036101083


In [143]:
meta_learned_weights_2 = meta_lr_2.coef_

In [144]:
print "Learned Weights for classifiers using logistic regression as the meta-classifier:"
print "Logistic Regression", meta_learned_weights[0][0]/sum(meta_learned_weights[0])
print "Multilayer Perceptron", meta_learned_weights[0][1]/sum(meta_learned_weights[0])
print "Naive Bayes", meta_learned_weights[0][2]/sum(meta_learned_weights[0])
print "Decision Tree", meta_learned_weights[0][3]/sum(meta_learned_weights[0])

Learned Weights for classifiers using logistic regression as the meta-classifier:
Logistic Regression 0.370843180805
Multilayer Perceptron 0.532148078738
Naive Bayes -0.134881083998
Decision Tree 0.231889824455
