In [1]:
from numpy import array
from time import time
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.tree import GradientBoostedTrees

In [2]:
def parse_interaction(line):
  """Turn one comma-separated text record into a ``LabeledPoint``.

  The record is split on ``","``. Field 0 is ignored, field 1 becomes the
  label and every field from index 2 onward becomes the feature list.
  The pieces are handed over as the raw substrings produced by ``split``;
  any numeric conversion is left to ``LabeledPoint``.

  No per-record output is produced: this function is meant to be used
  inside RDD transformations, where it is invoked once per input line.

  :param line: a single text line of the CSV input.
  :return: ``LabeledPoint(label, features)`` built from the split fields.
  :raises IndexError: if the line has fewer than two comma-separated fields.
  """
  line_split = line.split(",")
  return LabeledPoint(line_split[1], line_split[2:])

In [3]:
def _to_labeled_point(row):
  """Build a LabeledPoint from one CSV row: field 1 is the label, fields 2+ the features."""
  fields = row.split(',')
  return LabeledPoint(fields[1], fields[2:])

# Training set: read the raw text lines, drop every line equal to the first
# line of the file (treated as the header), parse the rest and cache the result.
input_data = sc.textFile("dbfs:/FileStore/tables/7nd94uar1487386945609/merged.csv")
header = input_data.first()
train_data = (input_data
              .filter(lambda row: row != header)
              .map(_to_labeled_point)
              .cache())

# Test set: same pipeline.
# NOTE(review): rows are compared against the *training* file's header here;
# this assumes test.csv starts with an identical header line -- confirm.
test_input_data = sc.textFile("dbfs:/FileStore/tables/7nd94uar1487386945609/test.csv")
test_data = (test_input_data
             .filter(lambda row: row != header)
             .map(_to_labeled_point)
             .cache())

In [4]:
# Benchmark: train and evaluate logistic regression 10 times.
# Each entry appended to the list is the tuple
#   (train seconds, test seconds, accuracy, recall, precision, F1)
# with times rounded to 3 decimals and metrics to 4.
logistic_regression_results = []

for _ in range(10):
  # Wall-clock time spent fitting the model.
  t0 = time()
  logistic_model = LogisticRegressionWithLBFGS.train(train_data)
  trt = time() - t0

  # Lazy transformation: predictions are only computed by the action below,
  # so the test timer that follows includes the prediction work.
  labels_and_preds = test_data.map(lambda p: (p.label, logistic_model.predict(p.features)))
  t0 = time()
  # A single Spark job tallies every distinct (label, prediction) pair, so the
  # test set is scored once instead of once per confusion-matrix cell.
  pair_counts = labels_and_preds.countByValue()
  # Keys are (label, prediction); 0/1 compare and hash equal to 0.0/1.0,
  # so the lookups match whether Spark returns ints or floats.
  fp = float(pair_counts.get((0, 1), 0))
  fn = float(pair_counts.get((1, 0), 0))
  tp = float(pair_counts.get((1, 1), 0))
  tn = float(pair_counts.get((0, 0), 0))

  # Every ratio falls back to 0.0 when its denominator is zero (for example
  # when the model never predicts the positive class) instead of raising
  # ZeroDivisionError and aborting the whole benchmark.
  total = tp + tn + fp + fn
  test_accuracy = (tp + tn) / total if total else 0.0
  test_recall = tp / (tp + fn) if (tp + fn) else 0.0
  test_precision = tp / (fp + tp) if (fp + tp) else 0.0
  pr_sum = test_precision + test_recall
  test_f1 = 2 * test_precision * test_recall / pr_sum if pr_sum else 0.0
  tst = time() - t0
  logistic_regression_results.append((round(trt,3), round(tst,3), round(test_accuracy,4), round(test_recall,4), round(test_precision,4), round(test_f1,4)))

In [5]:
# Benchmark: train and evaluate a random-forest classifier 10 times.
# Each entry appended to the list is the tuple
#   (train seconds, test seconds, accuracy, recall, precision, F1)
# with times rounded to 3 decimals and metrics to 4.
random_forest_results = []

for _ in range(10):
  # Wall-clock time spent fitting the model. Feature indices 40-49 are
  # declared as categorical with 2 categories each.
  t0 = time()
  rf_model = RandomForest.trainClassifier(train_data, 2, categoricalFeaturesInfo={40:2, 41:2, 42:2, 43:2, 44:2, 45:2, 46:2, 47:2, 48:2, 49:2}, numTrees=200, featureSubsetStrategy="auto", impurity='gini', maxDepth=10, maxBins=100)
  trt = time() - t0

  # The model scores an RDD of feature vectors; the resulting predictions are
  # zipped back with the labels. Both steps are lazy, so the test timer
  # below includes the prediction work.
  predictions = rf_model.predict(test_data.map(lambda x: x.features))
  labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)

  t0 = time()
  # A single Spark job tallies every distinct (label, prediction) pair, so the
  # test set is scored once instead of once per confusion-matrix cell.
  pair_counts = labelsAndPredictions.countByValue()
  # Keys are (label, prediction); 0/1 compare and hash equal to 0.0/1.0,
  # so the lookups match whether Spark returns ints or floats.
  fp = float(pair_counts.get((0, 1), 0))
  fn = float(pair_counts.get((1, 0), 0))
  tp = float(pair_counts.get((1, 1), 0))
  tn = float(pair_counts.get((0, 0), 0))

  # Every ratio falls back to 0.0 when its denominator is zero (for example
  # when the model never predicts the positive class) instead of raising
  # ZeroDivisionError and aborting the whole benchmark.
  total = tp + tn + fp + fn
  test_accuracy = (tp + tn) / total if total else 0.0
  test_recall = tp / (tp + fn) if (tp + fn) else 0.0
  test_precision = tp / (fp + tp) if (fp + tp) else 0.0
  pr_sum = test_precision + test_recall
  test_f1 = 2 * test_precision * test_recall / pr_sum if pr_sum else 0.0
  tst = time() - t0
  random_forest_results.append((round(trt,3), round(tst,3), round(test_accuracy,4), round(test_recall,4), round(test_precision,4), round(test_f1,4)))

In [6]:
# Benchmark: train and evaluate a gradient-boosted-trees classifier 10 times.
# Each entry appended to the list is the tuple
#   (train seconds, test seconds, accuracy, recall, precision, F1)
# with times rounded to 3 decimals and metrics to 4.
gradient_boosted_trees_results = []
for _ in range(10):
  # Wall-clock time spent fitting the model. Feature indices 40-49 are
  # declared as categorical with 2 categories each.
  t0 = time()
  gbt_model = GradientBoostedTrees.trainClassifier(train_data, categoricalFeaturesInfo={40:2, 41:2, 42:2, 43:2, 44:2, 45:2, 46:2, 47:2, 48:2, 49:2}, numIterations=3)
  trt = time() - t0

  # The model scores an RDD of feature vectors; the resulting predictions are
  # zipped back with the labels. Both steps are lazy, so the test timer
  # below includes the prediction work.
  predictions = gbt_model.predict(test_data.map(lambda x: x.features))
  labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)

  t0 = time()
  # A single Spark job tallies every distinct (label, prediction) pair, so the
  # test set is scored once instead of once per confusion-matrix cell.
  pair_counts = labelsAndPredictions.countByValue()
  # Keys are (label, prediction); 0/1 compare and hash equal to 0.0/1.0,
  # so the lookups match whether Spark returns ints or floats.
  fp = float(pair_counts.get((0, 1), 0))
  fn = float(pair_counts.get((1, 0), 0))
  tp = float(pair_counts.get((1, 1), 0))
  tn = float(pair_counts.get((0, 0), 0))

  # Every ratio falls back to 0.0 when its denominator is zero (for example
  # when the model never predicts the positive class) instead of raising
  # ZeroDivisionError and aborting the whole benchmark.
  total = tp + tn + fp + fn
  test_accuracy = (tp + tn) / total if total else 0.0
  test_recall = tp / (tp + fn) if (tp + fn) else 0.0
  test_precision = tp / (fp + tp) if (fp + tp) else 0.0
  pr_sum = test_precision + test_recall
  test_f1 = 2 * test_precision * test_recall / pr_sum if pr_sum else 0.0
  tst = time() - t0
  gradient_boosted_trees_results.append((round(trt,3), round(tst,3), round(test_accuracy,4), round(test_recall,4), round(test_precision,4), round(test_f1,4)))

In [7]:
# Show the logistic-regression runs; each row is
# (train s, test s, accuracy, recall, precision, F1).
# display() is called for its rendering side effect, so its return value is not printed.
display(logistic_regression_results)

In [8]:
# Show the random-forest runs; each row is
# (train s, test s, accuracy, recall, precision, F1).
# display() is called for its rendering side effect, so its return value is not printed.
display(random_forest_results)

In [9]:
# Show the gradient-boosted-trees runs; each row is
# (train s, test s, accuracy, recall, precision, F1).
# display() is called for its rendering side effect, so its return value is not printed.
display(gradient_boosted_trees_results)