In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

import pandas as pd
import numpy as np
from IPython.display import display

import operator
from collections import defaultdict, Counter
from sklearn import metrics

In [2]:
sqlContext.sql('set spark.sql.shuffle.partitions=300')

DataFrame[key: string, value: string]

In [3]:
data_set = sqlContext.sql("select * from ignite.training_set_all")
data_set = data_set.withColumn('double_label', F.when(F.col('candidate_sales_acct_id') == F.col('truth_sales_acct_id'), 1.0).otherwise(0.0))
data_set = data_set.repartition(300)

In [4]:
data_set.groupby('id').agg({'double_label' :  'sum'}).where(F.col('sum(double_label)') > 0).count()

760365

In [5]:
training_ids, testing_ids = data_set.select(['id']).distinct().randomSplit([0.7, 0.3])
training_set = data_set.join(training_ids, on = 'id').cache()
testing_set = data_set.join(testing_ids, on = 'id')

In [6]:
gbt = GBTRegressor(maxIter=100, maxDepth=15, labelCol = 'double_label')#, cacheNodeIds = True)
gbt_model = gbt.fit(training_set)


In [9]:
predicted = gbt_model.transform(testing_set).cache()

In [24]:
gbt_model.save(sc, "gbt-100-15-924.model")

AttributeError: 'GBTRegressionModel' object has no attribute 'save'

In [21]:
predicted.write.saveAsTable('ignite.some_predictions')

In [21]:
predicted.where(F.col('id') == 17179881600).where(F.col('prediction') > 0.01).take(10)

[Row(id=17179881600, candidate_sales_acct_id=203685055.0, truth_sales_acct_id=203785565.0, features=SparseVector(924, {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0, 4: 1.0, 5: 0.0, 6: 1.0, 7: 1.0, 8: 0.0, 9: 1.0, 10: 1.0, 11: 0.0, 12: 1.0, 13: 1.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 27.0, 24: 30.0, 25: 0.0, 26: 27.0, 27: 30.0, 28: 0.0, 29: 27.0, 30: 30.0, 31: 0.0, 32: 27.0, 33: 30.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: 0.0, 39: 0.0, 40: 0.0, 41: 3.0, 42: 0.4, 43: 0.2, 44: 0.3333, 45: 0.0, 46: 0.3333, 47: 41.0, 48: 2.3138, 49: 4376.0, 50: 41067.5949, 51: 0.5489, 52: 0.0002, 53: 0.5257, 54: 3.7842, 55: 56.0, 56: 2.1868, 57: 4275.0, 58: 38814.2303, 59: 0.539, 60: 0.0001, 61: 0.3706, 62: 3.5146, 63: 72696.0, 64: 0.1107, 65: 1019.0, 66: 1964.9866, 67: 0.0104, 68: 0.0001, 69: 0.2881, 70: 0.9511, 127: 0.0, 128: 1.0, 129: 4.0, 130: 41.0, 131: 2.3138, 132: 4376.0, 133: 41067.5949, 134: 0.5489, 135: 0.0002, 136: 0.5257, 137: 3.7842, 138: 56.0, 139: 2.1868,

In [10]:
def make_vector(prediction):
    return Vectors.dense([1 - prediction, prediction])

predicted_format = predicted.withColumn('raw_prediction', F.udf(make_vector, VectorUDT())(F.col('prediction')))
BinaryClassificationEvaluator(rawPredictionCol = 'raw_prediction', labelCol = 'double_label').evaluate(predicted_format)

0.993247715536339

In [None]:
predicted_format.take(1)

In [11]:
def k_highest(row_id, candidate_pairs, k = 5):
    candidate_pairs.sort(key = lambda x : x[0], reverse = True)
    num_candidates = len(candidate_pairs)
    
    correct_index = k
    for i, pair in enumerate(candidate_pairs):
        if pair[1] == 1:
            correct_index = i
    
    if correct_index >= k:
        correct_index = k
    
    for i in range(k):
        candidate_pairs.append([0, 0])
        
    elems = []
    for tup in candidate_pairs[:k]:
        elems.append(tup[0])
    
    features = Vectors.dense(elems + [num_candidates])
    label = correct_index
    
    return Row(id = row_id, features = features, label = float(label))

max_ent_final = predicted.map(lambda x : (x.id, [(x.prediction, x.double_label)])).reduceByKey(lambda x, y : x + y).map(lambda x : k_highest(x[0], x[1])).toDF().cache()

In [12]:
label_indexer = StringIndexer(inputCol = 'label', outputCol = 'indexed_label').fit(max_ent_final)
max_ent_indexed = label_indexer.transform(max_ent_final)

In [25]:
max_ent_indexed.where(F.col('label') > 0).where(F.col('label') < 5).take(50)

[Row(features=DenseVector([0.0745, 0.0728, 0.072, 0.0716, 0.0707, 55.0]), id=180388634700, label=2.0, indexed_label=3.0),
 Row(features=DenseVector([0.5331, 0.2128, 0.1804, 0.1681, 0.1057, 18.0]), id=309237647700, label=3.0, indexed_label=4.0),
 Row(features=DenseVector([0.084, 0.084, 0.0814, 0.0767, 0.0677, 73.0]), id=214748381400, label=2.0, indexed_label=3.0),
 Row(features=DenseVector([0.0575, 0.0435, 0.019, 0.019, 0.0114, 324.0]), id=326417521800, label=1.0, indexed_label=2.0),
 Row(features=DenseVector([0.3202, 0.3049, 0.1806, 0.1759, 0.1436, 20.0]), id=352187322600, label=3.0, indexed_label=4.0),
 Row(features=DenseVector([0.1095, 0.1032, 0.0787, 0.0779, 0.0706, 68.0]), id=146028901200, label=4.0, indexed_label=5.0),
 Row(features=DenseVector([0.5728, 0.3371, 0.0116, 0.0036, 0.003, 16.0]), id=171798708900, label=1.0, indexed_label=2.0),
 Row(features=DenseVector([0.4635, 0.107, 0.101, 0.082, 0.0551, 61.0]), id=68719486500, label=1.0, indexed_label=2.0),
 Row(features=DenseVector

In [13]:
final_train, final_test = max_ent_indexed.randomSplit([0.7, 0.3])

In [None]:
final_train.take(1)

In [14]:
output = RandomForestClassifier(featuresCol = 'features', labelCol = 'indexed_label').fit(final_train).transform(final_test).cache()

In [None]:
output.take(5)

In [15]:
output.count()

74894

In [16]:
manual_label = output.where(F.col('prediction') == 1).cache()
manual_label.count()

15161

In [17]:
attempt = output.where(F.col('prediction') != 1)
attempt.where(F.col('prediction') == F.col('indexed_label')).count()

55952

In [18]:
attempt.where(F.col('prediction') != F.col('indexed_label')).count()

3781

In [21]:
15635. / 74601

0.20958164099676949

In [22]:
55094.0 / 74601

0.7385155694963874

In [24]:
3928.0 / 74601

0.05265344968566105

In [20]:
55952.0/74894

0.7470825433279035