In [1]:
from collections import defaultdict, Counter
from datetime import datetime
from tempfile import NamedTemporaryFile

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import metrics

from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark.sql import Row

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest

In [2]:
# Load the two contest tables through sqlContext.sql, repartition each into
# 100 partitions and mark them for caching.
# NOTE(review): `sqlContext` is not defined in any visible cell; presumably it
# is injected by the PySpark kernel/shell. Confirm before a fresh-kernel run.
savm = sqlContext.sql("select * from edso_ignite.contest_savm").repartition(100).cache()
cr = sqlContext.sql("select * from edso_ignite.contest_cr").repartition(100).cache()

In [3]:
#TODO optimize for only 1 select
def apply_function(df, fields, function):
    """Transform selected columns of `df` in one `select` call.

    df: object exposing `.columns` (iterable of column names) and
        `.select(list)`.
    fields: collection of column names that should be transformed.
    function: callable that receives a column name and returns the select
        expression to use in its place.

    Columns not listed in `fields` are passed through by name, in their
    original position. Returns whatever `df.select` returns.
    """
    projection = [function(name) if name in fields else name for name in df.columns]
    return df.select(projection)

# Reference "now" that parse_datetime subtracts parsed timestamps from.
# Uses the standard-library datetime class directly: the `pd.datetime` alias
# is deprecated and no longer exists in recent pandas releases.
max_date = datetime.today()

#inputs to apply_function
def function_binarize(null_value):
    """Build a column transformer for use with apply_function.

    The returned callable takes a column name and yields an expression that
    is 0 where the column equals `null_value` and 1 otherwise, aliased back
    to the same column name.
    """
    return lambda column: F.when(F.col(column) == null_value, 0).otherwise(1).alias(column)

def function_binarize_nulls():
    """Build a column transformer for use with apply_function.

    The returned callable takes a column name and yields an expression that
    is 0 where the column is null and 1 otherwise, aliased back to the same
    column name.
    """
    def inner(column):
        is_missing = F.col(column).isNull()
        return F.when(is_missing, 0).otherwise(1).alias(column)
    return inner

def parse_datetime():
    """Build a column transformer that converts timestamp strings to an age in days.

    The returned callable takes a column name and yields a UDF expression
    (IntegerType), aliased back to the same column name. For each value:

    * text in the form 'YYYY-MM-DD HH:MM:SS.0' becomes the whole number of
      days between that timestamp and the module-level `max_date`;
    * any other value (including None) is handed back unchanged.

    A text value that does not match the format raises ValueError from
    `datetime.strptime`.
    """
    # The sample rows in this notebook show string columns arriving as Python 2
    # `unicode`, which is not an instance of `str`; checking only `str` would
    # skip every date. On Python 3 both entries are simply `str`.
    text_types = (str, type(u''))

    def inner(column):
        def parse(d):
            if isinstance(d, text_types):
                return (max_date - datetime.strptime(d, '%Y-%m-%d %H:%M:%S.0')).days
            return d
        return F.udf(parse, types.IntegerType())(F.col(column)).alias(column)
    return inner

def function_replace_empty_strings(value):
    """Build a column transformer for use with apply_function.

    The returned callable takes a column name and yields an expression that
    substitutes `value` wherever the column's length is 0 and keeps the
    original value otherwise, aliased back to the same column name.
    """
    def inner(column):
        current = F.col(column)
        is_empty = F.length(F.col(column)) == 0
        return F.when(is_empty, value).otherwise(current).alias(column)
    return inner

def function_replace_nulls(value):
    """Build a column transformer for use with apply_function.

    The returned callable takes a column name and yields an expression that
    substitutes `value` wherever the column is null and keeps the original
    value otherwise, aliased back to the same column name.
    """
    def inner(column):
        current = F.col(column)
        is_missing = F.col(column).isNull()
        return F.when(is_missing, value).otherwise(current).alias(column)
    return inner

def function_replace_nans(value):
    """Build a column transformer for use with apply_function.

    The returned callable takes a column name and yields an expression that
    substitutes `value` wherever the column compares equal to `np.nan` and
    keeps the original value otherwise, aliased back to the same column name.
    """
    def inner(column):
        current = F.col(column)
        # The equality is evaluated by Spark as a column expression, not by
        # Python/numpy on the driver.
        is_nan = F.col(column) == np.nan
        return F.when(is_nan, value).otherwise(current).alias(column)
    return inner

#because multidrop in pyspark is ugly
def drop_columns(df, columns):
    """Return `df` projected onto every column whose name is not in `columns`.

    df: object exposing `.columns` and `.select(list)`.
    columns: collection of column names to leave out; names that are not
        present in `df` are ignored.
    """
    kept = []
    for name in df.columns:
        if name not in columns:
            kept.append(name)
    return df.select(kept)

class FrequencyThreshold:
    """Collapse infrequent values of categorical columns into 'OTHER'.

    The frequent items for all requested columns are computed once at
    construction time via `df.freqItems`; `freq_threshold` is then usable as
    the per-column function passed to apply_function.
    """

    def __init__(self, df, columns, support=0.01):
        """
        df: DataFrame to analyse.
        columns: names of the columns to compute frequent items for.
        support: minimum frequency passed to `df.freqItems`; defaults to the
            previously hard-coded 0.01.
        """
        self.freq_items = df.freqItems(columns, support)

    def freq_threshold(self, column):
        """Return an expression keeping frequent values of `column`, else 'OTHER'.

        `column` must be one of the columns given to the constructor. Each
        call collects that column's frequent items to the driver.
        """
        # Go through `.rdd` explicitly: DataFrame.flatMap only exists in
        # Spark 1.x, where it is the same call on the underlying RDD.
        freq_items = self.freq_items.rdd.flatMap(lambda x : x[column+"_freqItems"]).collect()
        return F.when(F.col(column).isin(freq_items), F.col(column)).otherwise('OTHER').alias(column)


In [4]:
# Names of the identifier columns (both appear in the savm schema).
id_fields = ['party_id', 'parent_party_id']

In [5]:
# Bare expression: shows the DataFrame repr so the column names/types can be inspected.
savm

DataFrame[party_id: double, parent_party_id: double, party_name: string, node_type: string, address1: string, address2: string, address3: string, address4: string, city: string, county: string, state: string, province: string, postal_code: string, postal_code_extn: string, country_code: string, street_name: string, street_number: string, street_direction: string, street_type: string, geo_valid_status: string, completenes_status: string, cleansed_status: string, start_date: string, end_date: string, program_id: double, request_id: double, created_by: double, last_updated_by: double, creation_date: string, last_update_date: string, certified_date: string, site_expl_id: double, conflict_batch_id: double, sa_member_id: double, parent_sa_member_id: double, party_level: double, link_party_id: double, link_party_type: string, split_pct: double, sales_acct_id: double, operation_type: string, account_type: string, account_sub_type: string]

In [21]:
# Apply parse_datetime to four of the date columns and persist the result
# as the table 'ignite.savm_parsed'.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# surrounding cells (5, 7) — confirm the notebook still runs top to bottom.
builder = apply_function(savm, ['start_date', 'last_update_date', 'creation_date', 'certified_date'], parse_datetime())
builder.write.saveAsTable('ignite.savm_parsed')

In [7]:
# Peek at the first two rows.
# NOTE(review): `filtered` is not defined in any visible cell — it presumably
# comes from a deleted or out-of-view cell; this fails on a fresh kernel unless
# it is defined earlier.
filtered.take(2)

[Row(party_id=1054733.0, parent_party_id=42499.0, party_name=u'PIZZA HUT, INC.', node_type=u'BR', address1=u'1821 SAN MARCO RD', address2=u'', address3=u'', address4=u'', city=u'MARCO ISLAND', county=u'COLLIER', state=u'FL', province=None, postal_code=u'34145', postal_code_extn=u'6722', country_code=u'US', street_name=u'SAN MARCO', street_number=u'1821', street_direction=None, street_type=u'RD', geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2002-11-12 05:01:15.0', end_date=u'4712-12-31 00:00:00.0', program_id=44449.0, request_id=39462081.0, created_by=1116.0, last_updated_by=305153.0, creation_date=u'2002-11-12 05:01:15.0', last_update_date=u'2010-01-13 22:48:41.0', certified_date=u'2010-01-13 22:48:41.0', site_expl_id=479700768.0, conflict_batch_id=87615930.0, sa_member_id=3755080.0, parent_sa_member_id=-1.0, party_level=2.0, link_party_id=42499.0, link_party_type=u'ORGANIZATION', split_pct=50.0, sales_acct_id=252629163.0, ope

In [8]:
def add_sparse(vectors):
    """Element-wise sum of a non-empty sequence of sparse vectors.

    vectors: indexable sequence of objects exposing `.indices`, `.values`
        and `.size`; the result takes its size from the first vector.

    Returns a sparse vector built with `Vectors.sparse`. Raises IndexError
    when `vectors` is empty.
    """
    totals = defaultdict(float)  # missing indices start at 0.0
    for vec in vectors:
        for position in range(vec.indices.size):
            idx = vec.indices[position]
            totals[idx] += vec.values[position]
    size = vectors[0].size
    return Vectors.sparse(size, dict(totals))

def hstack_sparse(sparse_vectors):
    """Concatenate sparse vectors end to end into one sparse vector.

    sparse_vectors: iterable of objects exposing `.indices`, `.values` and
        `.size`.

    Each vector's indices are shifted by the combined size of the vectors
    before it; the result's size is the sum of all sizes.
    """
    stacked = {}
    offset = 0
    for vec in sparse_vectors:
        count = vec.indices.shape[0]
        for position in range(count):
            shifted = vec.indices[position] + offset
            stacked[shifted] = vec.values[position]
        offset += vec.size
    return Vectors.sparse(offset, stacked)

def list_to_sparse(dense):
    """Wrap a dense sequence in a SparseVector of the same length.

    Every position is stored explicitly (zeros included), keyed by its index.
    """
    entries = dict(enumerate(dense))
    return SparseVector(len(dense), entries)

def pad_list(l, length, val = 0):
    """Return a new list: `l` right-padded with `val` up to `length` items.

    l: list to pad (not modified).
    length: target length; when `l` is already at least this long, a copy of
        `l` is returned with nothing appended.
    val: filler value, 0 by default.

    A leftover early `return l, length, val` previously made the padding
    unreachable and returned the arguments as a tuple; it has been removed.
    """
    return l + [val] * (length - len(l))

In [18]:
def featurizer(result_iter):
    """Aggregate all rows of one group into a single feature Row.

    result_iter: non-empty iterable of rows exposing `party_id`,
        `party_name`, `parent_party_id`, `party_vector`, `address_vector`
        and `sales_acct_id`.

    The resulting Row carries the first row's `sales_acct_id`, the list of
    party ids, a Counter of `parent_party_id` occurrences, the summed
    party-name and address vectors (via add_sparse) and the row count.
    """
    rows = list(result_iter)

    party_ids = [record.party_id for record in rows]
    # Gathered alongside the other fields but not part of the returned Row.
    party_names = [record.party_name for record in rows]
    party_hq_counter = Counter(record.parent_party_id for record in rows)
    party_name_vectors = [record.party_vector for record in rows]
    address_vectors = [record.address_vector for record in rows]

    summed_names = add_sparse(party_name_vectors)
    summed_addresses = add_sparse(address_vectors)

    return Row(sales_acct_id = rows[0].sales_acct_id,
               party_ids = party_ids,
               party_hq_counter = party_hq_counter,
               party_name_vector = summed_names,
               address_vector = summed_addresses,
               num_parties = len(rows))

In [20]:
# Keep only rows for one sales account, key each row by sales_acct_id, group
# the rows per key, collapse every group with featurizer, and write the result
# as a pickle file.
# NOTE(review): the hard-coded sales_acct_id looks like a debugging filter —
# confirm whether the full data set should be processed instead.
# NOTE(review): `filtered` is not defined in any visible cell, and featurizer
# reads `party_vector` / `address_vector`, which are not in the savm schema
# shown above — presumably added by a step not visible here; verify.
filtered.where(F.col('sales_acct_id') == 203700707.0).map(lambda x : (x.sales_acct_id, x)).groupByKey().mapValues(featurizer) \
                    .saveAsPickleFile('savm_vectorized_4.pkl')