In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
from IPython.display import display

import operator
from collections import defaultdict, Counter
from sklearn import metrics

In [4]:
# Load the source tables from the `ignite` database as DataFrames and mark
# each one for caching.
topic_modeling_words = sqlContext.sql(
    'select * from ignite.topic_modeling_per_word'
).cache()

# This table is additionally repartitioned into 1000 partitions before caching.
topic_modeling_savm = (
    sqlContext.sql('select * from ignite.topic_modeling_savm_tfidf')
    .repartition(1000)
    .cache()
)

savm_parsed = sqlContext.sql(
    'select * from ignite.savm_parsed'
).cache()

cr_parsed = sqlContext.sql(
    'select * from ignite.cr_parsed'
).cache()

party_expansion = sqlContext.sql(
    'select * from ignite.party_expansion'
).cache()

In [22]:
# Peek at the first 10 rows of the ignite.some_predictions table.
sqlContext.sql('select * from ignite.some_predictions').take(10)

[Row(id=334, candidate_sales_acct_id=253363314.0, truth_sales_acct_id=203688374.0, features=SparseVector(882, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 1.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: 0.0, 39: 1.0, 40: 0.0, 41: 1.0, 42: 0.4, 43: 0.4, 44: 0.0, 45: 0.4, 46: 0.0, 47: 62.0, 48: 1.891, 49: 147.0, 50: 1319.9525, 51: 0.3794, 52: 0.0, 53: 0.8409, 54: 4.4896, 55: 19782.0, 56: 0.6756, 57: 146.0, 58: 471.5545, 59: 0.0194, 60: 0.0, 61: 0.4532, 62: 1.5447, 127: 0.0, 128: 1.0, 129: 3.0, 877: 0.0, 878: 0.0, 879: 29.0, 880: 1.0, 881: 100.0}), double_label=0.0, prediction=0.0004913800862519566),
 Row(id=334, candidate_sales_acct_id=254694421.0, truth_sales_acct_id=203688374.0, features=SparseVector(882, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 

In [68]:
# Bare DataFrame expression: the cell output is the table's schema repr.
sqlContext.sql('select * from ignite.lsh_savm_candidates_2')

DataFrame[id: bigint, party_id_candidate: double, candidate_sales_acct_id: double]

In [3]:
# Pull every per-word row to the driver, then build two lookups over the same
# rows: one keyed by the `word` field and one keyed by the `index` field.
# `index` is read with item access, presumably because `.index` on a Row
# resolves to the tuple method rather than the field -- confirm.
local_topic_modeling_words = topic_modeling_words.collect()
topic_modeling_dict = dict(
    (entry.word, entry) for entry in local_topic_modeling_words
)
topic_modeling_dict_by_index = dict(
    (entry['index'], entry) for entry in local_topic_modeling_words
)

# Broadcast both lookups through the SparkContext.
topic_modeling_words_broadcast = sc.broadcast(topic_modeling_dict)
topic_modeling_words_by_index_broadcast = sc.broadcast(topic_modeling_dict_by_index)

In [123]:
# Spot-check the word-keyed broadcast lookup for a single word.
topic_modeling_words_broadcast.value['company']

Row(index=13, norm_max=0.017447402292281856, min_tfidf=2.692710480765196e-05, avg_tfidf=0.2625945258429515, max_tfidf=1.315892991386627, count_docs=64566, word=u'company')

In [4]:
def term_buckets(row):
    """Return one (bigram, party_id) pair per adjacent token pair of a name.

    Args:
        row: object exposing `tokenized_name` (a sequence of string tokens,
            or None) and `party_id`.

    Returns:
        A list of `(bigram, party_id)` tuples, where `bigram` is two
        consecutive tokens joined by a single space. The list is empty when
        `tokenized_name` is None or holds fewer than two tokens.
    """
    tokens = row.tokenized_name
    # A null name would make len() raise and fail the whole flatMap task;
    # treat it as "no bigrams" instead.
    if tokens is None:
        return []

    return [
        (' '.join(tokens[i : i + 2]), row.party_id)
        for i in range(len(tokens) - 1)
    ]

# Explode both parsed name tables into (bigram, party id) rows.
# flatMap is called on `.rdd` explicitly: DataFrame.flatMap only exists in the
# Spark 1.x Python API, whereas `.rdd.flatMap` behaves the same there and also
# works on Spark 2.x.
savm_bucketed = savm_parsed.rdd.flatMap(term_buckets).toDF(['bigram', 'savm_party_id']).cache()
cr_bucketed = cr_parsed.rdd.flatMap(term_buckets).toDF(['bigram', 'cr_party_id']).cache()

In [5]:
# Count the rows per bigram and flag every bigram that occurs more than 1000
# times. `.rdd.map` replaces DataFrame.map, which only exists in the Spark 1.x
# Python API; the result is the same.
frequent_bigrams = (
    savm_bucketed.rdd
    .map(lambda x: (x.bigram, 1))
    .reduceByKey(lambda x, y: x + y)
    .filter(lambda x: x[1] > 1000)
    .toDF(['bigram', 'frequency'])
    .withColumn('too_frequent', F.lit(1))
)

# Left join + null check keeps only the rows whose bigram was NOT flagged.
# The (all-null) `frequency` and `too_frequent` columns stay in the result.
savm_filtered = (
    savm_bucketed
    .join(frequent_bigrams, on='bigram', how='left')
    .where(F.col('too_frequent').isNull())
    .cache()
)

In [6]:
# Attach a sales account to each surviving bigram by matching its SAVM party id
# against savm_parsed.party_id, then keep a single row per
# (bigram, sales_acct_id) pair.
bigram_sales_accts = (
    savm_filtered
    .join(
        savm_parsed.select(['party_id', 'sales_acct_id']),
        on=F.col('party_id') == F.col('savm_party_id'),
    )
    .drop_duplicates(['bigram', 'sales_acct_id'])
    .cache()
)

In [24]:
# Count sales-account rows per bigram (the aggregate column is named
# 'count(sales_acct_id)'), keep the bigrams with fewer than 20 of them, and
# join the matching bigram_sales_accts rows back on.
important_bigrams = (
    bigram_sales_accts
    .groupby('bigram')
    .agg({'sales_acct_id': 'count'})
    .where(F.col('count(sales_acct_id)') < 20)
    .join(bigram_sales_accts, how='left', on='bigram')
    .cache()
)

In [22]:
# Count the rows per CR bigram and flag every bigram that occurs more than
# 1000 times. `.rdd.map` replaces DataFrame.map, which only exists in the
# Spark 1.x Python API; the result is the same.
frequent_cr_bigrams = (
    cr_bucketed.rdd
    .map(lambda x: (x.bigram, 1))
    .reduceByKey(lambda x, y: x + y)
    .filter(lambda x: x[1] > 1000)
    .toDF(['bigram', 'frequency'])
    .withColumn('too_frequent', F.lit(1))
)

# Left join + null check keeps only the rows whose bigram was NOT flagged.
# The (all-null) `frequency` and `too_frequent` columns stay in the result.
cr_filtered = (
    cr_bucketed
    .join(frequent_cr_bigrams, on='bigram', how='left')
    .where(F.col('too_frequent').isNull())
    .cache()
)

In [18]:
# Inspect the first 10 SAVM bigram rows that survived the frequency filter.
savm_filtered.take(10)

[Row(bigram=u'#61 greater', savm_party_id=6691078.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=41534303.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=165481275.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=89082249.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=116331625.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=41512045.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=153764040.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=184345977.0, frequency=None, too_frequent=None),
 Row(bigram=u'& academies', savm_party_id=41409629.0, frequency=None, too_frequent=None),
 Row(bigram=u'& assocics', savm_party_id=41616010.0, frequency=None, too_frequent=None)]

In [24]:
# Inspect the first 10 CR bigram rows that survived the frequency filter.
cr_filtered.take(10)

[Row(bigram=u'\x08 res', cr_party_id=92266791.0, frequency=None, too_frequent=None),
 Row(bigram=u'\x08 res', cr_party_id=92216265.0, frequency=None, too_frequent=None),
 Row(bigram=u'!\x1a\x18\x19 \x14', cr_party_id=123390637.0, frequency=None, too_frequent=None),
 Row(bigram=u'!\x1a\x18\x19 \x14', cr_party_id=123390816.0, frequency=None, too_frequent=None),
 Row(bigram=u'"am berg"', cr_party_id=39702900.0, frequency=None, too_frequent=None),
 Row(bigram=u'"continent express"', cr_party_id=36399634.0, frequency=None, too_frequent=None),
 Row(bigram=u'"expolab" fortschrittliche', cr_party_id=11211646.0, frequency=None, too_frequent=None),
 Row(bigram=u'"imc322reaseed" units', cr_party_id=32171322.0, frequency=None, too_frequent=None),
 Row(bigram=u'"jb pearl', cr_party_id=124943262.0, frequency=None, too_frequent=None),
 Row(bigram=u'"kamleitner &', cr_party_id=110358871.0, frequency=None, too_frequent=None)]

In [26]:
# Pair every CR party with every SAVM party that shares at least one
# non-frequent bigram, de-duplicate the pairs, and persist them.
# mode='overwrite' makes the cell re-runnable: without it the write fails as
# soon as the table already exists. This matches how ignite.bigram_candidates
# is written elsewhere in this notebook.
(
    cr_filtered
    .join(savm_filtered, on='bigram')
    .select(['cr_party_id', 'savm_party_id'])
    .drop_duplicates()
    .write
    .saveAsTable('ignite.bigram_candidate_gen', mode='overwrite')
)

In [20]:
# Disabled: reload of the candidate pairs stored in ignite.bigram_candidate_gen.
#bigram_candidates = sqlContext.sql('select * from ignite.bigram_candidate_gen')
# Re-bind party_expansion from the table (no explicit .cache() call here).
party_expansion = sqlContext.sql('select * from ignite.party_expansion')

In [21]:
def contest_term_buckets(row):
    """Return (bigram, id) pairs built from both name fields of a row.

    Args:
        row: object exposing `prior_party_name` and `end_customer_line_fix`
            (strings or None) and `id`.
            NOTE(review): a party_expansion row dump elsewhere in this
            notebook shows fields spelled `priorpartyname` and
            `endcustomerlinefixed` -- confirm the attribute names used here
            exist on the input rows.

    Returns:
        A list of `(bigram, id)` tuples covering every pair of adjacent
        tokens of each non-null name, prior party name first. Names are split
        on single spaces only, so consecutive spaces produce empty tokens.
    """
    bigrams = []
    for name in (row.prior_party_name, row.end_customer_line_fix):
        # A null name has no .split(); skip it instead of failing the task.
        if name is None:
            continue
        tokenized = name.split(" ")
        bigrams.extend(
            (' '.join(tokenized[i : i + 2]), row.id)
            for i in range(len(tokenized) - 1)
        )

    return bigrams

# Explode party_expansion into distinct (bigram, id) rows.
# flatMap/map are called on `.rdd` explicitly: the DataFrame versions only
# exist in the Spark 1.x Python API; the result is the same.
party_bigrams = (
    party_expansion.rdd
    .flatMap(contest_term_buckets)
    .toDF(['bigram', 'id'])
    .drop_duplicates()
    .cache()
)

# Flag the bigrams that occur on more than 1000 distinct ids.
# Note: this re-binds the name `frequent_bigrams`.
frequent_bigrams = (
    party_bigrams.rdd
    .map(lambda x: (x.bigram, 1))
    .reduceByKey(lambda x, y: x + y)
    .filter(lambda x: x[1] > 1000)
    .toDF(['bigram', 'frequency'])
    .withColumn('too_frequent', F.lit(1))
)

# Left join + null check keeps only the rows whose bigram was NOT flagged.
party_filtered = (
    party_bigrams
    .join(frequent_bigrams, on='bigram', how='left')
    .where(F.col('too_frequent').isNull())
    .cache()
)

In [36]:
# Inspect the first 5 party bigram rows that survived the frequency filter.
party_filtered.take(5)

[Row(bigram=u'& finance', id=403726926581, frequency=None, too_frequent=None),
 Row(bigram=u'& finance', id=171798720404, frequency=None, too_frequent=None),
 Row(bigram=u'& finance', id=403726926568, frequency=None, too_frequent=None),
 Row(bigram=u'& finance', id=403726926567, frequency=None, too_frequent=None),
 Row(bigram=u'& finance', id=403726926551, frequency=None, too_frequent=None)]

In [69]:
# Match each party bigram against the important SAVM bigrams and keep only the
# rows that actually found a SAVM party.
bigram_parties = (
    party_filtered
    .join(important_bigrams, on='bigram', how='left')
    .select(['id', 'bigram', 'savm_party_id'])
    .where(F.col('savm_party_id').isNotNull())
    .cache()
)

In [70]:
# Look up the sales account of every matched SAVM party (exposed as
# `candidate_sales_acct_id`) and drop the join key taken from savm_parsed.
bigram_candidates = (
    bigram_parties
    .join(
        savm_parsed.select(['party_id', F.col('sales_acct_id').alias('candidate_sales_acct_id')]),
        how='left',
        on=(F.col('savm_party_id') == F.col('party_id')),
    )
    .drop('party_id')
)

In [78]:
# Persist the distinct (id, candidate_party_id, candidate_sales_acct_id)
# triples, replacing any existing ignite.bigram_candidates table.
(
    bigram_candidates
    .select(['id', F.col('savm_party_id').alias('candidate_party_id'), 'candidate_sales_acct_id'])
    .drop_duplicates()
    .write
    .saveAsTable('ignite.bigram_candidates', mode='overwrite')
)

In [59]:
# Keep the rows whose candidate sales account equals the sales account recorded
# in party_expansion for the same id (aliased to `truth_sales_acct_id`).
# NOTE(review): `builder` is not defined in any visible cell -- confirm which
# DataFrame it refers to before running this from a fresh kernel.
correct_per_bigram = builder.join(party_expansion.select(['id', F.col('sales_acct_id').alias('truth_sales_acct_id')]), on = 'id', how = 'left').where(F.col('candidate_sales_acct_id') == F.col('truth_sales_acct_id')).cache()

In [60]:
# Number of distinct (id, candidate_sales_acct_id) pairs among the matching rows.
correct_per_bigram.drop_duplicates(['id', 'candidate_sales_acct_id']).count()

494911

In [90]:
# Load each candidate source and expose its party column under the common
# name `candidate_party_id`.
lsh_candidates = (
    sqlContext.sql('select * from ignite.lsh_savm_candidates_2')
    .withColumn('candidate_party_id', F.col('party_id_candidate'))
)
direct_candidates = (
    sqlContext.sql('select * from ignite.direct_id_candidates')
    .withColumn('candidate_party_id', F.col('candidate_party'))
)
parent_expansion_candidates = (
    sqlContext.sql('select * from ignite.parent_expansion_candidates')
    .withColumn('candidate_party_id', F.col('candidate_party'))
)

# Every source is projected onto the same column list, in the same order,
# before being stacked with unionAll.
columns = ['id', 'candidate_sales_acct_id', 'candidate_party_id']

# Stack all four sources (LSH, direct id, bigram, parent expansion), then
# de-duplicate and cache the combined candidate set.
candidate_gen = (
    lsh_candidates.select(columns)
    .unionAll(direct_candidates.select(columns))
    .unionAll(sqlContext.sql('select * from ignite.bigram_candidates').select(columns))
    .unionAll(parent_expansion_candidates.select(columns))
    .drop_duplicates()
    .cache()
)

In [91]:
# Total number of distinct candidate rows across all sources.
candidate_gen.count()

147515462

In [92]:
# Bare DataFrame expression: the cell output is the combined schema repr.
candidate_gen

DataFrame[id: bigint, candidate_sales_acct_id: double, candidate_party_id: double]

In [95]:
# Distinct (id, account) rows where a generated candidate account equals the
# sales account recorded in party_expansion for the same id.
have_candidate = (
    candidate_gen
    .select(['id', 'candidate_sales_acct_id'])
    .join(
        party_expansion.select(['id', F.col('sales_acct_id').alias('truth_sales_acct_id')]),
        on='id',
    )
    .where(F.col('candidate_sales_acct_id') == F.col('truth_sales_acct_id'))
    .drop_duplicates()
    .cache()
)

In [97]:
# Left-join every party_expansion id against the ids present in have_candidate;
# rows where the marker column stayed null are ids with no matching candidate.
# Show up to 100 of them.
(
    party_expansion
    .select(['id'])
    .join(
        have_candidate.select(['id']).withColumn('have_candidate', F.lit(1)),
        on='id',
        how='left',
    )
    .where(F.col('have_candidate').isNull())
    .take(100)
)

[Row(id=231, have_candidate=None),
 Row(id=3431, have_candidate=None),
 Row(id=4431, have_candidate=None),
 Row(id=6231, have_candidate=None),
 Row(id=6431, have_candidate=None),
 Row(id=8431, have_candidate=None),
 Row(id=8589934621, have_candidate=None),
 Row(id=8589935421, have_candidate=None),
 Row(id=8589940021, have_candidate=None),
 Row(id=8589942021, have_candidate=None),
 Row(id=8589944621, have_candidate=None),
 Row(id=8589945821, have_candidate=None),
 Row(id=17179871611, have_candidate=None),
 Row(id=17179873011, have_candidate=None),
 Row(id=17179874211, have_candidate=None),
 Row(id=17179884611, have_candidate=None),
 Row(id=17179885411, have_candidate=None),
 Row(id=25769804801, have_candidate=None),
 Row(id=25769809201, have_candidate=None),
 Row(id=25769810401, have_candidate=None),
 Row(id=25769810601, have_candidate=None),
 Row(id=25769812801, have_candidate=None),
 Row(id=25769816201, have_candidate=None),
 Row(id=25769817801, have_candidate=None),
 Row(id=257698200

In [103]:
# Inspect the party_expansion row(s) for id 6231, one of the ids listed in the
# previous output as having no matching candidate.
party_expansion.where(F.col('id') == 6231).take(10)

[Row(sold_to_customer_key=62192337, bill_to_customer_key=62207957, ship_to_customer_key=65955721, end_customer_key_line_level=66000140, end_customer_key_order_level=66000140, sales_order_line_key=234635860, sales_order_key=23778618, endcustomerlinefixed=u'QCC-CPEAAS', pppk=165769531, fppk=157560343, decision_date_time=datetime.date(2015, 7, 3), days_from_create_to_final_decision=37.708333, end_customer_party_ssot_party_id_int_sav_party_id=247695596, prior_party_ssot_party_id_int_sav_party_id=247695596, final_party_ssot_party_id_int_sav_party_id=234729982, type=u'Reviewed_and_modified', priorpartyname=u'QCC-CPEAAS', finalpartyname=u'ADVANCED DISPOSAL', so_number_int=100655340, dd_end_customer_type_order_level=u'ACCOUNT_LOCATION                                  ', purchase_order_type_code=u'Resale', purchase_order_number=u'8262642', order_datetime=u'2015-05-26 12:50:11.0', oracle_book_datetime=u'2015-06-05 15:39:17.0', transactional_currency_code=u'USD            ', customer_service_rep_