In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

from datetime import datetime
import pandas as pd
import numpy as np

In [4]:
# Set the Spark SQL shuffle partition count to 100 for this session.
sqlContext.sql("set spark.sql.shuffle.partitions = 100")

DataFrame[key: string, value: string]

In [3]:
# List the tables registered in the `ignite` database.
sqlContext.sql("show tables in ignite").collect()

[Row(tableName=u'all_hashed', isTemporary=False),
 Row(tableName=u'contest_cr_compressed', isTemporary=False),
 Row(tableName=u'contest_data_sample', isTemporary=False),
 Row(tableName=u'contest_data_stamped', isTemporary=False),
 Row(tableName=u'contest_savm_compressed', isTemporary=False),
 Row(tableName=u'contest_sol_end_customer_compressed', isTemporary=False),
 Row(tableName=u'cr_hashed', isTemporary=False),
 Row(tableName=u'lsh_pairs', isTemporary=False),
 Row(tableName=u'lsh_savm_candidates', isTemporary=False),
 Row(tableName=u'savm_candidate_gen', isTemporary=False),
 Row(tableName=u'savm_hashed', isTemporary=False),
 Row(tableName=u'string_to_string', isTemporary=False),
 Row(tableName=u'test', isTemporary=False)]

In [6]:
def _load_cached(query, *repartition_args):
    """Run `query` via sqlContext, optionally repartition, and mark the frame cached.

    Any extra positional arguments are forwarded unchanged to
    `DataFrame.repartition`; when none are given the frame is not repartitioned.
    """
    frame = sqlContext.sql(query)
    if repartition_args:
        frame = frame.repartition(*repartition_args)
    return frame.cache()


# Source tables used by the rest of the notebook.
contest_data = _load_cached("select * from ignite.contest_data_stamped")
so_end_customer = _load_cached('select * from edso_ignite.contest_so_end_customer', 200)
sol_end_customer = _load_cached('select * from edso_ignite.contest_sol_end_customer', 200)
party_info = _load_cached("select * from edso_ignite.contest_party_info", 200)
end_customer = _load_cached('select * from edso_ignite.contest_end_customer', 200)
cr_parsed = _load_cached('select * from ignite.cr_parsed', 200)
# savm_parsed is additionally partitioned by its party_id column.
savm_parsed = _load_cached("select * from ignite.savm_parsed", 200, 'party_id')

In [4]:
# Peek at a single row to see which columns contest_data carries.
contest_data.take(1)

[Row(endcustomerlinefixed=u'CALPINE', pppk=113405770, fppk=677259, decision_date_time=u'2015-07-03 05:48:05.0', days_from_create_to_final_decision=20.208333, end_customer_party_ssot_party_id_int_sav_party_id=184948898, prior_party_ssot_party_id_int_sav_party_id=184948898, final_party_ssot_party_id_int_sav_party_id=2649467, type=u'Reviewed_and_modified', priorpartyname=u'CALPINE', finalpartyname=u'CALPINE CORPORATION', sales_order_key=23867682, sales_order_line_key=235827955, so_number_int=100729935, end_customer_key_line_level=52302581, end_customer_key_order_level=14654135, dd_end_customer_type_order_level=u'ACCOUNT_LOCATION                                  ', purchase_order_type_code=u'Resale', purchase_order_number=u'38504904-01', order_datetime=u'2015-06-13 00:05:58.0', oracle_book_datetime=u'2015-06-12 17:38:00.0', transactional_currency_code=u'USD            ', customer_service_rep_name=u'null', cust_svc_rep_team_name=u'null', conversion_type_code=u'Corporate', original_system_re

In [5]:
# Party lookup keyed for the join against the contest data's `pppk` column:
# party_key -> pppk_party_key, party_ssot_party_id_int -> pppk_party.
pppk_parties = party_info.select(
    F.col('party_key').alias('pppk_party_key'),
    F.col('party_ssot_party_id_int').alias('pppk_party')
)

def drop_columns(df, columns):
    """Return `df` projected onto every column whose name is not in `columns`.

    Column order follows `df.columns`.
    """
    kept = []
    for name in df.columns:
        if name not in columns:
            kept.append(name)
    return df.select(kept)

def alias_end_customer(prefix, key_alias = 'end_customer_key'):
    """Project `end_customer` onto its key and three party-id columns, renamed.

    The key column is exposed as `key_alias`; the branch, gu and hq party ids
    are exposed as `<prefix>_branch_party`, `<prefix>_gu_party` and
    `<prefix>_hq_party`, in that order.
    """
    party_columns = [
        ('branch_party_ssot_party_id_int', '_branch_party'),
        ('gu_party_ssot_party_id_int', '_gu_party'),
        ('hq_party_ssot_party_id_int', '_hq_party'),
    ]
    selected = [F.col('end_customer_key').alias(key_alias)]
    for source_name, suffix in party_columns:
        selected.append(F.col(source_name).alias(prefix + suffix))
    return end_customer.select(selected)

def apply_function(df, fields, function):
    """Select every column of `df`, replacing those named in `fields`.

    For each column name in `df.columns` (in order), the selection uses
    `function(name)` when the name is in `fields`, and the bare name otherwise.
    """
    return df.select([
        function(name) if name in fields else name
        for name in df.columns
    ])

# Timestamp of "now" at cell execution time. Uses the stdlib `datetime` class
# imported at the top of the notebook; the `pd.datetime` alias was deprecated
# in pandas 1.0 and removed in later releases.
max_date = datetime.today()

def parse_datetime(column):
    """Build a Column that parses the string timestamps in `column`.

    Values are parsed with the format '%Y-%m-%d %H:%M:%S.0' through a UDF
    declared with a DateType return type; the resulting Column is aliased back
    to `column`. Null inputs are passed through as null instead of raising.
    """
    def _parse(value):
        # Null cells arrive as None, and strptime(None, ...) raises TypeError,
        # which would fail the whole Spark job.
        if value is None:
            return None
        return datetime.strptime(value, '%Y-%m-%d %H:%M:%S.0')

    return F.udf(_parse, DateType())(F.col(column)).alias(column)

def clean_text(name):
    """Normalize a free-text name for comparison.

    Replaces '-', ',', '/', '(' and ')' with spaces, turns '. ' into a single
    space, lower-cases the text and collapses all runs of whitespace into
    single spaces (also trimming both ends).

    Returns None when `name` is None, so null cells pass through a UDF
    instead of raising AttributeError.
    """
    if name is None:
        return None
    for separator in ("-", ",", "/", "(", ")"):
        name = name.replace(separator, " ")
    # Only a period followed by a space is dropped; "a.b" keeps its dot.
    name = name.replace('. ', " ").lower()
    return ' '.join(name.split())

def udf_clean_test(column):
    """Build a Column applying `clean_text` to `column` as a string UDF.

    The resulting Column is aliased to `column`.
    """
    cleaner = F.udf(lambda value : clean_text(value), StringType())
    cleaned = cleaner(F.col(column))
    return cleaned.alias(column)


# Sales-order level: attach the `so_*` party ids to each sales_order_key.
so_party = (
    so_end_customer
    .select('sales_order_key', F.col('end_customer_key').alias('end_customer_key_so_party'))
    .join(alias_end_customer('so', 'end_customer_key_so_party'), on='end_customer_key_so_party')
)

# Sales-order-line level: attach the `sol_*` party ids to each sales_order_line_key.
sol_party = (
    sol_end_customer
    .select('sales_order_line_key', F.col('end_customer_key').alias('end_customer_key_sol_party'))
    .join(alias_end_customer('sol', 'end_customer_key_sol_party'), on='end_customer_key_sol_party')
)

# Parse decision_date_time, then add cleaned copies of the two name columns.
builder = (
    apply_function(contest_data, ['decision_date_time'], parse_datetime)
    .withColumn('end_customer_line_fix', udf_clean_test('endcustomerlinefixed'))
    .withColumn('prior_party_name', udf_clean_test('priorpartyname'))
)

# Left-join every party lookup onto the contest rows: the pppk party, the
# order / order-line parties, and one aliased end_customer projection per
# customer-key column.
party_joined = (
    builder
    .join(pppk_parties, on=F.col('pppk_party_key') == F.col('pppk'), how='left')
    .join(so_party, on='sales_order_key', how='left')
    .join(sol_party, on='sales_order_line_key', how='left')
    .join(alias_end_customer('order_level', 'end_customer_key_order_level'), on='end_customer_key_order_level', how='left')
    .join(alias_end_customer('line_level', 'end_customer_key_line_level'), on='end_customer_key_line_level', how='left')
    .join(alias_end_customer('ship_to', 'ship_to_customer_key'), on='ship_to_customer_key', how='left')
    .join(alias_end_customer('bill_to', 'bill_to_customer_key'), on='bill_to_customer_key', how='left')
    .join(alias_end_customer('sold_to', 'sold_to_customer_key'), on='sold_to_customer_key', how='left')
)

In [6]:
# Mark the joined frame as cached so later actions can reuse it.
party_joined = party_joined.cache()

In [7]:
# Party-id columns of interest: the two ids that come with the contest data,
# followed by the branch / gu / hq party of each joined customer lookup.
# NOTE(review): the `so_*` columns produced by so_party are not listed here -
# confirm that is intentional.
party_values = [
    'end_customer_party_ssot_party_id_int_sav_party_id',
    'prior_party_ssot_party_id_int_sav_party_id',
] + list(
    prefix + '_' + level + '_party'
    for prefix in ('sol', 'order_level', 'line_level', 'ship_to', 'bill_to', 'sold_to')
    for level in ('branch', 'gu', 'hq')
)

In [21]:
# Persist the joined frame as ignite.party_expansion.
# Any previous copy is dropped BEFORE writing, so this cell can be re-run
# (saveAsTable with the default save mode fails if the table already exists).
# The result of the DROP statement is deliberately not assigned to
# party_joined, and the table is left in place for the cells that read it.
sqlContext.sql('drop table if exists ignite.party_expansion')
party_joined.write.saveAsTable('ignite.party_expansion')

In [4]:
# Read the persisted table back as a DataFrame.
party_expansion = sqlContext.sql('select * from ignite.party_expansion')

In [5]:
# Count `id` values per sales_acct_id and show up to 100 of the groups.
party_expansion.groupby('sales_acct_id').agg({'id' : 'count'}).take(100)

[Row(sales_acct_id=203781069.0, count(id)=5),
 Row(sales_acct_id=203739089.0, count(id)=9),
 Row(sales_acct_id=203811794.0, count(id)=53),
 Row(sales_acct_id=203901909.0, count(id)=74),
 Row(sales_acct_id=203686898.0, count(id)=2),
 Row(sales_acct_id=207742953.0, count(id)=3),
 Row(sales_acct_id=203719619.0, count(id)=902),
 Row(sales_acct_id=203897828.0, count(id)=16),
 Row(sales_acct_id=203778995.0, count(id)=44),
 Row(sales_acct_id=203901870.0, count(id)=4),
 Row(sales_acct_id=203851661.0, count(id)=80),
 Row(sales_acct_id=203706304.0, count(id)=117),
 Row(sales_acct_id=203779034.0, count(id)=2),
 Row(sales_acct_id=203795336.0, count(id)=5),
 Row(sales_acct_id=264618915.0, count(id)=12),
 Row(sales_acct_id=203860881.0, count(id)=6),
 Row(sales_acct_id=203799431.0, count(id)=36),
 Row(sales_acct_id=203729830.0, count(id)=30),
 Row(sales_acct_id=208240541.0, count(id)=3),
 Row(sales_acct_id=203774953.0, count(id)=365),
 Row(sales_acct_id=203885543.0, count(id)=1),
 Row(sales_acct_id=2

In [7]:
# Inspect up to 10 savm_parsed rows for sales_acct_id 203719619, the account
# that showed a count of 902 in the aggregation output above.
savm_parsed.where(F.col('sales_acct_id') == 203719619.0).take(10)

[Row(party_id=42480121.0, parent_party_id=4235570.0, party_name=u'AGILENT TECHNOLOGIES', node_type=u'BR', address1=u'5301 EL CAMINO REAL', address2=None, address3=None, address4=None, city=u'SANTA CLARA', county=u'SANTA CLARA', state=u'CA', province=None, postal_code=u'95053', postal_code_extn=None, country_code=u'US', street_name=u'EL CAMINO REAL', street_number=u'5301', street_direction=None, street_type=None, geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2007-01-16 05:37:20.0', end_date=u'4712-12-31 00:00:01.0', program_id=None, request_id=None, created_by=307954.0, last_updated_by=307954.0, creation_date=datetime.date(2007, 1, 16), last_update_date=datetime.date(2007, 1, 16), certified_date=u'2007-01-16 05:37:20.0', site_expl_id=441380052.0, conflict_batch_id=83453461.0, sa_member_id=470534.0, parent_sa_member_id=6088.0, party_level=3.0, link_party_id=203666453.0, link_party_type=u'TEMPLATE', split_pct=100.0, sales_acct_id=