In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
from IPython.display import display

import operator
from collections import defaultdict, Counter
from sklearn import metrics

In [3]:
# Raise the number of partitions Spark SQL uses for shuffles (joins, aggregations)
# in this session to 5000; the statement returns a key/value DataFrame.
sqlContext.sql("set spark.sql.shuffle.partitions=5000")

DataFrame[key: string, value: string]

In [3]:
# Names of the party-id columns of the contest data.  They are selected from
# ignite.party_expansion_temp when party_expansion is loaded and compared against
# SAVM party ids, one feature slot per name, in featurize_pairwise.
party_values = [
    'end_customer_party_ssot_party_id_int_sav_party_id',
    'prior_party_ssot_party_id_int_sav_party_id',
    'sol_branch_party',
    'sol_gu_party',
    'sol_hq_party',
    'order_level_branch_party',
    'order_level_gu_party',
    'order_level_hq_party',
    'line_level_branch_party',
    'line_level_gu_party',
    'line_level_hq_party',
    'ship_to_branch_party',
    'ship_to_gu_party',
    'ship_to_hq_party',
    'bill_to_branch_party',
    'bill_to_gu_party',
    'bill_to_hq_party',
    'sold_to_branch_party',
    'sold_to_gu_party',
    'sold_to_hq_party'
]

# Address column names.  NOTE(review): not referenced by any other cell visible
# in this notebook -- confirm whether it is still needed.
address_fields = [
    'address1', 'address2', 'address3', 'address4',
    'city', 'county', 'state', 'postal_code', 'street_name',
    'street_number', 'street_direction', 'street_type'
]


In [4]:
# Load the parsed SAVM table into 100 partitions and cache it.
savm_parsed = sqlContext.sql("select * from ignite.savm_parsed").repartition(100).cache()
# Temporary: reads the *_temp copy of the party expansion because the Hadoop disk can't keep up.
# Only the party-id columns plus the id, label and name columns are kept, then cached.
party_expansion = sqlContext.sql("select * from ignite.party_expansion_temp").select(party_values +
        [
            'id', 'sales_acct_id', 'prior_party_name', 'end_customer_line_fix'
        ]).cache()

In [5]:
# Fetch one row of party_expansion to inspect its columns.
party_expansion.take(1)

[Row(end_customer_party_ssot_party_id_int_sav_party_id=217501313, prior_party_ssot_party_id_int_sav_party_id=217501313, sol_branch_party=217501313, sol_gu_party=2346701, sol_hq_party=2346701, order_level_branch_party=217501313, order_level_gu_party=2346701, order_level_hq_party=2346701, line_level_branch_party=217501313, line_level_gu_party=2346701, line_level_hq_party=2346701, ship_to_branch_party=217501313, ship_to_gu_party=2346701, ship_to_hq_party=2346701, bill_to_branch_party=128171, bill_to_gu_party=128171, bill_to_hq_party=128171, sold_to_branch_party=128171, sold_to_gu_party=128171, sold_to_hq_party=128171, id=68719483849, sales_acct_id=203728339.0, prior_party_name=u'j2 global communications', end_customer_line_fix=u'j2 global communications')]

In [8]:
# Load the candidate-generation and topic-modeling tables from the ignite database.
# Every table except direct_candidate_gen is cached; topic_modeling_savm and
# cr_parsed are repartitioned first.
# NOTE(review): lsh_candidate_gen and candidate_match_status are not used by any
# later cell visible in this notebook -- confirm they still need to be cached.
#id_candidate_gen = sqlContext.sql('select * from ignite.direct_id_candidates').unionAll(sqlContext.sql('select * from ignite.parent_expansion_candidates')).repartition(1000).drop_duplicates().cache()
direct_candidate_gen = sqlContext.sql('select * from ignite.direct_id_candidates')
lsh_candidate_gen = sqlContext.sql('select * from ignite.lsh_savm_candidates_2').cache()
candidate_match_status = sqlContext.sql('select * from ignite.candidate_match_status').cache()
topic_modeling_savm = sqlContext.sql('select * from ignite.topic_modeling_savm_tfidf').repartition(1000).cache()
topic_modeling_words = sqlContext.sql('select * from ignite.topic_modeling_per_word').cache()
cr_parsed = sqlContext.sql("select * from ignite.cr_parsed").repartition(500).cache()

In [9]:
# thin out whatever we can
def drop_columns(df, columns):
    """Project ``df`` onto every column whose name is not listed in ``columns``.

    The surviving names keep the order they have in ``df.columns``.  Names in
    ``columns`` that ``df`` does not have are ignored without any error.
    """
    kept = []
    for name in df.columns:
        if name in columns:
            continue
        kept.append(name)
    return df.select(kept)

# Rebind savm_parsed and cr_parsed to projections without the status, audit and
# bookkeeping columns listed below.
# NOTE(review): drop_columns ignores names the frame does not have, so a
# misspelled entry is silently kept in the data; 'completenes_status' looks like
# it could be such a case -- confirm against the table schema.
savm_parsed = drop_columns(savm_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date', 'site_expl_id', 'conflict_batch_id', 'sa_member_id', 'parent_sa_member_id',
                          'link_party_type', 'account_type', 'operation_type'
                    ])
cr_parsed = drop_columns(cr_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date'
                    ])


In [8]:
# Pull the per-word topic-model rows to the driver, index them by word and
# broadcast the dict so executor-side code (savm_topic_score) can look words up.
local_topic_modeling_words = topic_modeling_words.collect()
topic_modeling_dict = {}
for row in local_topic_modeling_words:
    topic_modeling_dict[row.word] = row
    
topic_modeling_words_broadcast = sc.broadcast(topic_modeling_dict)

In [30]:
# Spot-check the broadcast dict: show the stored row for the word 'of'.
topic_modeling_words_broadcast.value['of']

Row(index=5, gini=0.7491159032394821, min_tfidf=0.20874546733691643, avg_tfidf=6.3232739184954445, max_tfidf=4904.20160405366, count_docs=47438, word=u'of')

In [7]:
# Collect every topic_modeling_savm row (all columns except 'words') to the
# driver, index the rows by sales_acct_id and broadcast the dict.  Progress is
# printed once every 100000 rows.
print('Broadcasting SAVM topic modeling')
savm_topic_modeling_dict = {}
for i, row in enumerate(topic_modeling_savm.select([c for c in topic_modeling_savm.columns if c != 'words']).collect()):
    savm_topic_modeling_dict[row.sales_acct_id] = row
    if i % 100000 == 0:
        print(i)
        
savm_topic_bc = sc.broadcast(savm_topic_modeling_dict)

Broadcasting SAVM topic modeling
0
100000
200000
300000
400000
500000


In [12]:
# Fetch one row of topic_modeling_savm to inspect its columns.
topic_modeling_savm.take(1)

[Row(sales_acct_id=203855208.0, words=[u'denver', u'west', u'remediation', u'and', u'constr'], tf=SparseVector(480948, {27: 1.0, 151: 1.0, 1414: 1.0, 10367: 1.0, 32690: 1.0}), idf=SparseVector(480948, {27: 3.2599, 151: 4.9997, 1414: 7.2731, 10367: 8.8183, 32690: 10.232}), normalized_tfidf=SparseVector(480948, {27: 0.5433, 151: 0.8333, 1414: 1.2122, 10367: 1.4697, 32690: 1.7053}))]

In [10]:
# Narrower projection of savm_parsed without the name, street-level and
# hierarchy columns listed below.
# NOTE(review): no later cell visible in this notebook reads savm_to_broadcast,
# while featurize_pairwise reads a savm_parsed_bc broadcast that is not created
# in the visible cells -- confirm how the two relate.
savm_to_broadcast = drop_columns(savm_parsed, [
            'party_name', 'node_type', 'address3', 'address4', 'province', 'party_level', 'link_party_id',
            'account_sub_type', 'street_name', 'street_number', 'street_direction', 'street_type', 
            'postal_code_extn', 'tokenized_name'
        ])

In [None]:
# Collapse savm_parsed to one row per sales account, gathering the values of
# each listed column into a list column.
# collect_list accepts a single column (a name or a Column object), so every
# name is passed as a plain string; wrapping it in a Python list is rejected.
columnized_savm = savm_parsed.groupby('sales_acct_id').agg(*(
    F.collect_list(column_name)
    for column_name in (
        'party_id',
        'parent_party_id',
        'address1',
        'address2',
        'address3',
        'address4',
        'city',
        'state',
        'postal_code',
        'country_code',
        'split_pct',
        'cleaned_name',
    )
))


In [None]:
# Fetch one aggregated row to inspect the collected list columns.
columnized_savm.take(1)

In [None]:
# Look up account 203697034.0 in the topic broadcast -- the same key that the
# recorded featurize_pairwise traceback in this notebook reports as a KeyError.
savm_topic_bc.value[203697034.0]

In [16]:
# Fetch one row of full_savm.
# NOTE(review): full_savm is not assigned in any cell visible in this notebook,
# so this only works with leftover kernel state -- confirm where it is built.
full_savm.take(1)

[Row(sales_acct_id=253365514.0, party_ids=[212106996.0], hq_party_id_keys=[39038692.0], hq_party_id_values=[1], party_names=[u'bottega veneta singapore private limited'], tf=SparseVector(480948, {117: 1.0, 164: 1.0, 344: 1.0, 13800: 1.0, 16606: 1.0}), idf=SparseVector(480948, {117: 4.2422, 164: 4.0043, 344: 5.5017, 13800: 9.596, 16606: 10.4833}), normalized_tfidf=SparseVector(480948, {117: 0.707, 164: 0.6674, 344: 0.9169, 13800: 1.5993, 16606: 1.7472}))]

In [14]:
# Tie each distinct (id, candidate_sales_acct_id) pair to its contest row from
# party_expansion and key the result by that pair (the CR party rows are joined
# on in a later cell).
# DataFrame.map no longer exists from Spark 2.0 on; on 1.x it simply forwarded to
# the underlying RDD, so going through .rdd explicitly behaves the same here and
# keeps working after an upgrade.
elaborated_contest_data_grouped = (
    direct_candidate_gen
    .select(['id', 'candidate_sales_acct_id'])
    .drop_duplicates()
    .join(party_expansion, on='id')
    .rdd
    .map(lambda x: ((x.id, x.candidate_sales_acct_id), x))
    .cache()
)

In [17]:
# Attach the CR party row to every direct candidate (candidate_party == party_id),
# then hash-partition the result by contest id into 2000 partitions and cache it.
candidates_with_cr = direct_candidate_gen.join(cr_parsed, on = F.col('candidate_party') == F.col('party_id')).repartition(2000, 'id').cache()

In [18]:
# Gather the CR party rows of every (id, candidate_sales_acct_id) pair into one list.
# NOTE(review): limited_collect is not defined in any cell visible in this
# notebook; from the call it presumably merges two lists while capping the
# result at 10 rows -- confirm.
# DataFrame.map no longer exists from Spark 2.0 on; on 1.x it simply forwarded to
# the underlying RDD, so going through .rdd explicitly behaves the same here.
candidate_party_set = (
    candidates_with_cr
    .rdd
    .map(lambda x: ((x.id, x.candidate_sales_acct_id), [x]))
    .reduceByKey(lambda x, y: limited_collect(x, y, 10))
    .cache()
)

In [19]:
# Sanity check: both RDDs are keyed by (id, candidate_sales_acct_id), so their
# counts should be equal -- otherwise the join that follows would drop pairs.
print(elaborated_contest_data_grouped.count())
print(candidate_party_set.count())

4443066
4443066


In [35]:
# Fetch one ((id, candidate_sales_acct_id), [party rows]) element for inspection.
candidate_party_set.take(1)

[((34359750349, 203799433.0),
  [Row(id=34359750349, candidate_party=45471, candidate_sales_acct_id=203799433.0, party_id=45471.0, parent_party_id=None, party_name=u'AT&T INC', node_type=u'HQ', address1=u'208 S AKARD ST', address2=u'FL 10', address3=None, address4=None, city=u'DALLAS', county=u'DALLAS', state=u'TX', province=None, postal_code=u'75202', postal_code_extn=u'2255', country_code=u'US', street_name=u'AKARD', street_number=u'208', street_direction=u'S', street_type=u'ST', cleaned_name=u'at&t inc', tokenized_name=[u'at&t', u'inc'])])]

In [None]:
# Debug lookup: show up to two contest rows stored under one specific
# (id, candidate_sales_acct_id) key.
elaborated_contest_data_grouped.filter(lambda x : x[0] == (68719482468, 281554718.0)).take(2)

In [20]:
# Join contest rows with their candidate party lists on the shared key and
# persist the result as a pickle file.  The path is relative and hard-coded;
# saving fails if it already exists, so remove it before re-running this cell.
elaborated_contest_data_grouped.join(candidate_party_set).saveAsPickleFile('temp_joined_all_11.pkl')

In [12]:
# Reload the joined pairs from the pickle file so later cells read the stored
# result instead of recomputing the join.
joined_candidates = sc.pickleFile('temp_joined_all_11.pkl')

In [13]:
# Count how many candidate accounts each contest id has (x[0][0] is the id part
# of the key), collect the counts to the driver and broadcast them as a dict.
candidate_count_tups = joined_candidates.map(lambda x : (x[0][0], 1)).reduceByKey(lambda x, y : x + y).collect()
print('Broadcasting Candidate Counts')
candidate_count_dict = {}
for k, v in candidate_count_tups:
    candidate_count_dict[k] = v
    
candidate_count_dict_bc = sc.broadcast(candidate_count_dict)

Broadcasting Candidate Counts


In [17]:
def add_sparse(vectors):
    """Sum sparse vectors element-wise into one sparse vector.

    Every vector must expose parallel ``indices`` and ``values`` arrays.  The
    result takes its size from the first vector, so ``vectors`` must not be
    empty.
    """
    totals = defaultdict(float)  # an index seen for the first time starts at 0.0
    for vec in vectors:
        for position, weight in zip(vec.indices, vec.values):
            totals[position] += weight
    return Vectors.sparse(vectors[0].size, dict(totals))

def hstack_sparse(sparse_vectors):
    """Concatenate sparse vectors end to end into one sparse vector.

    The indices of each vector are shifted by the combined size of all vectors
    before it; the result's size is the sum of the input sizes.
    """
    stacked = {}
    offset = 0
    for vec in sparse_vectors:
        for position, weight in zip(vec.indices, vec.values):
            stacked[position + offset] = weight
        offset += vec.size
    return Vectors.sparse(offset, stacked)

def list_to_sparse(dense):
    """Wrap a plain sequence of numbers in a SparseVector of the same length.

    Every position is stored explicitly, zeros included.
    """
    return SparseVector(len(dense), dict(enumerate(dense)))

In [18]:
def jaccard(set1, set2):
    """Return the Jaccard similarity |A & B| / |A | B| of two token collections.

    Both arguments may be any iterable of hashable items (set, list, tuple);
    they are converted to sets first, so duplicates do not matter and a list on
    the left-hand side no longer raises AttributeError.

    Returns 0 when both collections are empty, otherwise a float in [0, 1].
    """
    left, right = set(set1), set(set2)
    union_length = len(left | right)
    if union_length == 0:
        return 0
    return float(len(left & right)) / union_length

def set_tokenize(string):
    """Split a name into a set of lower-cased tokens.

    Periods and commas are removed, hyphens become spaces, and the text is then
    split on single spaces (so consecutive spaces yield an empty-string token).

    Returns an empty set for None, so the result always supports set
    operations such as union and intersection.
    """
    if string is None:
        return set()
    normalized = string.lower().replace(".", "").replace("-", " ").replace(",", "")
    return set(normalized.split(" "))

def equality_check(str1, str2):
    """Score whether two values are equal, treating a missing value as unknown.

    Returns 0.5 when either value is None, 1 when both are present and equal,
    and 0 when both are present and differ.
    """
    # Identity test: None is a singleton, and `is` cannot be fooled by a
    # custom __eq__ on the compared objects.
    if str1 is None or str2 is None:
        return 0.5
    return 1 if str1 == str2 else 0

# credits to wikibooks
def longest_common_substring(s1, s2):
    """Return the longest run of characters that occurs contiguously in both strings.

    When several runs share the maximum length, the one that ends first in
    ``s1`` is returned.  Returns an empty slice of ``s1`` when nothing matches.

    Uses ``range`` (available on Python 2 and 3) and keeps only two rows of the
    dynamic-programming table instead of the full len(s1) x len(s2) matrix.
    """
    width = 1 + len(s2)
    # previous[y] = length of the common suffix of s1[:x - 1] and s2[:y]
    previous = [0] * width
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        current = [0] * width
        for y in range(1, width):
            if s1[x - 1] == s2[y - 1]:
                current[y] = previous[y - 1] + 1
                if current[y] > longest:
                    longest = current[y]
                    x_longest = x
        previous = current
    return s1[x_longest - longest: x_longest]

def savm_topic_score(party_row, savm_topic):
    """Score one CR party name against a candidate account's topic-model vectors.

    For each token of ``party_row.tokenized_name`` that is present in the
    broadcast word table and has a positive term frequency in ``savm_topic.tf``,
    an 8-slot vector is built: the word's count_docs, the account's normalized
    tf-idf, tf and idf for that word, then the word's gini and its
    min/avg/max tf-idf.

    Returns a two-item list ``[score, features]``:
      * ``score`` is the smallest count_docs among the matched words, or the
        placeholder's 0 when no word matched;
      * ``features`` is the matched-word vectors in ascending count_docs order,
        laid out by concat_and_pad_vectors with its default k, followed by one
        slot holding the number of tokens that matched nothing.
    """
    vocabulary = topic_modeling_words_broadcast.value
    topic_scores = []
    unseen_words = 0

    # A missing token list is treated as a name without tokens.
    for token in party_row.tokenized_name or []:
        matched = False
        word_data = vocabulary.get(token)
        if word_data is not None:
            word_index = word_data['index']
            # The membership test keeps indices the account's vector does not
            # hold away from the item lookup, which can fail for them.
            if word_index in savm_topic.tf.indices and savm_topic.tf[word_index] > 0:
                topic_scores.append(list_to_sparse([
                    word_data['count_docs'],
                    savm_topic.normalized_tfidf[word_index],
                    savm_topic.tf[word_index],
                    savm_topic.idf[word_index],
                    word_data['gini'],
                    word_data['min_tfidf'],
                    word_data['avg_tfidf'],
                    word_data['max_tfidf'],
                ]))
                matched = True
        # Only tokens that contributed no vector count as unseen; counting every
        # token would merely repeat the length of the name.
        # NOTE(review): "unseen" is read here as "unknown word or word absent
        # from this account" -- confirm that is the intended feature.
        if not matched:
            unseen_words += 1

    if not topic_scores:
        # All-zero placeholder so the downstream layout always has one vector.
        topic_scores = [Vectors.sparse(8, [0], [0])]

    # Slot 0 of every vector is count_docs; rarest words come first.
    scored_vectors = sorted(((topic[0], topic) for topic in topic_scores), key = lambda x : x[0])

    final_subvector = hstack_sparse([concat_and_pad_vectors(scored_vectors), list_to_sparse([unseen_words])])

    return [min(point[0] for point in topic_scores), final_subvector]

def best_string_scores(prior_party_name, end_customer_fixed_name, party_rows, savm, k = 5):
    """Return the ``k`` highest Jaccard token similarities between contest-side and SAVM names.

    Contest-side names are the ``cleaned_name`` of every row in ``party_rows``
    plus ``prior_party_name`` and ``end_customer_fixed_name``; each is compared
    with the token collection of every row in ``savm``.

    The scores are sorted in descending order before the first ``k`` are taken,
    so the result really holds the best matches rather than whichever pairs
    happened to be compared first.  Fewer than ``k`` scores are padded with 0.

    Returns a sparse vector of length ``k`` (built by list_to_sparse).
    """
    party_names = [row.cleaned_name for row in party_rows] + [prior_party_name, end_customer_fixed_name]
    # set() guards against a tokenizer result that is not already a set, which
    # jaccard's set operations could not handle.
    party_name_tokens = [set(set_tokenize(name)) for name in party_names]
    # NOTE(review): this reads a `tokenized` attribute, while the CR rows shown
    # elsewhere in this notebook carry `tokenized_name` -- confirm the SAVM rows
    # really expose `tokenized`.
    savm_party_name_tokens = [row.tokenized for row in savm]

    jaccard_scores = [
        jaccard(party_tokens, savm_tokens)
        for party_tokens in party_name_tokens
        for savm_tokens in savm_party_name_tokens
    ]
    jaccard_scores.sort(reverse = True)

    if len(jaccard_scores) < k:
        jaccard_scores = jaccard_scores + [0] * (k - len(jaccard_scores))

    return list_to_sparse(jaccard_scores[:k])
            
def concat_and_pad_vectors(scored_vectors, k = 10):
    """Lay the first ``k`` scored vectors side by side in one sparse vector.

    ``scored_vectors`` is a non-empty sequence whose items carry a vector at
    position 1.  When it holds fewer than ``k`` items, the missing slots are
    filled with an all-zero vector as long as the first item's vector; items
    beyond the first ``k`` are ignored.
    """
    filler = Vectors.sparse(len(scored_vectors[0][1]), {})
    available = len(scored_vectors)
    slots = [
        scored_vectors[slot][1] if slot < available else filler
        for slot in range(k)
    ]
    return hstack_sparse(slots)

def featurize_pairwise(contest_data, party_rows, k = 10):
    """Build the feature vector for one (contest line, candidate sales account) pair.

    Parameters:
      contest_data -- joined contest row; the fields read are
          candidate_sales_acct_id, id, sales_acct_id, prior_party_name,
          end_customer_line_fix and every column named in party_values.
      party_rows -- the CR party rows collected for this pair.
      k -- not used by the body; kept so existing calls stay valid.

    Returns ``(contest_data.id, contest_data.sales_acct_id, features)`` where
    ``features`` stacks, in order: the best name-similarity scores, one match
    flag per party_values column, one parent-party count per party_values
    column, the topic vectors of the five highest-scoring party rows, and three
    counts (party rows, SAVM rows, candidates for this contest id).

    Raises KeyError when the candidate account id is missing from
    savm_parsed_bc or savm_topic_bc (the recorded notebook run failed this way
    for account 203697034.0).
    """
    savm_rows = savm_parsed_bc.value[contest_data.candidate_sales_acct_id]
    savm_topic = savm_topic_bc.value[contest_data.candidate_sales_acct_id]

    # cr_(all) * savm_(all)
    party_match_vector = [0] * len(party_values)
    hq_party_match_vector = [0] * len(party_values)

    hq_party_counter = Counter()
    for row in savm_rows:
        hq_party_counter[row.parent_party_id] += 1

    for i, party_value in enumerate(party_values):
        for row in savm_rows:
            if row.party_id == contest_data[party_value]:
                party_match_vector[i] = 1
            # NOTE(review): this assignment runs for every SAVM row, matched or
            # not, so each slot ends up holding the parent-party count of the
            # LAST row.  Left unchanged; confirm whether it was meant to depend
            # on the party match.
            hq_party_match_vector[i] = hq_party_counter[row.parent_party_id]

    party_match_vector = list_to_sparse(party_match_vector)
    hq_party_match_vector = list_to_sparse(hq_party_match_vector)

    # cr_i * savm_(all): one [score, topic vector] entry per party row,
    # best score first, cut or padded to five entries.
    scored_party_vectors = [savm_topic_score(party_row, savm_topic) for party_row in party_rows]
    scored_party_vectors.sort(key = lambda x : x[0], reverse = True)
    scored_party_vectors = concat_and_pad_vectors(scored_party_vectors, k = 5)

    fuzzy_vector = best_string_scores(contest_data.prior_party_name, contest_data.end_customer_line_fix, party_rows, savm_rows)

    other_features = list_to_sparse([
            len(party_rows),
            len(savm_rows),
            # A Broadcast wrapper cannot be indexed itself; the dict lives in .value.
            candidate_count_dict_bc.value[contest_data.id]
        ])

    final_vector_stack = hstack_sparse([
                                fuzzy_vector,
                                party_match_vector,
                                hq_party_match_vector,
                                scored_party_vectors,
                                other_features
        ])

    return (contest_data.id, contest_data.sales_acct_id, final_vector_stack)

In [15]:
# Small debugging sample of the joined pairs: fraction 0.00001, without
# replacement, cached.  No seed is passed, so the sampled rows differ per run.
tiny_joined = joined_candidates.repartition(2000).sample(False, 0.00001).cache()

In [17]:
# Number of pairs in the debugging sample.
tiny_joined.count()

47

In [19]:
# Smoke test: featurize the sampled pairs and show five results, each paired
# with the candidate account id taken from the key (x[0][1]).
tiny_joined.map(lambda x : (x[0][1], featurize_pairwise(x[1][0], x[1][1]))).take(5)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 49 in stage 26.0 failed 4 times, most recent failure: Lost task 49.3 in stage 26.0 (TID 22965, hdprd-c01-r06-06.cisco.com): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/mapr/spark/spark-1.6.1/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
  File "<ipython-input-19-93f9efab0684>", line 1, in <lambda>
  File "<ipython-input-18-fa6a3f4490d7>", line 110, in featurize_pairwise
KeyError: 203697034.0

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/mapr/tmp/hadoop-tmp/hadoop-mapr/nm-local-dir/usercache/hdpsndbx55/appcache/application_1480840206892_477358/container_e13_1480840206892_477358_01_000021/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/mapr/spark/spark-1.6.1/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
  File "<ipython-input-19-93f9efab0684>", line 1, in <lambda>
  File "<ipython-input-18-fa6a3f4490d7>", line 110, in featurize_pairwise
KeyError: 203697034.0

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)


In [35]:
# Fetch one ((id, candidate_sales_acct_id), (contest row, [party rows])) element
# for inspection.
joined_candidates.take(1)

[((360777258232, 215394434.0),
  (Row(id=360777258232, candidate_sales_acct_id=215394434.0, end_customer_party_ssot_party_id_int_sav_party_id=12116286, prior_party_ssot_party_id_int_sav_party_id=264446441, sol_branch_party=12116286, sol_gu_party=33661, sol_hq_party=33661, order_level_branch_party=12116286, order_level_gu_party=33661, order_level_hq_party=33661, line_level_branch_party=12116286, line_level_gu_party=33661, line_level_hq_party=33661, ship_to_branch_party=264446441, ship_to_gu_party=264446441, ship_to_hq_party=264446441, bill_to_branch_party=156785794, bill_to_gu_party=147183, bill_to_hq_party=12228, sold_to_branch_party=12228, sold_to_gu_party=147183, sold_to_hq_party=6320533, sales_acct_id=203707095.0, prior_party_name=u'no ca', end_customer_line_fix=u'kaiser foundation health plan inc'),
   [Row(id=360777258232, party_id_candidate=38811247.0, candidate_sales_acct_id=215394434.0, party_id=38811247.0, parent_party_id=15501.0, party_name=u'SBC COMMUNICATIONS', node_type=u'

In [28]:
# Featurize every joined pair and turn the result into a DataFrame.
# x[0][1] is the candidate account of the pair, while featurize_pairwise returns
# (id, true sales_acct_id, features).  The second map therefore puts the true
# account first and the candidate third so that each value lands under the
# column that names it; the column order of the frame is unchanged.
featured_pairs = joined_candidates \
                .map(lambda x : (x[0][1], featurize_pairwise(x[1][0], x[1][1]))) \
                .map(lambda pair : (pair[1][1], pair[1][0], pair[0], pair[1][2])) \
                .toDF(['truth_sales_acct_id', 'id', 'candidate_sales_acct_id', 'features'])

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 39.0 failed 4 times, most recent failure: Lost task 0.3 in stage 39.0 (TID 64203, hdprd-c01-r09-08.cisco.com): ExecutorLostFailure (executor 8 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 8.4 GB of 8 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)


In [40]:
# Add the training label: 1 when the candidate account equals the true account
# of the contest line, 0 otherwise.
featured_pairs = featured_pairs.withColumn('label', F.when(F.col('candidate_sales_acct_id') == F.col('truth_sales_acct_id'), 1).otherwise(0))

In [42]:
# Persist the labelled feature rows as table ignite.training_set_direct_id,
# replacing any existing table of that name.
featured_pairs.write.saveAsTable('ignite.training_set_direct_id', mode = 'overwrite')

In [None]:
# Load a stored training set.
# NOTE(review): this reads ignite.training_set_id, whereas the save cell in this
# notebook writes ignite.training_set_direct_id -- confirm which table is meant.
training_set_id = sqlContext.sql('select * from ignite.training_set_id')

In [3]:
# Filter the training set down to a single id.  F is pyspark.sql.functions from
# the import cell; on a fresh kernel that cell has to run first (the recorded
# output of this cell is a NameError for F).
training_set_id.where(F.col('id') == 207467426.0)

NameError: name 'F' is not defined