In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
from IPython.display import display

import operator
from collections import defaultdict, Counter
from sklearn import metrics

In [2]:
sqlContext.sql("set spark.sql.shuffle.partitions=5000")

DataFrame[key: string, value: string]

In [3]:
party_values = [
    'end_customer_party_ssot_party_id_int_sav_party_id',
    'prior_party_ssot_party_id_int_sav_party_id',
    'sol_branch_party',
    'sol_gu_party',
    'sol_hq_party',
    'order_level_branch_party',
    'order_level_gu_party',
    'order_level_hq_party',
    'line_level_branch_party',
    'line_level_gu_party',
    'line_level_hq_party',
    'ship_to_branch_party',
    'ship_to_gu_party',
    'ship_to_hq_party',
    'bill_to_branch_party',
    'bill_to_gu_party',
    'bill_to_hq_party',
    'sold_to_branch_party',
    'sold_to_gu_party',
    'sold_to_hq_party'
]

address_fields = [
    'address1', 'address2', 'address3', 'address4',
    'city', 'county', 'state', 'postal_code', 'street_name',
    'street_number', 'street_direction', 'street_type'
]


In [4]:
savm_parsed = sqlContext.sql("select * from ignite.savm_parsed").repartition(100).cache()
#temp because hadoop disk can't keep up...
party_expansion = sqlContext.sql("select * from ignite.party_expansion_temp").select(party_values +
        [
            'id', 'sales_acct_id', 'prior_party_name', 'end_customer_line_fix'
        ]).cache()

In [5]:
lsh_candidates = sqlContext.sql('select * from ignite.lsh_savm_candidates_2').withColumn('candidate_party_id', F.col('party_id_candidate'))
direct_candidates = sqlContext.sql('select * from ignite.direct_id_candidates').withColumn('candidate_party_id', F.col('candidate_party'))
columns = ['id', 'candidate_sales_acct_id', 'candidate_party_id']
candidate_gen = lsh_candidates.select(columns).unionAll(direct_candidates.select(columns))

In [6]:
topic_modeling_savm = sqlContext.sql('select * from ignite.topic_modeling_savm_tfidf')
topic_modeling_words = sqlContext.sql('select * from ignite.topic_modeling_per_word').cache()
cr_parsed = sqlContext.sql("select * from ignite.cr_parsed").repartition(500).cache()

In [7]:
# thin out whatever we can
def drop_columns(df, columns):
    return df.select([c for c in df.columns if c not in columns])

savm_parsed = drop_columns(savm_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date', 'site_expl_id', 'conflict_batch_id', 'sa_member_id', 'parent_sa_member_id',
                          'link_party_type', 'account_type', 'operation_type'
                    ])
cr_parsed = drop_columns(cr_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date'
                    ])


In [64]:

builder = savm_parsed.fillna(-1, ['party_id', 'parent_party_id'])
builder = builder.fillna("", ['address1', 'address2', 'city', 'state', 'postal_code', 'country_code', 'cleaned_name'])

columnized_savm = builder.groupby('sales_acct_id').agg(
    F.collect_list('party_id').alias('savm_party_ids'),
    F.collect_list('parent_party_id').alias('savm_parent_party_ids'),
    F.collect_list('address1').alias('savm_address_1'),
    F.collect_list('address2').alias('savm_address_2'),
    F.collect_list('city').alias('savm_city'),
    F.collect_list('state').alias('savm_state'),
    F.collect_list('postal_code').alias('savm_postal_code'),
    F.collect_list('country_code').alias('savm_country_code'),
    F.collect_list('split_pct').alias('savm_split_pct'),
    F.collect_list('cleaned_name').alias('savm_cleaned_name'),
)

def trim_list(x, k = 100):
    if len(x) > k:
        return x[:k]
    return x

def select_columns(columnized_savm):
    columns = []
    for c in columnized_savm.columns:
        if c == 'sales_acct_id':
            columns.append(F.col('sales_acct_id').alias('savm_sales_acct_id'))
        else:
            columns.append(c)
    return columns

def apply_function(df, fields, function):
    for field in fields:
        df = df.withColumn(field, function(field))
    return df

columnized_savm = columnized_savm.select(select_columns(columnized_savm))


In [65]:
columnized_savm = apply_function(columnized_savm, ['savm_party_ids', 'savm_parent_party_ids'], lambda c : F.udf(trim_list, ArrayType(IntegerType()))(F.col(c).cast(ArrayType(IntegerType()))).alias(c))
columnized_savm = apply_function(columnized_savm, [
        'savm_address_1', 'savm_address_2', 'savm_city', 'savm_state',
        'savm_postal_code', 'savm_country_code', 
        'savm_split_pct', 'savm_cleaned_name'
    ], lambda c : F.udf(trim_list, ArrayType(StringType()))(F.col(c)).alias(c))
columnized_savm = columnized_savm.repartition(2000, 'savm_sales_acct_id')

In [66]:
columnized_savm.where(F.col('savm_sales_acct_id') == 203850076.0).take(5)

[Row(savm_sales_acct_id=203850076.0, savm_party_ids=[43490895, 232764067, 281649435, 280839189, 231861733, 239847484, 122792312, 250486581, 232935868, 244994938, 234635568, 246804156, 44142969, 233580088, 135492702, 254747825, 245749845, 224895319, 204222472, 247309862, 240752903, 246400438, 158016724, 269138341, 235363178, 138559180, 232137898, 234644335, 87536854, 205029960, 258355630, 234622741, 271361593, 256874456, 233674402, 211815965, 249313159, 280809386, 235868837, 244439617, 185479220, 274273679, 247314251, 281395787, 271741272, 269988782, 247551788, 44701422, 246637068, 170671506, 256242944, 253856154, 255665078, 181209640, 158843395, 43490847, 255079090, 45505862, 235367192, 10795311, 9351509, 270906855, 32084038, 232621400, 228900124, 270703843, 111707483, 115418156, 245839334, 269619617, 256103028, 83671198, 248945881, 275003045, 248324373, 255665465, 238165170, 270841034, 9872712, 84884762, 112372717, 4880293, 232396179, 233522272, 222744242, 257903413, 266389002, 236981

In [8]:
columnized_savm.count()

500019

In [67]:
columnized_savm.write.saveAsTable('ignite.temp_columnized_savm', mode = 'overwrite')

In [68]:
columnized_savm = sqlContext.sql('select * from ignite.temp_columnized_savm')

In [12]:
local_topic_modeling_words = topic_modeling_words.collect()
topic_modeling_dict = {}
for row in local_topic_modeling_words:
    topic_modeling_dict[row.word] = row
    
topic_modeling_words_broadcast = sc.broadcast(topic_modeling_dict)

In [32]:
topic_modeling_words_broadcast.value['orange']

Row(index=333, norm_max=0.19117041108616217, min_tfidf=3.9981968494108535e-05, avg_tfidf=0.6507507212780906, max_tfidf=4.069136284938971, count_docs=403, word=u'orange')

In [89]:
def tf_reduction(row):
    threshold = len(row.words) / 100
    word_indices = []
    tf = []
    idf = []
    normalized_tfidf = []
    for index in range(len(row.tf.indices)):
        if row.tf.values[index] > threshold:
            word_indices.append(int(row.tf.indices[index]))
            tf.append(float(row.tf.values[index]))
            idf.append(float(row.idf.values[index]))
            normalized_tfidf.append(float(row.normalized_tfidf.values[index]))
    return row.sales_acct_id, word_indices, tf, idf, normalized_tfidf

topic_modeling_reduced = topic_modeling_savm.map(tf_reduction) \
                                .toDF(['candidate_sales_acct_id', 'word_indices', 'tf', 'idf', 'normalized_tfidf'])

In [15]:
# just tie candidates to contest data (cr parties comes later)
contest_data_grouped = candidate_gen.select(['id', 'candidate_sales_acct_id']).drop_duplicates().join(party_expansion, on = 'id').cache()

In [16]:
candidates_with_cr = candidate_gen.join(cr_parsed, on = F.col('candidate_party_id') == F.col('party_id')).repartition(2000, 'id')
candidates_with_cr = candidates_with_cr.fillna(-1, ['party_id', 'parent_party_id'])
candidates_with_cr = candidates_with_cr.fillna("", ['address1', 'address2', 'address3', 'address4', 'city', 'state', 'postal_code', 'country_code', 'cleaned_name'])

In [17]:
columnized_cr = candidates_with_cr.groupby('id', 'candidate_sales_acct_id').agg(
    F.collect_list('party_id').alias('cr_party_ids'),
    F.collect_list('parent_party_id').alias('cr_parent_party_ids'),
    F.collect_list('address1').alias('cr_address1'),
    F.collect_list('address2').alias('cr_address2'),
    F.collect_list('address3').alias('cr_address3'),
    F.collect_list('address4').alias('cr_address4'),
    F.collect_list('city').alias('cr_city'),
    F.collect_list('state').alias('cr_state'),
    F.collect_list('postal_code').alias('cr_postal_code'),
    F.collect_list('country_code').alias('cr_country_code'),
    F.collect_list('cleaned_name').alias('cr_cleaned_name'),
)

columnized_cr = columnized_cr.repartition(2000).cache()

In [14]:
columnized_cr

DataFrame[id: bigint, candidate_sales_acct_id: double, cr_party_ids: array<double>, cr_parent_party_ids: array<double>, cr_address1: array<string>, cr_address2: array<string>, cr_address3: array<string>, cr_address4: array<string>, cr_city: array<string>, cr_state: array<string>, cr_postal_code: array<string>, cr_country_code: array<string>, cr_cleaned_name: array<string>]

In [18]:
joined_candidates = columnized_cr.join(contest_data_grouped, on = ['id', 'candidate_sales_acct_id']).cache()
joined_candidates = joined_candidates.withColumn('truth_sales_acct_id', F.col('sales_acct_id')).drop('sales_acct_id')

In [19]:
def add_sparse(vectors):
    values = defaultdict(float) # Dictionary with default value 0.0
    # Add values from v1
    for v in vectors:
        for i in range(v.indices.size):
            values[v.indices[i]] += v.values[i]
    return Vectors.sparse(vectors[0].size, dict(values))

def hstack_sparse(sparse_vectors):
    values = {}
    index = 0
    for vector in sparse_vectors:
        for i in range(vector.indices.shape[0]):
            values[vector.indices[i] + index] = vector.values[i]
        index += vector.size
    return Vectors.sparse(index, values)

def list_to_sparse(dense):
    values = {}
    for i, v in enumerate(dense):
        values[i] = v
    return SparseVector(len(dense), values)

In [35]:
def jaccard(set1, set2):
    union_length = len(set1.union(set2))
    if union_length == 0:
        return 0
    return float(len(set1.intersection(set2))) / union_length

def set_tokenize(string):
    if string == None:
        return []
    split = string.lower().replace(".", "").replace("-", " ").replace(",", "").split(" ")
    return set(split)

def equality_check(str1, str2):
    if str1 == None or str2 == None:
        return 0.5
    if str1 != str2:
        return 0
    return 1

# credits to wikibooks
def longest_common_substring(s1, s2):
    m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in xrange(1, 1 + len(s1)):
        for y in xrange(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    return s1[x_longest - longest: x_longest]

def savm_topic_score(cleaned_name, full_row, word_index_mapping):
    topic_scores = [Vectors.sparse(8, [0], [0])]
    
    unseen_words = 0
    words_not_in_savm = 0
    
    tokens = set_tokenize(cleaned_name)
    
    for token in tokens:
            
        #idk how to fix the out of bounds error
        if token in topic_modeling_words_broadcast.value:
            word_data = topic_modeling_words_broadcast.value[token]

            if word_data['index'] in full_row.word_indices:
                straight_vector_index = word_index_mapping[word_data['index']]
                topic_vector = [
                    word_data['count_docs'],
                    full_row.normalized_tfidf[straight_vector_index],
                    full_row.tf[straight_vector_index],
                    full_row.idf[straight_vector_index],
                    word_data['norm_max'],
                    word_data['min_tfidf'],
                    word_data['avg_tfidf'],
                    word_data['max_tfidf'],
                ]
                topic_scores.append(list_to_sparse(topic_vector))
            else:
                words_not_in_savm += 1
        else:
            unseen_words += 1

    if len(topic_scores) > 1:
        topic_scores = topic_scores[1:] # remove the placeholder
        
    scored_vectors = [(topic[0], topic) for topic in topic_scores]
    scored_vectors.sort(key = lambda x : x[0])
    
    final_subvector = hstack_sparse([concat_and_pad_vectors(scored_vectors), list_to_sparse([unseen_words, words_not_in_savm, len(tokens)])])
    
    return [min([point[0] for point in topic_scores]), final_subvector]

def best_string_scores(full_row, k = 5):
    jaccard_scores = []
    longest_run = 0
    longest_string = None
    
    savm_cleaned_names = full_row.savm_cleaned_name
    
    party_names = full_row.cr_cleaned_name + [full_row.prior_party_name, full_row.end_customer_line_fix]
    
    party_name_tokens = [set_tokenize(name) for name in party_names]
    savm_party_name_tokens = [set_tokenize(name) for name in savm_cleaned_names]
    
    for i in range(len(party_names)):
        for j in range(len(savm_cleaned_names)):
            jaccard_scores.append(jaccard(party_name_tokens[i], savm_party_name_tokens[j]))
            #lcs = longest_common_substring(party_names[i], savm_names[j]).strip()
            #if len(lcs) > longest_run:
            #    longest_run = len(lcs)
            #    longest_string = lcs
    
    if len(jaccard_scores) < k:
        jaccard_scores = jaccard_scores + [0] * (k - len(jaccard_scores))
    
    #return list_to_sparse([jaccard_score, longest_run, len(longest_string.split(" "))])
    return list_to_sparse(jaccard_scores[:k])

def address_similarity_score(full_row):
    # really basic stuff
    matching_zips = 0
    for cr_zip_code in full_row.cr_postal_code:
        if cr_zip_code in full_row.savm_postal_code:
            matching_zips += 1
    
    for cr_address_1 in full_row.cr_address_1:
        if cr_address_1 in full_row.savm_address_1:
            matching_savms += 1
    
    return list_to_sparse([matching_zips, matching_savms])
    # zip code
    '''
    zip_code_match = [-1] * 5
    savm_zip_counter = Counter()
    for zip_code in full_row.savm_postal_code:
        for zip_index in range(1, len(zip_code)):
            savm_zip_counter[zip_index[:zip_index]] += 1
    
    for zip_code in full_row.cr_postal_code:
        for zip_index in range(1, len(zip_code) - 1):
            pass
    '''
            
def concat_and_pad_vectors(scored_vectors, k = 10):
    empty_vector = Vectors.sparse(len(scored_vectors[0][1]), {})
    
    combined_pairwise_vectors = []
    for i in range(k):
        if i < len(scored_vectors):
            combined_pairwise_vectors.append(scored_vectors[i][1])
        else:
            combined_pairwise_vectors.append(empty_vector)
    
    return hstack_sparse(combined_pairwise_vectors)

def unravel_ints(raveled_string):
    ints = []
    for elem in raveled_string.split(","):
        if elem != '':
            ints.append(int(elem))
        else:
            ints.append(None)
    return ints

def unravel_floats(raveled_string):
    floats = []
    for elem in raveled_string.split(","):
        if elem != '':
            floats.append(float(elem))
        else:
            floats.append(None)
    return floats

def featurize_pairwise(full_row, k = 10):

    savm_parent_party_ids = full_row.savm_parent_party_ids
    
    savm_parent_party_id_counter = Counter(savm_parent_party_ids)
    
    party_match_vector = [0] * len(party_values)
    hq_party_match_vector = [0] * len(party_values)
    for i, party_value in enumerate(party_values):
        party_match_vector[i] = 1 if full_row[party_value] in savm_parent_party_ids else 0
        for j, key in enumerate(savm_parent_party_id_counter.keys()):
            if key == full_row[party_value] and key != None:
                hq_party_match_vector[i] = savm_parent_party_id_counter[key]
        
    party_match_vector = list_to_sparse(party_match_vector)
    hq_party_match_vector = list_to_sparse(hq_party_match_vector)
    
    # cr_i * savm_i
    # count the number of candidates that are direct or hq matching
    
    num_party_match = 0
    num_hq_party_match = 0
    
    savm_party_ids = full_row.savm_party_ids
    savm_party_ids_set = set(savm_party_ids)
    
    for cr_party in full_row.cr_party_ids:
        if cr_party in savm_party_ids_set:
            num_party_match += 1
        if cr_party in savm_parent_party_id_counter.keys():
            num_hq_party_match += 1
    
    cr_savm_party_match_vector = list_to_sparse([num_party_match, num_hq_party_match])
    
    # cr_i * savm_(all)

    word_index_mapping = {}
    for i, index in enumerate(full_row.word_indices):
        word_index_mapping[index] = i
    
    scored_party_vectors = []
    for cr_party_cleaned_name in full_row.cr_cleaned_name:
        scored_party_vectors.append(savm_topic_score(cr_party_cleaned_name, full_row, word_index_mapping))
    
    scored_party_vectors.sort(key = lambda x : x[0], reverse = True)
    scored_party_vectors = concat_and_pad_vectors(scored_party_vectors, k = 10)
    
    fuzzy_vector = best_string_scores(full_row)
    
    # cr_(all) * savm_(all)
    
    address_vector = address_similarity_score(full_row)
    
    other_features = list_to_sparse([
        len(full_row.cr_party_ids),
        len(savm_party_ids)
    ])

    final_vector_stack = hstack_sparse([
            party_match_vector,
            hq_party_match_vector,
            cr_savm_party_match_vector,
            fuzzy_vector,
            scored_party_vectors,
            address_vector,
            other_features
        ])
    
    return (full_row.id, full_row.candidate_sales_acct_id, full_row.truth_sales_acct_id, final_vector_stack)

In [21]:
sample = joined_candidates.sample(False, 0.01).repartition(1000).cache()

In [21]:
sample.map(featurize_pairwise).count()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job 6 cancelled because Stage 7 was cancelled
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1370)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply$mcVI$sp(DAGScheduler.scala:1358)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply(DAGScheduler.scala:1357)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply(DAGScheduler.scala:1357)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofInt.foreach(ArrayOps.scala:156)
	at org.apache.spark.scheduler.DAGScheduler.handleStageCancellation(DAGScheduler.scala:1357)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1613)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)


In [26]:
sample_joined = sample.join(topic_modeling_reduced, on = 'candidate_sales_acct_id').repartition(1000)

In [199]:
sample_joined.count()

170770

In [36]:
featured_pairs = sample_joined.map(featurize_pairwise).cache()#.toDF(['id', 'candidate_sales_acct_id', 'feature'])

In [90]:
full_joined = joined_candidates.join(topic_modeling_reduced, on = 'candidate_sales_acct_id')
full_joined = full_joined.join(columnized_savm, on = F.col('candidate_sales_acct_id') == F.col('savm_sales_acct_id')).repartition(5000)

In [91]:
# forcing disk resolves some memory issues
full_joined.write.saveAsTable('ignite.temp_final_step', mode = 'overwrite')
#final_step = sqlContext.sql('select * from ignite.temp_final_step_direct_id')

In [4]:
final_step = sqlContext.sql('select * from ignite.temp_final_step')

In [104]:
featured_pairs = final_step.map(featurize_pairwise).toDF(['id', 'candidate_sales_acct_id', 'truth_sales_acct_id', 'features'])

In [105]:
featured_pairs.write.saveAsTable('ignite.training_set_direct_lsh', mode = 'overwrite')

In [2]:
features = sqlContext.sql("select * from ignite.training_set_direct_lsh")

In [None]:
final_step.where(F.col('id') == 42949685346).where(F.col('candidate_sales_acct_id') == 203768691.0).take(2)

In [3]:
features.sample(False, 0.00001).take(5)

[Row(id=180388636079, candidate_sales_acct_id=203715470.0, truth_sales_acct_id=203744058.0, features=SparseVector(879, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: 0.0, 39: 0.0, 40: 1.0, 41: 0.0, 42: 0.0, 43: 0.0, 44: 0.5, 45: 0.0, 46: 0.0, 47: 28.0, 48: 2.6102, 49: 217.0, 50: 2116.8583, 51: 0.3178, 52: 0.0003, 53: 1.5604, 54: 3.2517, 55: 2594.0, 56: 0.5838, 57: 90.0, 58: 473.4955, 59: 0.1007, 60: 0.0001, 61: 0.3026, 62: 2.6305, 127: 0.0, 128: 0.0, 129: 2.0, 877: 1.0, 878: 100.0})),
 Row(id=42949685346, candidate_sales_acct_id=203768691.0, truth_sales_acct_id=203842177.0, features=SparseVector(879, {0: 0.0, 1: 0.0, 2: 0.0, 3: 1.0, 4: 1.0, 5: 0.0, 6: 1.0, 7: 1.0, 8: 0.0, 9: 1.0, 10: 1.0, 11: 0.0, 12: