In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
from IPython.display import display

import operator
from collections import defaultdict, Counter
from sklearn import metrics

In [2]:
sqlContext.sql("set spark.sql.shuffle.partitions=5000")

DataFrame[key: string, value: string]

In [3]:
party_values = [
    'end_customer_party_ssot_party_id_int_sav_party_id',
    'prior_party_ssot_party_id_int_sav_party_id',
    'sol_branch_party',
    'sol_gu_party',
    'sol_hq_party',
    'order_level_branch_party',
    'order_level_gu_party',
    'order_level_hq_party',
    'line_level_branch_party',
    'line_level_gu_party',
    'line_level_hq_party',
    'ship_to_branch_party',
    'ship_to_gu_party',
    'ship_to_hq_party',
    'bill_to_branch_party',
    'bill_to_gu_party',
    'bill_to_hq_party',
    'sold_to_branch_party',
    'sold_to_gu_party',
    'sold_to_hq_party'
]

address_fields = [
    'address1', 'address2', 'address3', 'address4',
    'city', 'county', 'state', 'postal_code', 'street_name',
    'street_number', 'street_direction', 'street_type'
]


In [4]:
savm_parsed = sqlContext.sql("select * from ignite.savm_parsed").repartition(100).cache()
#temp because hadoop disk can't keep up...
party_expansion = sqlContext.sql("select * from ignite.party_expansion_temp").select(party_values +
        [
            'id', 'sales_acct_id', 'prior_party_name', 'end_customer_line_fix'
        ]).cache()

In [132]:
party_expansion.take(1)

[Row(end_customer_party_ssot_party_id_int_sav_party_id=217501313, prior_party_ssot_party_id_int_sav_party_id=217501313, sol_branch_party=217501313, sol_gu_party=2346701, sol_hq_party=2346701, order_level_branch_party=217501313, order_level_gu_party=2346701, order_level_hq_party=2346701, line_level_branch_party=217501313, line_level_gu_party=2346701, line_level_hq_party=2346701, ship_to_branch_party=217501313, ship_to_gu_party=2346701, ship_to_hq_party=2346701, bill_to_branch_party=128171, bill_to_gu_party=128171, bill_to_hq_party=128171, sold_to_branch_party=128171, sold_to_gu_party=128171, sold_to_hq_party=128171, id=68719483849, sales_acct_id=203728339.0, prior_party_name=u'j2 global communications', end_customer_line_fix=u'j2 global communications')]

In [5]:
id_candidate_gen = sqlContext.sql('select * from ignite.direct_id_candidates').unionAll(sqlContext.sql('select * from ignite.parent_expansion_candidates')).repartition(1000).drop_duplicates().cache()
lsh_candidate_gen = sqlContext.sql('select * from ignite.lsh_savm_candidates_2').cache()
candidate_match_status = sqlContext.sql('select * from ignite.candidate_match_status').cache()
topic_modeling_savm = sqlContext.sql('select * from ignite.topic_modeling_savm_tfidf')
topic_modeling_words = sqlContext.sql('select * from ignite.topic_modeling_per_word').cache()
cr_parsed = sqlContext.sql("select * from ignite.cr_parsed").repartition(500).cache()

In [8]:
# thin out whatever we can
def drop_columns(df, columns):
    return df.select([c for c in df.columns if c not in columns])

savm_parsed = drop_columns(savm_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date', 'site_expl_id', 'conflict_batch_id', 'sa_member_id', 'parent_sa_member_id',
                          'link_party_type', 'account_type', 'operation_type'
                    ])
cr_parsed = drop_columns(cr_parsed, ['geo_valid_status', 'completenes_status', 'cleansed_status', 'start_date', 'end_date',
                          'program_id', 'request_id', 'created_by', 'last_updated_by', 'creation_date', 'last_update_date',
                          'certified_date'
                    ])


In [9]:
columnized_savm = sqlContext.sql('select * from ignite.temp_columnized_savm')

In [17]:
collect_sales_acct_ids = columnized_savm.map(lambda x : x.savm_sales_acct_id).distinct().collect()

In [20]:
index_mapping = []
for i, sales_acct_id in enumerate(collect_sales_acct_ids):
    index_mapping.append((i, sales_acct_id))
sqlContext.createDataFrame(pd.DataFrame(index_mapping, columns = ['broadcast_index', 'savm_sales_acct_id'])).write.saveAsTable('ignite.broadcast_index')

In [21]:
broadcast_index_mapping = sqlContext.sql('select * from ignite.broadcast_index').map(lambda x : [x.broadcast_index, x.savm_sales_acct_id]).collect()

In [23]:
index_mapping = {}
for i, sales_acct_id in broadcast_index_mapping:
    index_mapping[sales_acct_id] = i

In [45]:
savm_array_party_id = {}
for row in columnized_savm.collect():
    savm_array_party_id[index_mapping[row.savm_sales_acct_id]] = ' '.join([str(party_id) for party_id in row.savm_party_ids])
    
savm_broadcast_party_id = sc.broadcast(savm_array_party_id)

In [10]:
local_savm_dicts = defaultdict(dict)
keys = columnized_savm.take(1)[0].asDict().keys()
for row in columnized_savm.collect():
    for key in keys:
        local_savm_dicts[key][row.savm_sales_acct_id] = row[key]

savm_broadcasts = {}
for key in keys:
    savm_broadcasts[key] = sc.broadcast(local_savm_dicts[key])

In [26]:
joined_candidates = sqlContext.sql('select * from ignite.temp_joined_candidates_2')

In [27]:
def add_sparse(vectors):
    values = defaultdict(float) # Dictionary with default value 0.0
    # Add values from v1
    for v in vectors:
        for i in range(v.indices.size):
            values[v.indices[i]] += v.values[i]
    return Vectors.sparse(vectors[0].size, dict(values))

def hstack_sparse(sparse_vectors):
    values = {}
    index = 0
    for vector in sparse_vectors:
        for i in range(vector.indices.shape[0]):
            values[vector.indices[i] + index] = vector.values[i]
        index += vector.size
    return Vectors.sparse(index, values)

def list_to_sparse(dense):
    values = {}
    for i, v in enumerate(dense):
        values[i] = v
    return SparseVector(len(dense), values)

In [35]:
def jaccard(set1, set2):
    union_length = len(set1.union(set2))
    if union_length == 0:
        return 0
    return float(len(set1.intersection(set2))) / union_length

def set_tokenize(string):
    if string == None:
        return []
    split = string.lower().replace(".", "").replace("-", " ").replace(",", "").split(" ")
    return set(split)

def equality_check(str1, str2):
    if str1 == None or str2 == None:
        return 0.5
    if str1 != str2:
        return 0
    return 1

# credits to wikibooks
def longest_common_substring(s1, s2):
    m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in xrange(1, 1 + len(s1)):
        for y in xrange(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    return s1[x_longest - longest: x_longest]

def savm_topic_score(cleaned_name, savm_topic):
    topic_scores = [Vectors.sparse(8, [0], [0])]
    
    unseen_words = 0
    
    for token in set_tokenize(cleaned_name):
            
        #idk how to fix the out of bounds error
        if token in topic_modeling_words_broadcast.value:
            word_data = topic_modeling_words_broadcast.value[token]

            if word_data['index'] in savm_topic.tf.indices and savm_topic.tf[word_data['index']] > 0:

                topic_vector = [
                    word_data['count_docs'],
                    savm_topic.normalized_tfidf[word_data['index']],
                    savm_topic.tf[word_data['index']],
                    word_data['gini'],
                    word_data['min_tfidf'],
                    word_data['avg_tfidf'],
                    word_data['max_tfidf'],
                ]
                topic_scores.append(list_to_sparse(topic_vector))
                
        unseen_words += 1

    if len(topic_scores) > 1:
        topic_scores = topic_scores[1:] # remove the placeholder
        
    scored_vectors = [(topic[0], topic) for topic in topic_scores]
    scored_vectors.sort(key = lambda x : x[0])
    
    final_subvector = hstack_sparse([concat_and_pad_vectors(scored_vectors), list_to_sparse([unseen_words])])
    
    return [min([point[0] for point in topic_scores]), final_subvector]

def best_string_scores(full_row,  k = 5):
    jaccard_scores = []
    longest_run = 0
    longest_string = None
    
    savm_cleaned_name = savm_broadcasts['savm_cleaned_name'].value[full_row.candidate_sales_acct_id_right]
    
    party_names = full_row.cr_cleaned_name + [full_row.prior_party_name, full_row.end_customer_line_fix]
    
    party_name_tokens = [set_tokenize(name) for name in party_names]
    savm_party_name_tokens = [set_tokenize(name) for name in savm_cleaned_name]
    
    for i in range(len(party_names)):
        for j in range(len(savm_cleaned_name)):
            jaccard_scores.append(jaccard(party_name_tokens[i], savm_party_name_tokens[j]))
            #lcs = longest_common_substring(party_names[i], savm_names[j]).strip()
            #if len(lcs) > longest_run:
            #    longest_run = len(lcs)
            #    longest_string = lcs
    
    if len(jaccard_scores) < k:
        jaccard_scores = jaccard_scores + [0] * (k - len(jaccard_scores))
    
    #return list_to_sparse([jaccard_score, longest_run, len(longest_string.split(" "))])
    return list_to_sparse(jaccard_scores[:k])
            
def concat_and_pad_vectors(scored_vectors, k = 10):
    empty_vector = Vectors.sparse(len(scored_vectors[0][1]), {})
    
    combined_pairwise_vectors = []
    for i in range(k):
        if i < len(scored_vectors):
            combined_pairwise_vectors.append(scored_vectors[i][1])
        else:
            combined_pairwise_vectors.append(empty_vector)
    
    return hstack_sparse(combined_pairwise_vectors)

def featurize_party_match_vector(full_row, k = 10):

    #savm_parent_party_ids = savm_broadcasts['savm_parent_party_ids'].value[full_row.candidate_sales_acct_id_right]
    savm_party_ids = savm_broadcast_party_ids.value[full_row.candidate_sales_acct_id_right]
    return None
    savm_parent_party_id_counter = Counter()
    for parent_party_id in savm_parent_party_ids:
        savm_parent_party_id_counter[parent_party_id] += 1
    
    party_match_vector = [0] * len(party_values)
    hq_party_match_vector = [0] * len(party_values)
    for i, party_value in enumerate(party_values):
        party_match_vector[i] = 1 if full_row[party_value] in savm_parent_party_ids else 0
        for j, key in enumerate(savm_parent_party_ids):
            if key == full_row[party_value] and key != None:
                hq_party_match_vector[i] = savm_parent_party_id_counter[key]
        
    party_match_vector = list_to_sparse(party_match_vector)
    hq_party_match_vector = list_to_sparse(hq_party_match_vector)
    
        # count the number of candidates that are direct or hq matching
    
    num_party_match = 0
    num_hq_party_match = 0
    
    savm_party_ids = savm_broadcasts['savm_party_ids'].value[full_row.candidate_sales_acct_id_right]

    for cr_party in full_row.cr_party_ids:
        for i, savm_party in enumerate(savm_party_ids):
            if cr_party == savm_party:
                num_party_match += 1
            if cr_party == savm_parent_party_ids[i]:
                num_hq_party_match += 1
    
    cr_savm_party_match_vector = list_to_sparse([num_party_match, num_hq_party_match])
    
    other_features = list_to_sparse([
        len(full_row.cr_party_ids),
        len(savm_party_ids)
    ])

    final_vector_stack = hstack_sparse([
            other_features,
            party_match_vector,
            hq_party_match_vector,
            cr_savm_party_match_vector
        ])
    
    return (full_row.id, full_row.candidate_sales_acct_id_right, final_vector_stack)

def featurize_topic_vector(full_row, k = 10):

    # cr_i * savm_(all)
    savm_topic = savm_topic_bc.value[full_row.candidate_sales_acct_id_right]
    
    scored_party_vectors = []
    for cr_party_cleaned_name in full_row.cr_cleaned_name:
        # score, topic vector
        scored_party_vectors.append(savm_topic_score(cr_party_cleaned_name, savm_topic))

    scored_party_vectors.sort(key = lambda x : x[0], reverse = True)
    scored_party_vectors = concat_and_pad_vectors(scored_party_vectors, k = 5)
    
    fuzzy_vector = best_string_scores(full_row)    
    
    final_vector_stack = hstack_sparse([
                                fuzzy_vector,
                                scored_party_vectors, 
        ])
    return (full_row.id, full_row.candidate_sales_acct_id_right, final_vector_stack)

In [29]:
sample = joined_candidates.sample(False, 0.001).repartition(1000).cache()

In [36]:
sample = sample.join(sqlContext.sql('select * from ignite.broadcast_index'), on = F.col('candidate_sales_acct_id_right') == F.col('savm_sales_acct_id'))

AnalysisException: u"Reference 'savm_sales_acct_id' is ambiguous, could be: savm_sales_acct_id#785, savm_sales_acct_id#1367.;"

In [32]:
sample.count()

17044

In [46]:
featured_pairs = sample.map(featurize_party_match_vector).cache()#.toDF(['id', 'candidate_sales_acct_id', 'feature'])

In [47]:
featured_pairs.count()

17044

In [None]:
featured_pairs = joined_candidates.map(featurize_pairwise).toDF(['id', 'candidate_sales_acct_id', 'feature'])

In [28]:
id_truth_mapping = party_expansion.select(['id', F.col('sales_acct_id').alias('truth_sales_acct_id')]).drop_duplicates()
builder = featured_pairs.join(id_truth_mapping, on = 'id', how = 'left')
builder = builder.withColumn('label', F.when(F.col('candidate_sales_acct_id') == F.col('truth_sales_acct_id'), 1).otherwise(0))

In [24]:
builder.count()

1691

In [29]:
builder.write.saveAsTable('ignite.training_set_lsh', mode = 'overwrite')

In [None]:
training_set_id = sqlContext.sql('select * from ignite.training_set_id')

In [18]:
sqlContext.sql('drop table ignite.training_set_lsh')

DataFrame[]