In [6]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
import nltk

In [7]:
# Set spark.sql.shuffle.partitions to 50 through a SQL SET statement.
# The call returns the key/value DataFrame that is echoed as this cell's output.
sqlContext.sql("set spark.sql.shuffle.partitions=50")

DataFrame[key: string, value: string]

In [1]:
# Read every column of edso_ignite.contest_savm, redistribute the rows over
# 50 partitions and mark the result for caching; later cells read from `cr`.
cr = (
    sqlContext
    .sql('select * from edso_ignite.contest_savm')
    .repartition(50)
    .cache()
)

In [2]:
# Fetch the first 10 rows to eyeball the schema and some sample values.
# NOTE(review): the displayed rows contain party names and street addresses;
# consider clearing this output before the notebook is shared.
cr.take(10)

[Row(party_id=10698.0, parent_party_id=146850.0, party_name=u'MARUBENI AMERICA CORPORATION', node_type=u'HQ', address1=u'375 LEXINGTON AVE', address2=None, address3=None, address4=None, city=u'NEW YORK', county=u'NEW YORK', state=u'NY', province=None, postal_code=u'10017', postal_code_extn=u'3914', country_code=u'US', street_name=u'LEXINGTON', street_number=u'375', street_direction=None, street_type=u'AVE', geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2002-11-11 22:12:45.0', end_date=u'4712-12-31 00:00:00.0', program_id=46063.0, request_id=103145420.0, created_by=1116.0, last_updated_by=316142.0, creation_date=u'2002-11-11 22:12:45.0', last_update_date=u'2014-02-15 05:43:29.0', certified_date=u'2014-02-15 05:43:29.0', site_expl_id=374542503.0, conflict_batch_id=51527995.0, sa_member_id=922275.0, parent_sa_member_id=-1.0, party_level=1.0, link_party_id=10698.0, link_party_type=u'ORGANIZATION', split_pct=100.0, sales_acct_id=203

In [4]:
def apply_function(df, fields, function):
    """Select every column of ``df``, transforming the ones named in ``fields``.

    The columns are visited in ``df.columns`` order. A column whose name is in
    ``fields`` is replaced by ``function(name)``; any other column is passed
    through by name. Entries of ``fields`` that are not columns of ``df`` are
    simply never matched.

    :param df: object exposing ``columns`` and ``select`` (a DataFrame here).
    :param fields: collection of column names to transform.
    :param function: callable taking a column name and returning the
        expression to select in its place.
    :return: whatever ``df.select`` returns for the assembled list.
    """
    selection = [
        function(name) if name in fields else name
        for name in df.columns
    ]
    return df.select(selection)

def tokenize(column):
    """Build a Column expression that normalises the text in ``column``.

    The expression applies a Python UDF to the named column and is aliased
    back to the same name, so it can stand in for the original column in a
    ``select``.

    For each value the UDF:

    * returns ``""`` for a null;
    * treats ``(``, ``)``, ``-`` and ``,`` as word separators;
    * collapses every run of whitespace into a single space;
    * appends one trailing space to a non-empty result, so that values can be
      concatenated back to back without the neighbouring words fusing.

    A value with no words at all (empty, blank, or separators only) also
    yields ``""``, the same as a null. As a result the output never contains
    leading or doubled spaces, and splitting it on single whitespace
    characters cannot produce empty tokens.

    :param column: name of the string column to normalise.
    :return: the UDF applied to ``column``, aliased to ``column``.
    """
    def parse(d):
        if d is None:
            return ""
        for separator in "()-,":
            d = d.replace(separator, " ")
        # split() with no argument discards the empty strings that
        # split(" ") would keep for adjacent separators such as ", ".
        words = d.split()
        if not words:
            return ""
        return " ".join(words) + " "

    return F.udf(parse, StringType())(F.col(column)).alias(column)

def full_concatenation(space_delims):
    """Return the characters of ``space_delims`` as a flat list, spaces dropped.

    The string is split on single spaces and each piece is unpacked into its
    individual characters, so ``"ab cd"`` gives ``['a', 'b', 'c', 'd']``.

    NOTE(review): the result is a list of characters, not of words. If a list
    of words was intended, this needs changing - confirm with the author.

    :param space_delims: space-delimited string.
    :return: list of one-character strings.
    """
    return [char for piece in space_delims.split(" ") for char in piece]

In [8]:
# Text columns to normalise; every other column of `cr` is selected as-is.
string_fields = [
    'party_name',
    'address1', 'address2', 'address3', 'address4',
    'city', 'county', 'state', 'postal_code',
    'street_name', 'street_number', 'street_direction', 'street_type',
]

# Replace each listed column with its `tokenize` expression.
builder = apply_function(cr, string_fields, tokenize)

In [9]:

# Add a "concat" column holding the normalised text columns joined end to end,
# in `string_fields` order. The column list is taken from `string_fields`
# rather than repeated here, so the two cannot drift apart. Each of these
# columns went through `tokenize`, which turns nulls into '' and ends
# non-empty values with a space, so adjacent values stay word-separated.
builder = builder.withColumn(
    "concat",
    F.concat(*map(F.col, string_fields))
)

In [10]:
# `cleaned` is a second name for `builder`; no extra cleaning is applied here.
cleaned = builder

In [11]:
# Split the "concat" text into a token array stored in "all_tokenized".
words = (
    Tokenizer(inputCol='concat', outputCol='all_tokenized')
    .transform(cleaned)
)

# Hash the token array into the "vector" column; HashingTF is left at its
# default settings.
hashed = (
    HashingTF(inputCol='all_tokenized', outputCol='vector')
    .transform(words)
)

In [12]:
# Fetch one row to inspect the result of the tokenize/hash pipeline.
hashed.take(1)

[Row(party_id=10698.0, parent_party_id=146850.0, party_name=u'MARUBENI AMERICA CORPORATION ', node_type=u'HQ', address1=u'375 LEXINGTON AVE ', address2=u'', address3=u'', address4=u'', city=u'NEW YORK ', county=u'NEW YORK ', state=u'NY ', province=None, postal_code=u'10017 ', postal_code_extn=u'3914', country_code=u'US', street_name=u'LEXINGTON ', street_number=u'375 ', street_direction=u'', street_type=u'AVE ', geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2002-11-11 22:12:45.0', end_date=u'4712-12-31 00:00:00.0', program_id=46063.0, request_id=103145420.0, created_by=1116.0, last_updated_by=316142.0, creation_date=u'2002-11-11 22:12:45.0', last_update_date=u'2014-02-15 05:43:29.0', certified_date=u'2014-02-15 05:43:29.0', site_expl_id=374542503.0, conflict_batch_id=51527995.0, sa_member_id=922275.0, parent_sa_member_id=-1.0, party_level=1.0, link_party_id=10698.0, link_party_type=u'ORGANIZATION', split_pct=100.0, sales_acct_i

In [13]:
# Persist only the party id and its hashed vector as table ignite.savm_hashed.
# NOTE(review): no save mode is given, so re-running this cell presumably
# fails once the table exists - confirm and pick a mode explicitly.
hashed.select(['party_id', 'vector']).write.saveAsTable("ignite.savm_hashed")