In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
import nltk

In [2]:
# Set spark.sql.shuffle.partitions to 50 for this session.
# NOTE(review): `sqlContext` is never created in this notebook; presumably the
# PySpark kernel injects it - confirm before running elsewhere.
sqlContext.sql("set spark.sql.shuffle.partitions=50")

DataFrame[key: string, value: string]

In [32]:
# Load both parsed tables in full, repartition them (50 partitions for cr,
# 100 for savm) and mark the results for caching.
cr = sqlContext.sql('select * from ignite.cr_parsed').repartition(50).cache()
savm = sqlContext.sql('select * from ignite.savm_parsed').repartition(100).cache()

In [4]:
# Fetch a single row so the available columns and sample values can be inspected.
cr.take(1)

[Row(party_id=192656253.0, parent_party_id=146753444.0, party_name=u'OSK SECURITIES THAILAND PCL', node_type=u'BR', address1=u'10TH FLOOR', address2=None, address3=None, address4=None, city=u'BANGKOK', county=None, state=u'BANGKOK', province=None, postal_code=u'10500', postal_code_extn=None, country_code=u'TH', street_name=None, street_number=None, street_direction=None, street_type=None, geo_valid_status=u'GEO_UNCERTIFIED_COUNTRY', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2012-07-19 01:30:58.0', end_date=u'4712-12-31 00:00:01.0', program_id=None, request_id=None, created_by=307958.0, last_updated_by=307958.0, creation_date=u'2012-07-19 01:30:58.0', last_update_date=u'2012-07-19 01:36:57.0', certified_date=u'2012-07-19 01:30:58.0', cleaned_name=u'OSK SECURITIES THAILAND PCL', tokenized_name=[u'osk', u'securities', u'thailand', u'pcl'])]

In [22]:
def apply_function(df, fields, function):
    """Select every column of ``df``, transforming only the ones named in ``fields``.

    Args:
        df: object exposing ``columns`` (list of column names) and ``select``.
        fields: a column name, or an iterable of column names, to transform.
        function: callable that takes a column name and returns the expression
            to select in place of that column.

    Returns:
        The result of ``df.select`` over the full column list, in the original
        column order, with the targeted columns replaced by ``function(name)``.
    """
    # A bare string must be treated as ONE column name. Testing `column in "concat"`
    # directly is a substring match, so columns such as "con" or "cat" would be
    # transformed by accident.
    try:
        string_types = (basestring,)  # Python 2: covers both str and unicode
    except NameError:
        string_types = (str,)  # Python 3
    if isinstance(fields, string_types):
        fields = [fields]
    targets = frozenset(fields)

    return df.select([
        function(column) if column in targets else column
        for column in df.columns
    ])

def clean_name(name):
    """Normalize a free-text value: strip separator punctuation, lower-case, squeeze spaces.

    Args:
        name: the raw string, or None.

    Returns:
        "" when ``name`` is None; otherwise the lower-cased text with
        ``- , / ( )`` turned into spaces, every ". " turned into a single space,
        and all runs of whitespace collapsed (leading/trailing removed).
    """
    if name is None:
        return ""
    # Punctuation that separates words rather than belonging to them.
    for separator in ("-", ",", "/", "(", ")"):
        name = name.replace(separator, " ")
    # Only a period followed by a space is dropped, so "3.5" and a final
    # "inc." keep their dots.
    name = name.replace(". ", " ").lower()
    return " ".join(name.split())

def tokenize(column):
    """Return a column expression that runs ``clean_name`` over ``column``.

    Despite the name, nothing is split into tokens here: the expression yields
    the cleaned string and is aliased back to the same column name, so it can
    replace the original column in a select.
    """
    cleaner = F.udf(clean_name, StringType())
    source = F.col(column)
    return cleaner(source).alias(column)

def remove_extra_spaces(column):
    """Return a column expression that collapses whitespace runs in ``column``.

    The expression is aliased to ``column`` so it can replace the original
    column in a select. Leading and trailing whitespace is removed and every
    internal run of whitespace becomes a single space.
    """
    def _collapse(value):
        # A null cell reaches the UDF as None; calling .split() on it would
        # fail the whole job. Map it to "" the same way clean_name does.
        if value is None:
            return ""
        return ' '.join(value.split())

    return F.udf(_collapse, StringType())(F.col(column)).alias(column)

def full_concatenation(space_delims):
    """Flatten ``space_delims`` into a list of its characters, minus the spaces split on.

    The string is split on single spaces and every piece is expanded character
    by character, so "ab cd" yields ['a', 'b', 'c', 'd'].

    NOTE(review): a per-character result looks unintended for a function fed
    space-delimited text (a list of words may have been the goal) - confirm
    before relying on it. No caller is visible in this notebook.
    """
    return [char for piece in space_delims.split(" ") for char in piece]

In [15]:
# Free-text columns that are passed through `tokenize` before hashing.
string_fields = [
    'party_name',
    'address1', 'address2', 'address3', 'address4',
    'city', 'county', 'state', 'postal_code',
    'street_name', 'street_number', 'street_direction', 'street_type',
]


In [23]:
# Clean every free-text column of cr (None -> "", punctuation stripped, lower-cased).
builder = apply_function(cr, string_fields, tokenize)

# Join the cleaned columns, in string_fields order, into one space-separated
# string. concat_ws places the separator between every pair of columns, so
# adjacent values such as postal_code and street_name are never fused into a
# single token.
builder = builder.withColumn("concat", F.concat_ws(" ", *map(F.col, string_fields)))

# Empty fields leave runs of spaces behind; squeeze them. The target is passed
# as a list because apply_function tests membership with `in`, which on a bare
# string would be a substring match.
builder = apply_function(builder, ['concat'], remove_extra_spaces)

In [24]:
# `cleaned` is just another name for the frame built in the previous cell.
cleaned = builder
# Split the `concat` string into a list of tokens (column `all_tokenized`).
words = Tokenizer(inputCol = 'concat', outputCol = 'all_tokenized').transform(cleaned)
# Hash the tokens into a term-frequency vector (column `vector`); no numFeatures
# is passed, so the library default vector size applies.
hashed = HashingTF(inputCol = 'all_tokenized', outputCol = 'vector').transform(words)

In [26]:
# Fetch a single hashed row to eyeball the cleaned columns, tokens and vector.
hashed.take(1)

[Row(party_id=192656253.0, parent_party_id=146753444.0, party_name=u'osk securities thailand pcl', node_type=u'BR', address1=u'10th floor', address2=u'', address3=u'', address4=u'', city=u'bangkok', county=u'', state=u'bangkok', province=None, postal_code=u'10500', postal_code_extn=None, country_code=u'TH', street_name=u'', street_number=u'', street_direction=u'', street_type=u'', geo_valid_status=u'GEO_UNCERTIFIED_COUNTRY', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2012-07-19 01:30:58.0', end_date=u'4712-12-31 00:00:01.0', program_id=None, request_id=None, created_by=307958.0, last_updated_by=307958.0, creation_date=u'2012-07-19 01:30:58.0', last_update_date=u'2012-07-19 01:36:57.0', certified_date=u'2012-07-19 01:30:58.0', cleaned_name=u'OSK SECURITIES THAILAND PCL', tokenized_name=[u'osk', u'securities', u'thailand', u'pcl'], concat=u'osk securities thailand pcl 10th floor bangkok bangkok 10500', all_tokenized=[u'osk', u'securities', u'thailand', u'pc

In [28]:
# Persist only party_id and its hashed vector as table ignite.cr_hashed;
# mode 'overwrite' replaces any existing table of that name.
hashed.select(['party_id', 'vector']).write.saveAsTable("ignite.cr_hashed", mode = 'overwrite')

In [33]:
# Clean every free-text column of savm (None -> "", punctuation stripped, lower-cased).
builder = apply_function(savm, string_fields, tokenize)

# Join the cleaned columns, in string_fields order, into one space-separated
# string. concat_ws places the separator between every pair of columns, so
# adjacent values such as postal_code and street_name are never fused into a
# single token.
builder = builder.withColumn("concat", F.concat_ws(" ", *map(F.col, string_fields)))

# Empty fields leave runs of spaces behind; squeeze them. The target is passed
# as a list because apply_function tests membership with `in`, which on a bare
# string would be a substring match.
builder = apply_function(builder, ['concat'], remove_extra_spaces)

In [34]:
# Same tokenize-and-hash steps as for cr, now on the savm frame. This rebinds
# `cleaned`, `words` and `hashed`, so the cr versions are no longer reachable
# under those names.
cleaned = builder
# Split the `concat` string into a list of tokens (column `all_tokenized`).
words = Tokenizer(inputCol = 'concat', outputCol = 'all_tokenized').transform(cleaned)
# Hash the tokens into a term-frequency vector (column `vector`); no numFeatures
# is passed, so the library default vector size applies.
hashed = HashingTF(inputCol = 'all_tokenized', outputCol = 'vector').transform(words)

In [35]:
# Persist only party_id and its hashed vector as table ignite.savm_hashed;
# mode 'overwrite' replaces any existing table of that name.
hashed.select(['party_id', 'vector']).write.saveAsTable("ignite.savm_hashed", mode = 'overwrite')