In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

import pandas as pd
import numpy as np
from IPython.display import display

from collections import defaultdict, Counter
from sklearn import metrics
from datetime import datetime

In [2]:
# TODO: consider consecutive dots without spaces, e.g. "K.F.C." -- only a dot
# that is followed by a space is currently treated as a separator.
def clean_name(name):
    """Normalize a party name into lower-case, single-space-separated text.

    The characters ``-``, ``,``, ``/``, ``(`` and ``)`` and the sequence
    ``". "`` are each replaced by a space, the text is lower-cased, and runs
    of whitespace are collapsed. A dot that is not followed by a space
    (including a trailing dot) is kept.

    Args:
        name: Raw name string, or None.

    Returns:
        The cleaned name, or None when ``name`` is None.
    """
    # A null name would otherwise raise AttributeError on ``.replace``.
    if name is None:
        return None
    for separator in ("-", ",", "/", "(", ")"):
        name = name.replace(separator, " ")
    # Must run after the separator pass so that e.g. ".," also becomes a break.
    name = name.replace(". ", " ").lower()
    return " ".join(name.split())

def apply_function(df, fields, function):
    """Select every column of ``df``, transforming the ones named in ``fields``.

    Args:
        df: Object exposing ``columns`` (iterable of column names) and
            ``select`` (called once with a list).
        fields: Collection of column names that should be transformed.
        function: Callable that takes a column name and returns the
            expression to select in its place.

    Returns:
        The result of ``df.select`` on the list of untouched column names and
        transformed expressions, in the original column order.
    """
    selection = [
        function(name) if name in fields else name
        for name in df.columns
    ]
    return df.select(selection)

# Timestamp taken when this cell runs. ``pd.datetime`` was only an alias of
# ``datetime.datetime``; the alias is deprecated and missing from newer pandas
# releases, so use the class imported from the standard library directly.
max_date = datetime.today()

def parse_datetime(column):
    """Build a column expression that parses a timestamp-string column.

    Values are expected to look like ``2002-11-12 04:56:43.0`` (the literal
    ``.0`` suffix is part of the format). The UDF is declared with
    ``DateType()``, so the resulting column holds dates.

    Args:
        column: Name of the string column to parse.

    Returns:
        The UDF applied to ``F.col(column)``, aliased back to ``column``.
        Null input values yield null.
    """
    def _parse(value):
        # Nulls pass through; strptime(None, ...) would raise TypeError.
        if value is None:
            return None
        return datetime.strptime(value, "%Y-%m-%d %H:%M:%S.0")

    return F.udf(_parse, DateType())(F.col(column)).alias(column)

In [28]:
# Build ignite.cr_parsed from edso_ignite.contest_cr: add a cleaned and a
# tokenized version of party_name, parse the two audit date columns, and
# overwrite the target table.
cr = sqlContext.sql("select * from edso_ignite.contest_cr").repartition(100).cache()
builder = apply_function(
    Tokenizer(inputCol='cleaned_name', outputCol='tokenized_name').transform(
        cr.withColumn('cleaned_name', F.udf(clean_name, StringType())(F.col('party_name')))
    ),
    ['creation_date', 'last_update_date'],
    parse_datetime,
)
builder.write.saveAsTable("ignite.cr_parsed", mode='overwrite')

In [4]:
# Preview: parse the two audit date columns of edso_ignite.contest_savm and
# show ten rows.
builder = apply_function(
    sqlContext.sql("select * from edso_ignite.contest_savm").repartition(100).cache(),
    ['creation_date', 'last_update_date'],
    parse_datetime,
)
builder.take(10)

[Row(party_id=1052351.0, parent_party_id=42499.0, party_name=u'PIZZA HUT, INC.', node_type=u'BR', address1=u'7491 SAINT ANDREWS RD', address2=None, address3=None, address4=None, city=u'IRMO', county=u'RICHLAND', state=u'SC', province=None, postal_code=u'29063', postal_code_extn=u'2857', country_code=u'US', street_name=u'SAINT ANDREWS', street_number=u'7491', street_direction=None, street_type=u'RD', geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2002-11-12 04:56:43.0', end_date=u'4712-12-31 00:00:00.0', program_id=None, request_id=None, created_by=1116.0, last_updated_by=-1.0, creation_date=datetime.date(2002, 11, 12), last_update_date=datetime.date(2012, 12, 26), certified_date=u'2012-12-26 04:01:31.0', site_expl_id=479699542.0, conflict_batch_id=87615898.0, sa_member_id=3755144.0, parent_sa_member_id=-1.0, party_level=2.0, link_party_id=42499.0, link_party_type=u'ORGANIZATION', split_pct=50.0, sales_acct_id=276652661.0, operat

In [8]:
# Build ignite.savm_parsed from edso_ignite.contest_savm: add a cleaned and a
# tokenized version of party_name, parse the two audit date columns, and
# overwrite the target table.
savm = sqlContext.sql("select * from edso_ignite.contest_savm").repartition(100).cache()
builder = apply_function(
    Tokenizer(inputCol='cleaned_name', outputCol='tokenized_name').transform(
        savm.withColumn('cleaned_name', F.udf(clean_name, StringType())(F.col('party_name')))
    ),
    ['creation_date', 'last_update_date'],
    parse_datetime,
)
builder.write.saveAsTable("ignite.savm_parsed", mode='overwrite')

In [10]:
# Sanity check: read a single row back from ignite.savm_parsed.
(
    sqlContext
    .sql('select * from ignite.savm_parsed')
    .take(1)
)

[Row(party_id=1049197.0, parent_party_id=28360.0, party_name=u'K F C NATIONAL MANAGEMENT COMPANY', node_type=u'BR', address1=u'321 E FORDHAM RD', address2=None, address3=None, address4=None, city=u'BRONX', county=u'BRONX', state=u'NY', province=None, postal_code=u'10458', postal_code_extn=u'5051', country_code=u'US', street_name=u'FORDHAM', street_number=u'321', street_direction=u'E', street_type=u'RD', geo_valid_status=u'GEO_VALID', completenes_status=u'COMPLETE', cleansed_status=u'CLEANSED', start_date=u'2002-11-12 04:50:50.0', end_date=u'4712-12-31 00:00:00.0', program_id=None, request_id=None, created_by=1116.0, last_updated_by=332355.0, creation_date=datetime.date(2002, 11, 12), last_update_date=datetime.date(2014, 11, 16), certified_date=u'2011-03-22 16:40:02.0', site_expl_id=479693272.0, conflict_batch_id=87615874.0, sa_member_id=3755141.0, parent_sa_member_id=-1.0, party_level=2.0, link_party_id=28360.0, link_party_type=u'ORGANIZATION', split_pct=50.0, sales_acct_id=276652661.0