In [1]:
import os; import sys; import re

# common spark import
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType

# Connect to Spark unless a session is already bound to `spark`
# (for example when this cell is re-run in the same kernel).
if 'spark' not in locals():
  spark = SparkSession.builder \
      .master("local[*]") \
      .appName('development') \
      .config("spark.sql.debug.maxToStringFields", str(1024 * 1024)) \
      .getOrCreate()

# Bind `sc` unconditionally so it is defined even when `spark` was
# created elsewhere and the guarded branch above was skipped.
sc = spark.sparkContext

print("Connected to Spark!")

Connected to Spark!


In [2]:
# Read every file matching the glob as CSV, treating the first line as the header.
test_data_glob = "address-linkage-key/address_link/data/test/*small*.gz"
csv_reader = spark.read.option("header", "true")
df = csv_reader.csv(test_data_glob)

# Preview the first 10 rows as a pandas frame.
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S Main St,,59,2
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 Joseph Ave,,75,2
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 Algonquin Way,,51,5
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N Main St,Apt 2,2,7
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252nd St,,36,4
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 Agean Ct,,84,2
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43rd St,Apt 25E,55,0
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 Muirfield Run,,2,6
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37th St,,75,2
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N Point Blvd,Apt 903,28,0


In [3]:
def blanks_to_null(x):
    """Build a Column expression that maps empty strings in column ``x`` to null.

    Args:
        x: Name of the column to clean.

    Returns:
        A Column holding the value of ``x`` where it is not the empty string,
        and null otherwise (a null input therefore stays null).
    """
    column = f.col(x)
    is_not_blank = column != ""
    return f.when(is_not_blank, column).otherwise(None)

In [4]:
def house_number_extract(df):
    """Upper-case ``address_line_1`` and derive a ``housenumber`` column from it.

    Two extraction patterns are used, chosen per row:

    * address starts with two capital letters: the token following ``BOX``
      plus one whitespace character is taken (digits first, then any mix of
      digits, capitals, ``.``, ``*`` or ``-``);
    * otherwise: the leading token is taken (optional capitals, at least one
      digit, then any mix of digits, capitals, ``.``, ``*`` or ``-``).

    Args:
        df: DataFrame containing an ``address_line_1`` column.

    Returns:
        A DataFrame with ``address_line_1`` upper-cased and a new
        ``housenumber`` column. ``regexp_extract`` produces an empty string
        when the chosen pattern does not match.
    """
    # Upper-case first so the A-Z character classes below see every letter.
    upper_df = df.withColumn('address_line_1', f.upper('address_line_1'))
    address = f.col('address_line_1')

    starts_with_two_letters = address.rlike('^[A-Z]{2}')
    box_number = f.regexp_extract(address, '(BOX\\s)([0-9]+[0-9A-Z.*-]*)', 2)
    leading_number = f.regexp_extract(address, '^([A-Z]*[0-9]+[0-9A-Z.*-]*)', 1)

    return upper_df.withColumn(
        'housenumber',
        f.when(starts_with_two_letters, box_number).otherwise(leading_number))

In [5]:
# Add the housenumber column (address_line_1 is upper-cased as part of this step).
df = house_number_extract(df)

# Preview up to 30 rows.
preview_rows = df.limit(30)
preview_rows.toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,housenumber
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S MAIN ST,,59,2,259
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 JOSEPH AVE,,75,2,75
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 ALGONQUIN WAY,,51,5,8751
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N MAIN ST,Apt 2,2,7,250
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252ND ST,,36,4,1136
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 AGEAN CT,,84,2,41484
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43RD ST,Apt 25E,55,0,350
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 MUIRFIELD RUN,,2,6,7302
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37TH ST,,75,2,11075
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N POINT BLVD,Apt 903,28,0,1950


In [6]:
# Convert empty-string house numbers to null so that the isNull()/isNotNull()
# checks applied to 'housenumber' treat "nothing extracted" as missing.
housenumber_or_null = blanks_to_null('housenumber')
df = df.withColumn('housenumber', housenumber_or_null)

In [7]:
#rule1
# No second address line and a digits-only house number:
# dpc is (up to) the last two characters of the house number.
no_second_line = f.col('address_line_2').isNull()
house_number = f.col('housenumber')
digits_only = house_number.isNotNull() & house_number.rlike('^[0-9]*$')

# when() without otherwise() leaves dpc null for every row the rule skips.
rule1_dpc = f.when(no_second_line & digits_only, house_number.substr(-2, 2))
df = df.withColumn('dpc', rule1_dpc)
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,housenumber,dpc
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S MAIN ST,,59,2,259,59.0
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 JOSEPH AVE,,75,2,75,75.0
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 ALGONQUIN WAY,,51,5,8751,51.0
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N MAIN ST,Apt 2,2,7,250,
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252ND ST,,36,4,1136,36.0
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 AGEAN CT,,84,2,41484,84.0
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43RD ST,Apt 25E,55,0,350,
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 MUIRFIELD RUN,,2,6,7302,2.0
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37TH ST,,75,2,11075,75.0
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N POINT BLVD,Apt 903,28,0,1950,


In [8]:
#rule5
# No second address line, dpc still unset, and a house number made of digits
# followed by capital letters: dpc is (up to) the last two digits of the
# numeric part. All other rows keep their current dpc.
house_number = f.col('housenumber')
rule5_applies = (
    f.col('address_line_2').isNull() &
    house_number.isNotNull() &
    f.col('dpc').isNull() &
    house_number.rlike('^[0-9]+[A-Z]+$'))

# The backslash is doubled so Python does not parse the invalid escape '\d'
# (which raises a warning); the regex engine still receives \d+.
leading_digits = f.regexp_extract(house_number, '(\\d+)', 1)

df = df.withColumn('dpc',
                   f.when(rule5_applies, leading_digits.substr(-2, 2))
                   .otherwise(f.col('dpc')))
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,housenumber,dpc
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S MAIN ST,,59,2,259,59.0
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 JOSEPH AVE,,75,2,75,75.0
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 ALGONQUIN WAY,,51,5,8751,51.0
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N MAIN ST,Apt 2,2,7,250,
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252ND ST,,36,4,1136,36.0
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 AGEAN CT,,84,2,41484,84.0
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43RD ST,Apt 25E,55,0,350,
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 MUIRFIELD RUN,,2,6,7302,2.0
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37TH ST,,75,2,11075,75.0
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N POINT BLVD,Apt 903,28,0,1950,


In [9]:
#rule8

# alphas: the leading token of address_line_1 matched by the pattern below
# (one or more of A-Z/0-9, then a digit, then one word character, i.e. at
# least three characters). regexp_extract yields '' (not null) on no match.
df = df.withColumn('alphas', f.regexp_extract(f.col('address_line_1'), '(^[A-Z0-9]+[0-9]\\w)', 1))

# Update dpc with the trailing one or two digits of alphas when alphas holds
# a value, address_line_2 is null and dpc is still null.
# alphas is compared against '' because an isNotNull() test would also pass
# for the empty string that regexp_extract returns on a non-match.
alphas_has_value = f.col('alphas') != ''
no_second_line = f.col('address_line_2').isNull()
dpc_unset = f.col('dpc').isNull()
trailing_digits = f.regexp_extract(f.col('alphas'), '([0-9]{1,2}$)', 1)

df = df.withColumn('dpc',
        f.when(alphas_has_value & no_second_line & dpc_unset, trailing_digits)
         .otherwise(f.col('dpc')))

# An alphas value that does not end in a digit produces '', so turn blanks
# back into null to keep dpc "unset" for the rules that follow.
df = df.withColumn('dpc', blanks_to_null('dpc'))

In [11]:
#rule10
# Keep any dpc that is already set; otherwise take the digit run that follows
# the first '.', '*' or '-' separator sitting between two digit runs in
# housenumber.
# NOTE(review): this rule does not test address_line_2, unlike the cells
# above that do — confirm that is intended.
digits_after_separator = f.regexp_extract(f.col('housenumber'), '([0-9]+)([.*-])([0-9]+)', 3)
df = df.withColumn('dpc', f.coalesce(f.col('dpc'), digits_after_separator))

# regexp_extract gives '' when the pattern is absent; turn that back into null.
df = df.withColumn('dpc', blanks_to_null('dpc'))

df.limit(30).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,housenumber,dpc,alphas
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S MAIN ST,,59,2,259,59.0,259.0
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 JOSEPH AVE,,75,2,75,75.0,
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 ALGONQUIN WAY,,51,5,8751,51.0,8751.0
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N MAIN ST,Apt 2,2,7,250,,250.0
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252ND ST,,36,4,1136,36.0,1136.0
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 AGEAN CT,,84,2,41484,84.0,41484.0
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43RD ST,Apt 25E,55,0,350,,350.0
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 MUIRFIELD RUN,,2,6,7302,2.0,7302.0
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37TH ST,,75,2,11075,75.0,11075.0
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N POINT BLVD,Apt 903,28,0,1950,,1950.0


In [12]:
#rule13
# Any row whose dpc is still null at this point falls back to the default '99'.
current_dpc = f.col('dpc')
default_dpc = f.lit('99')
df = df.withColumn('dpc', f.coalesce(current_dpc, default_dpc))

In [13]:
#rule3
# Left-pad dpc with '0' to a width of two characters (e.g. '2' becomes '02').
# NOTE(review): lpad is understood to also shorten longer strings to the
# target width — confirm no earlier rule can yield more than two characters.
padded_dpc = f.lpad(f.col('dpc'), 2, '0')
df = df.withColumn('dpc', padded_dpc)
df.limit(30).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,housenumber,dpc,alphas
0,S,Main,St,,,,Aspers,PA,17304,9802,S,259 S MAIN ST,,59,2,259,59,259.0
1,,Joseph,Ave,,,,Westfield,MA,1085,1812,S,75 JOSEPH AVE,,75,2,75,75,
2,,Algonquin,Way,,,,Orangevale,CA,95662,2342,S,8751 ALGONQUIN WAY,,51,5,8751,51,8751.0
3,N,Main,St,,Apt,2,Crown Point,IN,46307,3279,M,250 N MAIN ST,Apt 2,2,7,250,99,250.0
4,,252nd,St,,,,Harbor City,CA,90710,2422,S,1136 252ND ST,,36,4,1136,36,1136.0
5,,Agean,Ct,,,,Murrieta,CA,92562,2118,S,41484 AGEAN CT,,84,2,41484,84,41484.0
6,W,43rd,St,,Apt,25E,New York,NY,10036,6473,M,350 W 43RD ST,Apt 25E,55,0,350,99,350.0
7,,Muirfield,Run,,,,Norcross,GA,30093,6100,S,7302 MUIRFIELD RUN,,2,6,7302,2,7302.0
8,NW,37th,St,,,,Coral Springs,FL,33065,2782,S,11075 NW 37TH ST,,75,2,11075,75,11075.0
9,N,Point,Blvd,,Apt,903,Tallahassee,FL,32308,4181,M,1950 N POINT BLVD,Apt 903,28,0,1950,99,1950.0
