In [1]:
import os; import sys; import re

# common spark import
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType

# Build a Spark session only when this kernel does not already have one, so
# re-running the cell reuses the existing session.
if 'spark' not in locals():
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName('development')
        # 1024 * 1024 is passed as a string for spark.sql.debug.maxToStringFields.
        .config("spark.sql.debug.maxToStringFields", str(1024 * 1024))
        .getOrCreate()
    )
    sc = spark.sparkContext

print("Connected to Spark!")

Connected to Spark!


In [2]:
# Read every test file matching *medium*.gz as CSV, using the first row as the
# column names, then preview the first ten rows.
df = (
    spark.read
    .option("header", "true")
    .csv("address-linkage-key/address_link/data/test/*medium*.gz")
)
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit
0,,Circle,Dr,,,,Fort Morgan,CO,80701,3419,S,525 Circle Dr,,25,0
1,,Primrose,Ave,,,,Vista,CA,92083,8032,S,2312 Primrose Ave,,12,2
2,,Gardenstone,Cir,,,,Tallmadge,OH,44278,1085,S,849 Gardenstone Cir,,49,8
3,,Briarwood,Dr,,,,Crestwood,KY,40014,9019,S,7511 Briarwood Dr,,11,0
4,S,15th,St,,,,Saint Clair,MI,48079,5203,S,1071 S 15th St,,71,4
5,,Front,St,,Apt,4B,Brooklyn,NY,11201,1223,M,206 Front St,Apt 4B,42,1
6,,Millpond,Rd,,,,Elizabeth Cty,NC,27909,7551,S,1295 Millpond Rd,,95,1
7,,Glenwood,Ln,,,,East Meadow,NY,11554,3719,S,479 Glenwood Ln,,79,8
8,NW,9th,St,,Apt,105,Miami,FL,33125,3443,M,2150 NW 9th St,Apt 105,30,9
9,,10th,St,NE,,,Naples,FL,34120,2057,S,460 10th St NE,,60,0


In [3]:
# Start with an empty dpc column. f.lit(None) alone yields an untyped (NullType)
# column, so cast it to string to give dpc a concrete type from the start; the
# rules below only ever store string values in it.
df = df.withColumn('dpc', f.lit(None).cast('string'))

In [4]:
# Extract the leading house number from address_line_1: one digit followed by
# any run of digits, capital letters, '.', '*' or '-'.
# regexp_extract returns '' (not null) when the pattern does not match, so turn
# '' into null; otherwise a later `housenumber.isNull()` test can never detect
# an address that has no house number.
house_number_raw = f.regexp_extract(f.col('address_line_1'), '(^[0-9]([0-9A-Z.*-]+)?)', 1)
df = df.withColumn('housenumber', f.when(house_number_raw != '', house_number_raw))
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,Circle,Dr,,,,Fort Morgan,CO,80701,3419,S,525 Circle Dr,,25,0,,525
1,,Primrose,Ave,,,,Vista,CA,92083,8032,S,2312 Primrose Ave,,12,2,,2312
2,,Gardenstone,Cir,,,,Tallmadge,OH,44278,1085,S,849 Gardenstone Cir,,49,8,,849
3,,Briarwood,Dr,,,,Crestwood,KY,40014,9019,S,7511 Briarwood Dr,,11,0,,7511
4,S,15th,St,,,,Saint Clair,MI,48079,5203,S,1071 S 15th St,,71,4,,1071
5,,Front,St,,Apt,4B,Brooklyn,NY,11201,1223,M,206 Front St,Apt 4B,42,1,,206
6,,Millpond,Rd,,,,Elizabeth Cty,NC,27909,7551,S,1295 Millpond Rd,,95,1,,1295
7,,Glenwood,Ln,,,,East Meadow,NY,11554,3719,S,479 Glenwood Ln,,79,8,,479
8,NW,9th,St,,Apt,105,Miami,FL,33125,3443,M,2150 NW 9th St,Apt 105,30,9,,2150
9,,10th,St,NE,,,Naples,FL,34120,2057,S,460 10th St NE,,60,0,,460


In [6]:
#rule1
# When there is no second address line and the house number is all digits, dpc
# is the last two characters of the house number (the whole number when it is
# shorter than two characters). Every other row gets a null dpc, because the
# when() has no otherwise().
# The pattern uses '+' rather than '*' so that an empty house number cannot
# match and produce an empty-string dpc.
df = df.withColumn('dpc',
                   f.when(
                       f.col('address_line_2').isNull() &
                       f.col('housenumber').isNotNull() &
                       f.col('housenumber').rlike('^[0-9]+$'),
                       f.col('housenumber').substr(-2, 2)
                   ))
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,Circle,Dr,,,,Fort Morgan,CO,80701,3419,S,525 Circle Dr,,25,0,25.0,525
1,,Primrose,Ave,,,,Vista,CA,92083,8032,S,2312 Primrose Ave,,12,2,12.0,2312
2,,Gardenstone,Cir,,,,Tallmadge,OH,44278,1085,S,849 Gardenstone Cir,,49,8,49.0,849
3,,Briarwood,Dr,,,,Crestwood,KY,40014,9019,S,7511 Briarwood Dr,,11,0,11.0,7511
4,S,15th,St,,,,Saint Clair,MI,48079,5203,S,1071 S 15th St,,71,4,71.0,1071
5,,Front,St,,Apt,4B,Brooklyn,NY,11201,1223,M,206 Front St,Apt 4B,42,1,,206
6,,Millpond,Rd,,,,Elizabeth Cty,NC,27909,7551,S,1295 Millpond Rd,,95,1,95.0,1295
7,,Glenwood,Ln,,,,East Meadow,NY,11554,3719,S,479 Glenwood Ln,,79,8,79.0,479
8,NW,9th,St,,Apt,105,Miami,FL,33125,3443,M,2150 NW 9th St,Apt 105,30,9,,2150
9,,10th,St,NE,,,Naples,FL,34120,2057,S,460 10th St NE,,60,0,60.0,460


In [7]:
# Preview ten rows whose first address line contains the text "Box".
df.filter(f.col('address_line_1').contains("Box")).limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,PO Box,,,,,Andover,KS,67002,901,U,901 PO Box,,1,4,1,901
1,,PO Box,,,,,Olive Branch,MS,38654,2104,S,1992 PO Box,,92,6,92,1992
2,,PO Box,,,,,Sweetser,IN,46987,108,U,108 PO Box,,8,9,8,108
3,,PO Box,,,,,Hanover,PA,17331,745,S,745 PO Box,,45,0,45,745
4,,PO Box,,,,,Monrovia,MD,21770,212,U,212 PO Box,,12,5,12,212
5,,PO Box,,,,,Perrin,TX,76486,177,S,177 PO Box,,77,0,77,177
6,,PO Box,,,,,Estill,SC,29918,1351,U,1351 PO Box,,51,5,51,1351
7,,PO Box,,,,,Hattiesburg,MS,39404,7214,U,17214 PO Box,,14,1,14,17214
8,,PO Box,,,,,Fayette,MS,39069,293,U,293 PO Box,,93,7,93,293
9,,PO Box,,,,,Winterville,NC,28590,1249,S,1249 PO Box,,49,7,49,1249


In [6]:
#rule2
# Rows that still have no dpc and also have no house number receive '99';
# every other row keeps whatever dpc it already has.
needs_fallback = f.col('dpc').isNull() & f.col('housenumber').isNull()
df = df.withColumn(
    'dpc',
    f.when(needs_fallback, f.lit('99')).otherwise(f.col('dpc')),
)
df.limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,Circle,Dr,,,,Fort Morgan,CO,80701,3419,S,525 Circle Dr,,25,0,25.0,525
1,,Primrose,Ave,,,,Vista,CA,92083,8032,S,2312 Primrose Ave,,12,2,12.0,2312
2,,Gardenstone,Cir,,,,Tallmadge,OH,44278,1085,S,849 Gardenstone Cir,,49,8,49.0,849
3,,Briarwood,Dr,,,,Crestwood,KY,40014,9019,S,7511 Briarwood Dr,,11,0,11.0,7511
4,S,15th,St,,,,Saint Clair,MI,48079,5203,S,1071 S 15th St,,71,4,71.0,1071
5,,Front,St,,Apt,4B,Brooklyn,NY,11201,1223,M,206 Front St,Apt 4B,42,1,,206
6,,Millpond,Rd,,,,Elizabeth Cty,NC,27909,7551,S,1295 Millpond Rd,,95,1,95.0,1295
7,,Glenwood,Ln,,,,East Meadow,NY,11554,3719,S,479 Glenwood Ln,,79,8,79.0,479
8,NW,9th,St,,Apt,105,Miami,FL,33125,3443,M,2150 NW 9th St,Apt 105,30,9,,2150
9,,10th,St,NE,,,Naples,FL,34120,2057,S,460 10th St NE,,60,0,60.0,460


In [7]:
# Number of rows whose dpc is the bare string '1' (no leading zero).
#172
df.filter(f.col('dpc') == '1').count()

172

In [8]:
# Number of rows whose dpc is already the zero-padded string '01'.
df.filter(f.col('dpc') == '01').count()

2117

In [9]:
# Preview ten rows whose dpc is exactly one character long.
#df.filter(df.dpc == '1').limit(10).toPandas()
df.filter(f.length(f.col('dpc')) == 1).limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,Purdue,Cir,,,,Little Rock,AR,72204,5949,S,1 Purdue Cir,,1,7,1,1
1,,Freeman,Ln,,,,Poquoson,VA,23662,1906,S,8 Freeman Ln,,8,7,8,8
2,,Rolph Park,Ct,,,,Crockett,CA,94525,1414,S,2 Rolph Park Ct,,2,3,2,2
3,,Garlor,Dr,,,,Havertown,PA,19083,1214,S,1 Garlor Dr,,1,0,1,1
4,,Heaton,Cir,,,,Franklin,MA,2038,3368,S,4 Heaton Cir,,4,3,4,4
5,,Mockingbird,Ct,,,,Waterford,NY,12188,1509,S,2 Mockingbird Ct,,2,3,2,2
6,,Hopkins,St,,,,Unadilla,NY,13849,2304,S,8 Hopkins St,,8,8,8,8
7,,Beacham,Pl,,,,Hingham,MA,2043,3331,S,2 Beacham Pl,,2,9,2,2
8,,Pleasant Hill,Rd,,,,Falmouth,ME,4105,1965,S,6 Pleasant Hill Rd,,6,3,6,6
9,,Sweitzer,Dr,,,,Duke Center,PA,16729,9504,S,6 Sweitzer Dr,,6,1,6,6


In [None]:
#rule 3 with lpad (plain version, left disabled; the next cell applies lpad behind a length check)
#df = df.withColumn('dpc', f.lpad('dpc', 2, '0'))
#df.limit(20).toPandas()

In [13]:
#rule 3: bring dpc to a width of two characters
# A dpc that is already two characters long passes through untouched; anything
# else goes through lpad with '0' as the fill character and a target width of 2.
df = df.withColumn(
    'dpc',
    f.when(f.length('dpc') == 2, f.col('dpc'))
     .otherwise(f.lpad('dpc', 2, '0')),
)
df.limit(20).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber
0,,Circle,Dr,,,,Fort Morgan,CO,80701,3419,S,525 Circle Dr,,25,0,25.0,525.0
1,,Primrose,Ave,,,,Vista,CA,92083,8032,S,2312 Primrose Ave,,12,2,12.0,2312.0
2,,Gardenstone,Cir,,,,Tallmadge,OH,44278,1085,S,849 Gardenstone Cir,,49,8,49.0,849.0
3,,Briarwood,Dr,,,,Crestwood,KY,40014,9019,S,7511 Briarwood Dr,,11,0,11.0,7511.0
4,S,15th,St,,,,Saint Clair,MI,48079,5203,S,1071 S 15th St,,71,4,71.0,1071.0
5,,Front,St,,Apt,4B,Brooklyn,NY,11201,1223,M,206 Front St,Apt 4B,42,1,,206.0
6,,Millpond,Rd,,,,Elizabeth Cty,NC,27909,7551,S,1295 Millpond Rd,,95,1,95.0,1295.0
7,,Glenwood,Ln,,,,East Meadow,NY,11554,3719,S,479 Glenwood Ln,,79,8,79.0,479.0
8,NW,9th,St,,Apt,105,Miami,FL,33125,3443,M,2150 NW 9th St,Apt 105,30,9,,2150.0
9,,10th,St,NE,,,Naples,FL,34120,2057,S,460 10th St NE,,60,0,60.0,460.0


In [20]:
# Re-check after padding: list rows whose dpc is still a single character.
df.filter(f.length(f.col('dpc')) == 1).limit(10).toPandas()

Unnamed: 0,predirection,streetname,streetsuffix,postdirection,unitdesignator,unitdesignatornumber,cityname,state,zipcode,zip_4,dwellingtype,address_line_1,address_line_2,expected_dpc,expected_check_digit,dpc,housenumber


In [16]:
# Re-check after padding: count rows whose dpc is still the bare string '1'.
df.filter(f.col('dpc') == '1').count()

0

In [17]:
# Re-check after padding: count rows whose dpc is '01'.
#2289
df.filter(f.col('dpc') == '01').count()

2289

df = df.withColumn('dpc', f.when(f.col('dpc').isNotNull(), f.col('dpc')).otherwise(
    <whatever you want to do>
))

In [None]:
#rule3 mod
# Keep dpc unchanged when the house number contains two consecutive digits;
# otherwise put a '0' in front of dpc.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence that newer Python versions warn about.
df = df.withColumn('dpc',
                   f.when(
                       f.col('housenumber').rlike(r'\d{2}'),
                       f.col('dpc'))
                   .otherwise(f.concat(f.lit('0'), f.col('dpc')))
                  )

In [None]:
#rule3
#when dpc already contains two consecutive digits, leave it unchanged
#otherwise add a leading 0 to dpc
# (the test is on dpc itself, not on housenumber)
# Raw string: '\d' in a plain literal is an invalid escape sequence.
df = df.withColumn('dpc',
                   f.when(
                       f.col('dpc').rlike(r'\d{2}'),
                       f.col('dpc'))
                   .otherwise(f.concat(f.lit('0'), f.col('dpc')))
                  )
# Preview rows for house number '1' to check the result.
df.filter(df.housenumber == '1').limit(10).toPandas()

In [None]:
# Show the first 30 rows as a pandas DataFrame for a visual check.
df.limit(30).toPandas()

In [None]:
# Preview rows whose house number is exactly one digit.
# Raw string so that \d reaches the regex engine without an invalid-escape warning.
df.filter(df.housenumber.rlike(r'^\d{1}$')).limit(10).toPandas()

In [None]:
# Preview rows whose house number contains two consecutive digits anywhere
# (the pattern is not anchored).
# Raw string so that \d reaches the regex engine without an invalid-escape warning.
df.filter(df.housenumber.rlike(r'\d{2}')).limit(10).toPandas()

In [None]:
# Count rows where the house number is '1' and dpc is '01'.
# Each comparison needs its own parentheses: & binds tighter than ==, so
# without them Python evaluates '01' & df.housenumber first and the filter fails.
df.filter((df.dpc == '01') & (df.housenumber == '1')).count()

In [None]:
# Show the first 30 rows again as a pandas DataFrame.
df.limit(30).toPandas()

In [None]:
# rule 3
# Keep any dpc that contains two or more consecutive digits and overwrite every
# other value with the marker string 'test'.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
df = df.withColumn('dpc',
                   f.when(
                       f.col('dpc').rlike(r'\d{2,}'), f.col('dpc'))
                   .otherwise(f.lit('test'))
                  )


# For rows whose house number is "1", set dpc to the first run of two or more
# digits found in the house number (regexp_extract gives '' when there is no
# such run); all other rows keep their dpc.
# f.when takes the condition and the value together, and .otherwise has to be
# chained on the when(...) result, so the parentheses are placed accordingly.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
df = df.withColumn('dpc',
                   f.when(f.col('housenumber') == "1",
                          f.regexp_extract(f.col('housenumber'), r'(\d{2,})', 1))
                   .otherwise(f.col('dpc'))
                  )