# Standardizing street addresses
This script will normalize the addresses in the MoCo car crash data as much as possible and make it easier to identify common intersections.
MAKE SURE TO RUN `CAR CRASH DATA CLEANING` FIRST 

In [44]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)

I'll start by cleaning one of the spreadsheets, the 2022 data, for speed purposes before applying the cleaning functions to the master spreadsheet. 

In [46]:
df = pd.read_csv('source-data/moco-crash-2022.csv', 
                 usecols=['Collision Date','Roadway Id','Intersecting Road'],
                 dtype={'Roadway Id': 'string'},
                 parse_dates=['Collision Date']
                )
df

Unnamed: 0,Collision Date,Roadway Id,Intersecting Road
0,2022-01-07,I69N,STATE RD 37
1,2022-01-08,SR46W,DEER PARK
2,2022-01-17,W REEVES,
3,2022-01-04,THIRD,S HAWTHORNE
4,2022-01-01,S HENDERSON,E HILLSIDE
...,...,...,...
2245,2022-09-17,W COUNTRY CLUB DR,S OLD STATE ROAD 37
2246,2022-09-15,E 10TH ST,E 10TH ST
2247,2022-08-20,S COLLEGE MALL RD,E BUICK CADILLAC BLVD
2248,2022-06-09,S OLD SR 37,MORMAN RD


In [47]:
# replace NA values with empty strings to make cleaning easier
df['Roadway Id'] = df['Roadway Id'].fillna('')
df['Intersecting Road'] = df['Intersecting Road'].fillna('')

In [48]:
def replace_str(df, str1, str2):
    return df.replace(to_replace=str1, value=str2, regex=True)

In [5]:
# identify strings to manually replace 
strs_to_replace = [
    ['BLOOMINGTON IN',''],
    ['BLOOMINGTON, IN',''],
    ['S\. ', 'S '],
    ['N\. ', 'N '],
    ['W\. ', 'W '],
    ['E\.','E'],
    ['SOUTH ', 'S '],
    ['NORTH ', 'N '],
    ['WEST ', 'W '],
    ['EAST ', 'E '],
    [' AVE\.', 'AVE'],
    ['AVENUE', 'AVE'],
    ['STREET','ST'],
    ['ST\.','ST'],
    ['PIKE', 'PK'],
    ['ROAD','RD'],
    ['STATE RD','ST RD'],
    ['SR4','SR 4'],
    ['THIRD', '3RD'],
    [' 47401',''],
    ['SR ','ST RD '],
    ['46W', '46 W'],
    ['45W', '45 W'],
    ['STATE 46','ST RD 46'],
    ['S\.R\.','ST RD'],
    ['I 69','I-69'],
    ['I69','I-69'],
    ['INTERSTATE 69','I-69'],
    ['I-69 SOUTH','I-69 S'],
    ['SBOUND','S'],
    ['S BOUND','S'],
    ['S I-69','I-69 S'],
    ['I-69N','I-69 N'],
    ['JORDAN AVE','EAGLESON AVE'],
    ['JORDAN','EAGLESON AVE'],
    ['DRIVE','DR'],
    ['LANE','LN'],
    ['IN-45','ST RD 45'],
    ['W ST RD 45/46 BYPASS','W ST RD 45/46'],
    ['WMAIN','W MAIN'],
    ['ROGERS RD','ROGERS ST'],
    ['BLK ',''],
    ['N JORDAN','N EAGLESON AVE'],
    ['S JORDAN','S EAGLESON AVE'],
    ['E3RD','E 3RD'],
    ['W3RD','W 3RD'],
    ['BLOCK ',''],
    ['STRE','ST'],
    ['E E','E '],
    ['3RD AVE','3RD ST'],
    ['PARKING LOT',''],
    ['ST ST','ST'],
    ['2ND AVE','2ND ST'],
    ['W2ND','W 2ND'],
    ['E2ND','E 2ND'],
    ['E10TH','E 10TH'],
    ['45-46','45/46'],
    ['SR37','ST RD 37'],
    ['SR37S','S ST RD 37'],
    ['OLD SR37','ST RD 37'],
    ['SR37N','N ST RD 37'],
    ['OLD ST RD','ST RD'],
    ['37 BUSINESS','37'],
    ['37 HWY','37'],
    ['HWY 37','ST RD 37'],
    ['OLDSR37','ST RD 37'],
    ['OLD 37','ST RD 37'],
    ['ST RD 37S','S ST RD 37'],
    ['ST RD 37N','N ST RD 37'],
    ['37 RD','37'],
    ['ST RD 37 S','S ST RD 37'],
    ['ST RD 37 N','N ST RD 37'],
    ['OLST RD 37','ST RD 37'],
    ['BUSINESS 37','ST RD 37'],
    ['US37','ST RD 37'],
    ['ST RD 37 S HWY','S ST RD 37'],
    ['ST RD 37 N RD','N ST RD 37'],
    
]

In [49]:
clean_df = df

for string in strs_to_replace:
    clean_df = replace_str(clean_df,string[0],string[1])

In [7]:
# extract the `101` from `101 E 2ND ST` and create a new column
def address_num(road):
    address_num_exists = False
    if road:
        # get the first word of the road name
        first_word = road.split(" ")[0]
        # get the length of the road name. if there's only one number, we don't want to identify that as an address num.
        road_len = len(road.split(" "))
        if road_len > 1:
            # if the first word is all numerals, the address num exists
            address_num_exists = bool(re.search("^\d+$",first_word))
    # if it exists, return the address num. if not, return an empty string.
    return first_word if address_num_exists else ''


In [8]:
# All entries like `101 E 2ND ST` should be `E 2ND ST`
def extract_address_num(road):
    address_num_exists = False
    if road:
        first_word = road.split(" ")[0]
        road_len = len(road.split(" "))
        if road_len > 1:
            address_num_exists = bool(re.search("^\d+$",first_word))
    #         if the address num exists, remove it from the original address. else, return original address.
    return " ".join(road.split(" ")[1:]) if address_num_exists else road

In [9]:
# All entries like `S 17TH` should be `S 17TH ST`
def number_streets(road):
    if road:
        if bool(re.search('[\d]{1,2}(TH|ST|ND|RD)$',road.strip())):
            road = road.strip() + " ST"
#         if bool(re.search('^[\d]{1,2}(TH|ST|ND|RD)$',road)):
#             road = 
    return road.strip()

In [10]:
number_streets('E 3RD ')

'E 3RD ST'

In [11]:
# remove info after semicolons or colons
def colons(road):
    if road:
        road = road.split(";")[0]
        road = road.split(":")[0]
    return road 

In [12]:
clean_df['Address Number'] = clean_df['Roadway Id'].apply(address_num)
clean_df['Roadway Id'] = clean_df['Roadway Id'].apply(extract_address_num)
clean_df['Roadway Id'] = clean_df['Roadway Id'].apply(colons)

In [13]:
clean_df

Unnamed: 0,Collision Date,Roadway Id,Intersecting Road,Address Number
0,2022-01-07,I-69 N,ST RD 37,
1,2022-01-08,ST RD 46 W,DEER PARK,
2,2022-01-17,W REEVES,,
3,2022-01-04,3RD,S HAWTHORNE,
4,2022-01-01,S HENDERSON,E HILLSIDE,
...,...,...,...,...
2245,2022-09-17,W COUNTRY CLUB DR,S ST RD 37,
2246,2022-09-15,E 10TH ST,E 10TH ST,
2247,2022-08-20,S COLLEGE MALL RD,E BUICK CADILLAC BLVD,
2248,2022-06-09,S ST RD 37,MORMAN RD,


In [14]:
clean_df.sort_values('Roadway Id')['Roadway Id'].unique()

array(['', '10TH', '10TH ST', '1110', '17TH', '17TH ST', '1ST',
       "3200' S OF SMITHVILLE RD", '3RD', '3RD ST', '45', '45/46 BYPASS',
       '46', '69', '7TH', '7TH ST', 'ACCESS DR TO HILLTOP GARDENS',
       'ALEXANDER DR', 'ALLEN ST', 'ATWATER AVE', 'BAYLES',
       'BEECHWOOD DR', 'BLK. S.OLIVE ST', 'BLOOMFIELD RD', 'CATHERINE ST',
       'CENTENNIAL DR', 'CHEEKWOOD LN', 'CHURCH LN',
       'CLEAR CREEK TRAILHEAD NORTH', 'COLLEGE MALL RD', 'CONNAUGHT',
       'COUNTRY CLUB', 'CURRY PK', 'DANIELS WAY', 'DISCOVERY PARKWAY',
       'DISCOVERY PKWY', 'DITTEMORE RD', 'DORCHESTER DR', 'E 10TH ST',
       'E 11TH ST', 'E 12TH ST', 'E 13TH ST', 'E 14TH ST', 'E 15TH ST',
       'E 16TH ST', 'E 17TH', 'E 17TH ST', 'E 19TH ST', 'E 1ST',
       'E 2ND ST', 'E 3RD', 'E 3RD ST', 'E 4TH ST', 'E 6TH ST',
       'E 7TH ST', 'E 8TH ST', 'E 9TH ST', 'E ALLEN ST', 'E ATWATER',
       'E ATWATER AVE', 'E AUTO MALL RD', 'E BETHEL LN',
       'E BILL MALLORY BLVD', 'E BUICK CADILLAC BLVD', 'E BURKS DR

In [15]:
# clean_df.value_counts('Roadway Id')[:20]
clean_df

Unnamed: 0,Collision Date,Roadway Id,Intersecting Road,Address Number
0,2022-01-07,I-69 N,ST RD 37,
1,2022-01-08,ST RD 46 W,DEER PARK,
2,2022-01-17,W REEVES,,
3,2022-01-04,3RD,S HAWTHORNE,
4,2022-01-01,S HENDERSON,E HILLSIDE,
...,...,...,...,...
2245,2022-09-17,W COUNTRY CLUB DR,S ST RD 37,
2246,2022-09-15,E 10TH ST,E 10TH ST,
2247,2022-08-20,S COLLEGE MALL RD,E BUICK CADILLAC BLVD,
2248,2022-06-09,S ST RD 37,MORMAN RD,


In [16]:
clean_df['Intersection Id'] = clean_df['Roadway Id'] + ' | ' + clean_df['Intersecting Road']

In [17]:
clean_df['Intersection Id'].value_counts()[:30]

I-69 |                       60
E 3RD ST |                   37
W ST RD 45 |                 23
S COLLEGE MALL RD |          20
W 3RD ST |                   19
S ST RD 37 |                 18
N EAGLESON AVE |             15
ST RD 46 |                   15
E 10TH ST |                  14
S WALNUT ST |                12
W 3RD ST | S LIBERTY DR      11
N WALNUT ST |                11
N KINSER PK | ST RD 46       10
ST RD 45 | E 17TH ST          9
ST RD 45 | WALKWAY            9
ST RD 46 | E 3RD ST           9
S LIBERTY DR |                9
ST RD 46 | N KINSER PK        8
ST RD 45 | E 10TH ST          8
W 3RD ST | S CURRY PK         8
ST RD 45 |                    7
W 3RD ST | ST RD 37           7
S LIBERTY DR | W 3RD ST       7
E 17TH ST |                   7
I-69 S |                      7
E 3RD ST | S PETE LLIS DR     7
ST RD 37 |                    7
W ST RD 46 |                  7
ST RD 46 | EASTGATE LN        7
E 3RD ST | S FESS AVE         6
Name: Intersection Id, dtype: int64

# Applying the cleaning script to the master csv

In [18]:
master = pd.read_csv('data_output/master_crash.csv', 
                 usecols=['DateTime','Roadway Id','Intersecting Road', 'Vehicles Involved','Number Injured','Number Dead','Latitude','Longitude','Primary Factor','Reported_Location'],
#                  dtype={'Roadway Id': 'string'},
                 parse_dates=['DateTime']
                )
master

  master = pd.read_csv('data_output/master_crash.csv',


Unnamed: 0,Vehicles Involved,Number Injured,Number Dead,Roadway Id,Intersecting Road,Latitude,Longitude,Primary Factor,DateTime,Reported_Location
0,1.0,0.0,0.0,I69N,STATE RD 37,38.329723,-86.509226,ANIMAL/OBJECT IN ROADWAY,2022-01-07 05:14:00,
1,1.0,0.0,0.0,SR46W,DEER PARK,39.212153,-86.587526,ANIMAL/OBJECT IN ROADWAY,2022-01-08 08:35:00,
2,1.0,0.0,0.0,W REEVES,,39.235012,-86.676553,RAN OFF ROAD RIGHT,2022-01-17 07:33:00,
3,2.0,0.0,0.0,THIRD,S HAWTHORNE,39.156888,-86.520324,UNSAFE LANE MOVEMENT,2022-01-04 12:32:00,
4,2.0,0.0,0.0,S HENDERSON,E HILLSIDE,39.150640,-86.526960,FAILURE TO YIELD RIGHT OF WAY,2022-01-01 05:33:00,
...,...,...,...,...,...,...,...,...,...,...
74617,,0.0,0.0,,,0.000000,0.000000,IMPROPER LANE USAGE,2003-10-06 17:00:00,DUNN & WHITE LOT WEST
74618,,0.0,0.0,,,0.000000,0.000000,UNSAFE SPEED,2003-11-03 08:00:00,RED OAK & SR446
74619,,0.0,0.0,,,0.000000,0.000000,BRAKE FAILURE OR DEFECTIVE,2003-12-05 12:00:00,2ND ST & WALNUT
74620,,0.0,0.0,,,0.000000,0.000000,UNSAFE BACKING,2003-12-01 07:00:00,NINETH & NORTH


For the 2003-2012 data, there is the added complexity of the `Reported_Location` column, which was formatted differently than later data. Let's separate that into the existing `Roadway Id` and `Intersecting Road` columns so that data can also be cleaned the same way.

In [19]:
master['Reported_Location'] = master['Reported_Location'].fillna('')
master['Roadway Id'] = master['Roadway Id'].fillna('')
master['Intersecting Road'] = master['Intersecting Road'].fillna('')

In [20]:
def get_year(date):
    return pd.to_datetime(date).year

In [21]:
master['Year'] = master['DateTime'].apply(get_year)

In [22]:
def split(row):
    return row.split("&")[0]
def split1(row):
    return row.split("&")[1] if len(row.split("&")) > 1 else ''

In [23]:
master['Reported_Location'].apply(split).sample(10)

27247             
20764             
67                
48004    FAIRVIEW 
5664              
42722    EASTGATE 
57101     HARMONY 
72090       3RD ST
66925     10TH ST 
36841      DUNLAP 
Name: Reported_Location, dtype: object

In [24]:
master['Roadway Id'] = (master['Roadway Id'] + master['Reported_Location'].apply(split))

In [25]:
master['Intersecting Road'] = (master['Intersecting Road'] + master['Reported_Location'].apply(split1))

In [26]:
master[master['Year'] < 2013].sample(10)

Unnamed: 0,Vehicles Involved,Number Injured,Number Dead,Roadway Id,Intersecting Road,Latitude,Longitude,Primary Factor,DateTime,Reported_Location,Year
45388,,0.0,0.0,BUSINESS 37 RD,STONE MILL,0.0,0.0,FOLLOWING TOO CLOSELY,2010-11-01 17:00:00,BUSINESS 37 RD & STONE MILL,2010.0
59575,,1.0,0.0,3RD,COLLEGE AVE,39.16464,-86.534816,IMPROPER TURNING,2006-06-05 23:00:00,3RD & COLLEGE AVE,2006.0
49989,,0.0,0.0,SR4546,WALNUT ST,39.186368,-86.5344,DRIVER DISTRACTED - EXPLAIN IN NARRATIVE,2008-11-02 17:00:00,SR4546 & WALNUT ST,2008.0
34533,,0.0,0.0,9TH,N COLLEGE AVE,39.170544,-86.534768,RAN OFF ROAD RIGHT,2012-09-05 11:00:00,9TH & N COLLEGE AVE,2012.0
48577,,0.0,0.0,FAIRINGTON,LEONARD SPRINGS RD,39.136144,-86.582496,FAILURE TO YIELD RIGHT OF WAY,2009-06-07 15:00:00,FAIRINGTON & LEONARD SPRINGS RD,2009.0
43320,,0.0,0.0,CURRY,WEST 3RD ST,39.16448,-86.56015,FAILURE TO YIELD RIGHT OF WAY,2010-02-07 17:00:00,CURRY & WEST 3RD ST,2010.0
41325,,0.0,0.0,CURRY,SR37,0.0,0.0,ANIMAL/OBJECT IN ROADWAY,2011-11-03 21:00:00,CURRY & SR37,2011.0
41298,,0.0,0.0,SR446S,TOWER RIDGE,0.0,0.0,FAILURE TO YIELD RIGHT OF WAY,2011-10-02 12:00:00,SR446S & TOWER RIDGE,2011.0
52335,,0.0,0.0,BLOOMFIELD RD,CORY,39.1571,-86.55969,ALCOHOLIC BEVERAGES,2008-04-06 21:00:00,BLOOMFIELD RD & CORY,2008.0
71584,,0.0,0.0,7TH ST,,39.16864,-86.529486,OTHER (DRIVER) - EXPLAIN IN NARRATIVE,2003-07-06 09:00:00,7TH ST,2003.0


In [27]:
# replace NA values with empty strings to make cleaning easier
master['Roadway Id'] = master['Roadway Id'].fillna('')
master['Intersecting Road'] = master['Intersecting Road'].fillna('')

In [28]:
clean_master = master
for string in strs_to_replace:
    clean_master = replace_str(clean_master,string[0],string[1])

In [29]:
# remove all info in parentheses in street names
def remove_parens(row):
    return " ".join(re.sub('\(.*\)', '', row).split())

In [30]:
clean_master['Roadway Id'] = clean_master['Roadway Id'].apply(extract_address_num)
clean_master['Roadway Id'] = clean_master['Roadway Id'].apply(colons)
clean_master['Roadway Id'] = clean_master['Roadway Id'].apply(number_streets)
clean_master['Roadway Id'] = clean_master['Roadway Id'].apply(remove_parens)

In [31]:
master[master['Roadway Id'].str.contains('2ND')]['Roadway Id'].value_counts()[:20]

  2ND ST            588
2ND ST              446
2ND                 364
W 2ND ST             97
E 2ND ST             42
E 2ND ST             42
E 2ND                38
W 2ND                35
2ND ST               35
E 2ND                12
WEST 2ND ST          12
EAST 2ND ST          11
EAST 2ND ST           8
2ND ST ST             6
2ND SANDERS           6
2ND                   5
2ND STREET ST         3
2ND AVE SANDERS       3
EAST 2ND              3
2ND STREET            2
Name: Roadway Id, dtype: int64

In [32]:
clean_master[clean_master['Roadway Id'].str.contains('37')]['Roadway Id'].value_counts()[:20]

ST RD 37                          3861
S ST RD 37                         683
N ST RD 37                         506
W ST RD 37                          52
S ST RD 37 RD                       15
OLDST RD 37                          8
S ST RD 37 HWY                       8
ST RD 37S                            4
N ST RD 37 RD                        4
CR37S                                4
OFF RAMP FROM NB ST RD 37 RAMP       3
S OLD ST RD 37                       3
N S ST RD 37TATE                     3
S ST RD 37 RAMP                      3
State Road 37 45                     3
State Road 37 ramp                   3
N ST RD 37 BOUND                     2
State Road 37 Ramp                   2
ST RD 37W                            2
N ST RD 37 HWY                       2
Name: Roadway Id, dtype: int64

In [33]:
clean_master['Roadway Id'].value_counts()[:50]

3RD ST               5030
ST RD 37             3861
ST RD 46             2594
10TH ST              2454
ST RD 45             1791
17TH ST              1664
E 3RD ST             1662
ST RD 45/46          1501
2ND ST               1452
7TH ST                988
ST RD 48              743
S ST RD 37            683
E 10TH ST             642
W 3RD ST              611
CURRY PK              593
S WALNUT ST           553
KIRKWOOD AVE          545
11TH ST               536
EAGLESON AVE          526
WALNUT ST             510
N ST RD 37            506
ST RD 446             498
6TH ST                495
4TH ST                471
ATWATER AVE           462
8TH ST                426
COLLEGE MALL RD       422
1ST ST                411
ROGERS ST             380
N WALNUT ST           376
BLOOMFIELD RD         368
COLLEGE AVE           347
WALNUT                342
14TH ST               340
9TH ST                336
FAIRFAX RD            332
DUNN                  322
13TH ST               312
E 17TH ST   

In [34]:
clean_master_min = clean_master[['Number Injured','Number Dead','Roadway Id','Intersecting Road','Latitude','Longitude','Primary Factor','DateTime']]


In [35]:
clean_master_min.to_csv(r'./data_output/master_crash_clean.csv', index=False)

## A brief look at the intersections with the most crashes

In [36]:
clean_master = clean_master[clean_master['Roadway Id'] != '']

In [37]:
master_intersections = clean_master[clean_master['Intersecting Road'] != '']

In [38]:
# what proportion of master crashes with a Roadway Id listed had an Intersecting Road listed?
master_intersections.shape[0] / clean_master.shape[0]

0.8366524228009352

In [39]:
master_intersections[master_intersections['Roadway Id'].str.contains("EAGLESON")]

Unnamed: 0,Vehicles Involved,Number Injured,Number Dead,Roadway Id,Intersecting Road,Latitude,Longitude,Primary Factor,DateTime,Reported_Location,Year
22,2.0,0.0,0.0,N EAGLESON AVE,E LAW LN,39.173462,-86.515488,FOLLOWING TOO CLOSELY,2022-01-19 08:55:00,,2022.0
74,2.0,0.0,0.0,N EAGLESON AVE,E 10TH ST,39.171215,-86.515681,FAILURE TO YIELD RIGHT OF WAY,2022-04-19 18:40:00,,2022.0
102,2.0,0.0,0.0,S EAGLESON AVE,E ATWATER AVE,0.000000,0.000000,FAILURE TO YIELD RIGHT OF WAY,2022-02-27 12:30:00,,2022.0
103,1.0,1.0,0.0,N EAGLESON AVE,N FISHER CT,39.183754,-86.516011,IMPROPER PASSING,2022-05-27 16:00:00,,2022.0
112,2.0,0.0,0.0,N EAGLESON AVE,E 10TH ST,39.171810,-86.515652,UNSAFE LN MOVEMENT,2022-05-06 17:55:00,,2022.0
...,...,...,...,...,...,...,...,...,...,...,...
74530,,0.0,0.0,EAGLESON AVE,READ W,0.000000,0.000000,UNSAFE BACKING,2003-11-03 22:00:00,EAGLESON AVE & READ W,2003.0
74536,,1.0,0.0,EAGLESON AVE,LIBRARY LOT,0.000000,0.000000,OTHER (DRR) - EXPLAIN IN NARRATIVE,2003-12-05 11:00:00,EAGLESON AVE & LIBRARY LOT,2003.0
74572,,0.0,0.0,EAGLESON AVE,MUSICALS ARTS CENTER,0.000000,0.000000,UNSAFE BACKING,2003-03-06 11:00:00,EAGLESON AVE & MUSICALS ARTS CENTER,2003.0
74574,,0.0,0.0,EAGLESON AVE,KAPPA DEKTA,0.000000,0.000000,OTHER (DRR) - EXPLAIN IN NARRATIVE,2003-05-02 12:00:00,EAGLESON AVE & KAPPA DEKTA,2003.0


In [43]:
master_intersections['Intersecting Road'].value_counts()[:50]

WALNUT ST          1322
 WALNUT ST          961
 ST RD 45           893
 ST RD 37           892
 WALNUT             786
 ST RD 46           761
 EAGLESON AVE       708
COLLEGE AVE         655
 ST RD 45 W         632
 ST RD 46 W         593
 COLLEGE AVE        558
 ST RD 46E          529
 VERNAL             509
 S ST RD 37         481
 N ST RD 37         445
 WOODLAWN           407
EAGLESON AVE        379
 W 3RD ST           372
DUNN ST             371
 ROGERS             335
VERNAL PK           330
ST RD 45            328
 ROGERS ST          326
CURRY PK            305
INDIANA AVE         298
 INDIANA AVE        296
ST RD 37            290
ROGERS ST           284
LIBERTY DR          280
 LINCOLN            280
 INDIANA            275
 WASHINGTON         270
WOODLAWN AVE        263
ST RD 48            258
 COLLEGE            257
 3RD ST             253
ST RD 46            250
 ST RD 45E          246
10TH ST             245
 S WALNUT ST        233
 ST RD 48           225
TAPP RD         