# SC DOT Data and the US Census - How has traffic in South Carolina changed?

Reasons for looking at data

Where the data came from 

What the data looks like 

In [1]:
# import required libraries
import pandas as pd
import numpy as np
from simpledbf import Dbf5
import os

PyTables is not installed. No support for HDF output.


In [2]:
# change into the directory where the shp files live
os.chdir("./shp_files")
# verify we're in the right place
os.getcwd()

'/home/james/PYTHON/PythonProjects/Upstate/shp_files'

In [3]:
# read GIS dbf data into dataframes, one file for each year between 2009 and 2018
shp_dfs = {}
for root, dirs, files in os.walk(os.getcwd()):
    for file in files:
        if file.endswith(".dbf"):
            # print(file.split('.')[0])
            dbf = Dbf5(os.path.join(root, file))
            df = dbf.to_dataframe()
            shp_dfs[file.split('.')[0]] = df

In [4]:
# check if the columns in the dfs match - convert columns to sets and check set intersection
col_sets = map(lambda x: set(x.columns), shp_dfs.values())

In [5]:
# unpack the list of column sets into set.intersection, which returns common elements in set
common_cols = set.intersection(*col_sets)

In [6]:
# check to see what's common between the dfs
common_cols

{'ID1'}

In [7]:
#uh oh! there's only 1 column common between all. Let's check to see what the actual columns are named and how many there are
for df in shp_dfs.values():
    print(df.columns, len(df.columns))

Index(['CountyNumb', 'RouteTypeN', 'RouteType1', 'RouteNumbe', 'MeterMileP',
       'BeginMileP', 'EndMilePoi', 'StationNum', 'Termini', 'FactoredAA',
       'FactoredA1', 'MapLRS', 'Status1', 'ID1', 'RouteAuxil', 'CountyName',
       'Long', 'Lat'],
      dtype='object') 18
Index(['Station_Nu', 'Route_LRS', 'County_ID', 'Route_Type', 'Route_Numb',
       'Route_Auxi', 'Descriptio', 'Count', 'Year', 'ID1'],
      dtype='object') 10
Index(['STATION_NU', 'MILE_POINT', 'ROUTE_LRS', 'MAP_TYPE', 'LATITUDE',
       'LONGITUDE', 'COUNTY_ID', 'ROUTE_TYPE', 'ROUTE_NUMB', 'ROUTE_AUX',
       'COUNT', 'YEAR', 'DESCRIPTIO', 'ID1', 'GMRotation'],
      dtype='object') 15
Index(['CountyName', 'RouteTypeN', 'RouteNumbe', 'RouteAuxil', 'MeterMileP',
       'BegiNMileP', 'EndMilePoi', 'StationNum', 'Termini', 'FactoredAA',
       'FactoredA1', 'MapLRS', 'Status1', 'ID1', 'Latitude', 'Longitude'],
      dtype='object') 16
Index(['STATION', 'MILE_POINT', 'ROUTE_LRS', 'MAP_TYPE', 'ID1', 'LATITUDE',
      

In [8]:
# maybe we'll get better results if we do some simple string formatting first
for df in shp_dfs.values():
    df.columns = [c.replace('_', '').lower().strip() for c in df.columns]

In [9]:
# check set intersection again
col_sets = list(map(lambda x: set(x.columns), shp_dfs.values()))
set.intersection(*col_sets)

{'id1'}

In [10]:
for year, df in shp_dfs.items():
    print(year, df.columns, len(df.columns))

2018 Index(['countynumb', 'routetypen', 'routetype1', 'routenumbe', 'metermilep',
       'beginmilep', 'endmilepoi', 'stationnum', 'termini', 'factoredaa',
       'factoreda1', 'maplrs', 'status1', 'id1', 'routeauxil', 'countyname',
       'long', 'lat'],
      dtype='object') 18
2013 Index(['stationnu', 'routelrs', 'countyid', 'routetype', 'routenumb',
       'routeauxi', 'descriptio', 'count', 'year', 'id1'],
      dtype='object') 10
2010 Index(['stationnu', 'milepoint', 'routelrs', 'maptype', 'latitude',
       'longitude', 'countyid', 'routetype', 'routenumb', 'routeaux', 'count',
       'year', 'descriptio', 'id1', 'gmrotation'],
      dtype='object') 15
2016 Index(['countyname', 'routetypen', 'routenumbe', 'routeauxil', 'metermilep',
       'beginmilep', 'endmilepoi', 'stationnum', 'termini', 'factoredaa',
       'factoreda1', 'maplrs', 'status1', 'id1', 'latitude', 'longitude'],
      dtype='object') 16
2015 Index(['station', 'milepoint', 'routelrs', 'maptype', 'id1', 'latitude'

In [11]:
# dang. Still only 1 match. Time to do some brute force mapping.
col_mapping_dict = {
    **dict.fromkeys(['station', 'stationnu', 'stationnum'], 'station_id'),
    **dict.fromkeys(['latitude', 'lat'], 'latitude'),
    **dict.fromkeys(['longitude', 'long'], 'longitude'), 
    **dict.fromkeys(['aadtyr', 'year', 'factored1', 'factoreda1'], 'year'),
    **dict.fromkeys(['routelrs', 'maplrs'], 'route_identifier'),
    **dict.fromkeys(['termini', 'descriptio'], 'route_leg_descrip'),
    # **dict.fromkeys(['beginmilep', 'beginmile'], 'route_leg_beginmile'),
    # **dict.fromkeys(['endmilepo', 'endmilepoi'], 'route_leg_endmile'),
    **dict.fromkeys(['routetype', 'rtetype', 'routetypen'], 'route_type_id'),   # has to be a numeric column as well, some collision here
    **dict.fromkeys(['rtenum', 'rtenumb', 'routenumb', 'routenum', 'routenumbe'], 'route_number'),
    **dict.fromkeys(['county', 'countyname', 'countynam'], 'county_name'),
    # **dict.fromkeys(['countyid', 'countynumb'], 'county_id'),
    **dict.fromkeys(['aadt', 'factoreda', 'count', 'factoredaa'], 'average_daily_traffic'),
    **dict.fromkeys(['id1'], 'row_number')
}
col_mapping_dict

{'station': 'station_id',
 'stationnu': 'station_id',
 'stationnum': 'station_id',
 'latitude': 'latitude',
 'lat': 'latitude',
 'longitude': 'longitude',
 'long': 'longitude',
 'aadtyr': 'year',
 'year': 'year',
 'factored1': 'year',
 'factoreda1': 'year',
 'routelrs': 'route_identifier',
 'maplrs': 'route_identifier',
 'termini': 'route_leg_descrip',
 'descriptio': 'route_leg_descrip',
 'routetype': 'route_type_id',
 'rtetype': 'route_type_id',
 'routetypen': 'route_type_id',
 'rtenum': 'route_number',
 'rtenumb': 'route_number',
 'routenumb': 'route_number',
 'routenum': 'route_number',
 'routenumbe': 'route_number',
 'county': 'county_name',
 'countyname': 'county_name',
 'countynam': 'county_name',
 'aadt': 'average_daily_traffic',
 'factoreda': 'average_daily_traffic',
 'count': 'average_daily_traffic',
 'factoredaa': 'average_daily_traffic',
 'id1': 'row_number'}

In [12]:
# rename columns as per mapping dict
shp_dfs_renamed = {year: df.rename(columns=col_mapping_dict) for year, df in shp_dfs.items()}
# drop columns not mapped
shp_dfs_renamed = {year: df.drop([c for c in df.columns if c not in col_mapping_dict.values()], axis=1) for year, df in shp_dfs_renamed.items()}
# drop any duplicated columns
shp_dfs_renamed = {year: df.loc[:, ~df.columns.duplicated()] for year, df in shp_dfs_renamed.items()}

In [13]:
# rename some columns that aren't right
shp_dfs_renamed['2010'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2016'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2015'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2017'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2011'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2014'].rename(columns={'route_type_id':'route_type'}, inplace=True)
shp_dfs_renamed['2017'].rename(columns={'county_name':'county_id'}, inplace=True)
shp_dfs_renamed['2009'].drop('county_name', axis=1, inplace=True)
shp_dfs_renamed['2012'].drop('county_name', axis=1, inplace=True)

In [14]:
# records should be unique across station_id, route_identifier, and route_number - do some verification
for year, df in shp_dfs_renamed.items():
    temp_df = df.groupby(['station_id', 'route_identifier', 'route_number']).size()
    print(year,'\n', temp_df.loc[temp_df > 1.0])

2018 
 station_id  route_identifier  route_number
101.0       32020000100N      1.0             2
133.0       28020000100N      1.0             2
173.0       10020007800E      78.0            2
192.0       30040006600E      66.0            3
            36040006600E      66.0            2
278.0       30090028900N      289.0           2
279.0       15070003600E      36.0            2
306.0       24070005000E      50.0            2
321.0       21070013200E      132.0           2
411.0       14070010400E      104.0           2
413.0       02070004600E      46.0            2
415.0       13090015900N      159.0           2
426.0       10070004600E      46.0            2
429.0       10070010300N      103.0           2
463.0       39070030400E      304.0           2
492.0       16070016400E      164.0           2
495.0       02070018200E      182.0           2
498.0       04070009800E      98.0            2
509.0       02070030900N      309.0           2
551.0       42090056300N      563.0   

In [15]:
# we can spot check some of these to see what's up with the records
def check_records(year, station_id, route_identifier):
    mask = (shp_dfs_renamed[year]['station_id'] == station_id) & (shp_dfs_renamed[year]['route_identifier'] == route_identifier)
    return shp_dfs_renamed[year].loc[mask]

check_records('2014', 969.0, '42090019100N')

Unnamed: 0,station_id,route_identifier,row_number,latitude,longitude,county_name,route_type,route_number,route_leg_descrip,year,average_daily_traffic
10190,969.0,42090019100N,6535,35:1:59.272,-81:55:35.874,Spartanburg,S-,191.0,"S- 191, S- 781 TO S- 56",2014.0,1450
10191,969.0,42090019100N,6535,35:1:59.272,-81:55:35.874,Spartanburg,L-,191.0,"S- 191, S- 781 TO S- 56",2014.0,1450
10192,969.0,42090019100N,6535,35:1:59.272,-81:55:35.874,Spartanburg,L-,191.0,"S- 191, S- 781 TO S- 56",2014.0,1450


In [16]:
# eyeballing the records reveals that they are duplicated rows. we'll take the first from every group
for year, df in shp_dfs_renamed.items():
    temp_df = df.groupby(['station_id', 'route_identifier', 'route_number']).head(1)
    shp_dfs_renamed[year] = temp_df

In [17]:
#check for dupes again
for year, df in shp_dfs_renamed.items():
    temp_df = df.groupby(['station_id', 'route_identifier', 'route_number']).size()
    print(year,'\n', temp_df.loc[temp_df > 1.0])

2018 
 Series([], dtype: int64)
2013 
 Series([], dtype: int64)
2010 
 Series([], dtype: int64)
2016 
 Series([], dtype: int64)
2015 
 Series([], dtype: int64)
2009 
 Series([], dtype: int64)
2017 
 Series([], dtype: int64)
2012 
 Series([], dtype: int64)
2011 
 Series([], dtype: int64)
2014 
 Series([], dtype: int64)


In [18]:
# verify we took the first row in the duplicated group
check_records('2014', 969.0, '42090019100N')

Unnamed: 0,station_id,route_identifier,row_number,latitude,longitude,county_name,route_type,route_number,route_leg_descrip,year,average_daily_traffic
10190,969.0,42090019100N,6535,35:1:59.272,-81:55:35.874,Spartanburg,S-,191.0,"S- 191, S- 781 TO S- 56",2014.0,1450


In [19]:
# set index of all dfs to the unique identifiers
for year, df in shp_dfs_renamed.items():
    temp_df = df.set_index(['station_id', 'route_identifier', 'route_number'])
    shp_dfs_renamed[year] = temp_df

In [32]:
# fill missing year value in any dfs
for year, df in shp_dfs_renamed.items():
    df['year'] = df.year.fillna(year)

In [33]:
# 2018 and 2017 dfs don't have all the columns, so use the 2016 df to standardize fields
# check how many nas in each columns in the 2016 df
shp_dfs_renamed['2016'].isna().sum()

county_name                0
route_type                 0
route_leg_descrip          0
average_daily_traffic      0
year                       0
row_number                 0
latitude                 153
longitude                153
dtype: int64

In [34]:
# update 2016 latitudes/longitudes with 2018 latitudes/longitudes where stationid, route_id, and route_number match (index of each df)

shp_dfs_renamed['2016'].update(shp_dfs_renamed['2018'][['latitude', 'longitude']])

# still some nulls - try the 2015 df
shp_dfs_renamed['2016'].update(shp_dfs_renamed['2015'][['latitude', 'longitude']])

In [35]:
# check nas again
shp_dfs_renamed['2016'].isna().sum()

county_name                0
route_type                 0
route_leg_descrip          0
average_daily_traffic      0
year                       0
row_number                 0
latitude                 153
longitude                153
dtype: int64

In [36]:
# 2018 and 2017 dfs don't have all the columns, so use the 2016 df to standardize fields
# we're standardizing data across the five following columns
cols_to_update = ['route_type', 'route_leg_descrip', 'latitude', 'longitude', 'county_name']
update_df = shp_dfs_renamed['2016'][cols_to_update]

for year, df in shp_dfs_renamed.items():
    df.update(update_df)

In [37]:
# stack all the data frames together
traffic_df = pd.concat(shp_dfs_renamed.values(), sort=True, axis=0).reset_index()

In [38]:
# eyeball the data 
traffic_df.head()

Unnamed: 0,station_id,route_identifier,route_number,average_daily_traffic,county_id,county_name,latitude,longitude,route_leg_descrip,route_type,route_type_id,row_number,year
0,101.0,01020017800E,178.0,4300.0,,ABBEVILLE,34.41979,-82.38521,County Line - ANDERSON TO S- 166 (DRAKE RD),,2.0,1,2018
1,103.0,01020017800E,178.0,4600.0,,ABBEVILLE,34.38344,-82.35325,S- 166 (DRAKE RD) TO SC 184 (N MAIN ST),,2.0,2,2018
2,105.0,01020017800E,178.0,3600.0,,ABBEVILLE,34.37099,-82.33765,SC 184 (N MAIN ST) TO County Line - GREENWOOD,,2.0,3,2018
3,109.0,01040002000E,20.0,4900.0,,ABBEVILLE,34.17888,-82.38016,"SC 203 (WASHINGTON ST), L- 20, L- 980 TO SC 71",,4.0,4,2018
4,111.0,01040002000E,20.0,2200.0,,ABBEVILLE,34.18359,-82.38115,SC 71 TO L- 170,,4.0,5,2018


In [39]:
# drop columns I actually don't want
traffic_df = traffic_df.drop(['county_id', 'route_type_id', 'row_number'], axis=1)

In [40]:
traffic_df.head()

Unnamed: 0,station_id,route_identifier,route_number,average_daily_traffic,county_name,latitude,longitude,route_leg_descrip,route_type,year
0,101.0,01020017800E,178.0,4300.0,ABBEVILLE,34.41979,-82.38521,County Line - ANDERSON TO S- 166 (DRAKE RD),,2018
1,103.0,01020017800E,178.0,4600.0,ABBEVILLE,34.38344,-82.35325,S- 166 (DRAKE RD) TO SC 184 (N MAIN ST),,2018
2,105.0,01020017800E,178.0,3600.0,ABBEVILLE,34.37099,-82.33765,SC 184 (N MAIN ST) TO County Line - GREENWOOD,,2018
3,109.0,01040002000E,20.0,4900.0,ABBEVILLE,34.17888,-82.38016,"SC 203 (WASHINGTON ST), L- 20, L- 980 TO SC 71",,2018
4,111.0,01040002000E,20.0,2200.0,ABBEVILLE,34.18359,-82.38115,SC 71 TO L- 170,,2018


In [41]:
# now fill in cols that are still na by unique id - NOT the average traffic column (main data we care about)
# fill nas by group
cols_to_fill = ['county_name', 'latitude', 'longitude', 'route_leg_descrip', 'route_type']
for col in cols_to_fill:
    traffic_df[col] = traffic_df.groupby(['station_id', 'route_identifier', 'route_number'])[col].ffill().bfill()

In [42]:
# check for any nas remaining
traffic_df.isna().sum()

station_id                0
route_identifier          0
route_number              0
average_daily_traffic    69
county_name               0
latitude                  0
longitude                 0
route_leg_descrip         0
route_type                0
year                      0
dtype: int64

In [43]:
# drop remaining nas - no data!
traffic_df = traffic_df.dropna()

In [44]:
# get the pct change year over year for average daily traffic
traffic_df['traffic_yearly_pct_change'] = traffic_df \
    .sort_values(['station_id', 'route_identifier', 'route_number', 'year']) \
    .groupby(['station_id', 'route_identifier', 'route_number']) \
    .average_daily_traffic \
    .pct_change()

In [45]:
traffic_df = traffic_df.sort_values(['station_id', 'route_identifier', 'route_number', 'year'])

In [46]:
traffic_df[traffic_df['traffic_yearly_pct_change'] > 0.2]

Unnamed: 0,station_id,route_identifier,route_number,average_daily_traffic,county_name,latitude,longitude,route_leg_descrip,route_type,year,traffic_yearly_pct_change
46302,100.0,04020002900N,29.0,4000.0,ANDERSON,34.35590,-82.81412,State Line - GEORGIA TO SC 187 (HIGHWAY 187 S),US,2015,0.290323
69660,100.0,08020001702N,17.0,53100.0,BERKELEY,33.03701,-80.15366,County Line - DORCHESTER TO I- 26,US,2017,0.288835
107867,100.0,28040001200E,12.0,4100.0,KERSHAW,34.11999,-80.77968,County Line - RICHLAND TO S- 47 (FORT JACKSON RD),SC,2014,0.413793
46303,101.0,04020002900N,29.0,2100.0,ANDERSON,34.40918,-82.79075,SC 187 (HIGHWAY 187 S) TO US 29 BUS (HIGHWAY ...,US,2015,0.312500
35410,101.0,07020001700N,17.0,12400.0,BEAUFORT,32.64004,-80.85637,County Line - JASPER TO US 17 ALT (CASTLE HALL...,US,2016,0.441860
...,...,...,...,...,...,...,...,...,...,...,...
50675,2435.0,23010018500N,185.0,5000.0,GREENVILLE,34.77331,-82.44549,SC 153 (153 HWY) TO I- 85,I-,2015,0.250000
95757,2439.0,23010018500N,185.0,16700.0,GREENVILLE,34.80352,-82.42446,"US 25 (WHITE HORSE RD) TO , SC 20",I-,2011,0.590476
9271,2489.0,42010058500S,585.0,32200.0,SPARTANBURG,34.97494,-81.94188,"US 176 CO2 (N CHURCH ST), SC 9 TO US 221 (WHIT...",SC,2018,0.201493
9272,2491.0,42010058500S,585.0,31200.0,SPARTANBURG,34.97158,-81.93761,US 221 (WHITNEY RD) TO US 176 (N PINE ST),SC,2018,0.214008


In [48]:
os.chdir('..')

In [96]:
os.getcwd()

sales_df = pd.read_csv('Sale_Counts_Zip.csv')

In [97]:
sales_df.head()

Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,...,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,seasAdj
0,61639,10025,New York,1,,,,,,,...,76.0,33.0,47.0,56.0,35.0,70.0,78.0,66.0,63.0,0
1,84654,60657,Illinois,2,,,,,,,...,91.0,77.0,113.0,157.0,189.0,165.0,186.0,141.0,152.0,0
2,61637,10023,New York,3,,,,,,,...,80.0,45.0,63.0,45.0,66.0,85.0,79.0,90.0,95.0,0
3,91982,77494,Texas,4,56.0,71.0,84.0,95.0,116.0,86.0,...,86.0,112.0,186.0,218.0,200.0,204.0,245.0,226.0,,0
4,84616,60614,Illinois,5,,,,,,,...,75.0,85.0,144.0,163.0,219.0,209.0,204.0,196.0,173.0,0


In [98]:
sales_df = sales_df.loc[sales_df.StateName == 'South Carolina']
sales_df = sales_df.drop(['RegionID', 'StateName', 'SizeRank'], axis=1)
sales_df.rename(columns={'RegionName': 'ZipCode'}, inplace=True)
sales_df = sales_df.set_index('ZipCode')

In [99]:
sales_df.head()

Unnamed: 0_level_0,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,2008-09,2008-10,2008-11,2008-12,...,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,seasAdj
ZipCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29732,,,,,,,,,,,...,75.0,75.0,134.0,140.0,125.0,136.0,128.0,138.0,151.0,0
29072,63.0,89.0,75.0,70.0,76.0,65.0,68.0,47.0,42.0,43.0,...,88.0,88.0,126.0,132.0,192.0,156.0,180.0,169.0,142.0,0
29730,,,,,,,,,,,...,71.0,60.0,91.0,95.0,116.0,111.0,87.0,100.0,107.0,0
29464,58.0,78.0,73.0,88.0,81.0,73.0,57.0,69.0,49.0,35.0,...,63.0,91.0,94.0,138.0,116.0,135.0,146.0,141.0,,0
29681,,,,,,,,,,,...,99.0,118.0,150.0,129.0,165.0,136.0,151.0,150.0,161.0,0


In [100]:
# stack all years of sales into one column
sales_df = sales_df.stack().reset_index()
sales_df.columns = ['ZipCode', 'YearMonth', 'Sales']
sales_df


Unnamed: 0,ZipCode,YearMonth,Sales
0,29732,2014-08,127.0
1,29732,2014-09,145.0
2,29732,2014-10,124.0
3,29732,2014-11,121.0
4,29732,2014-12,124.0
...,...,...,...
51654,820,2019-06,0.0
51655,820,2019-07,0.0
51656,820,2019-08,0.0
51657,820,2019-09,0.0


In [103]:
sales_df['Year'] = sales_df.YearMonth.apply(lambda x: x.split('-')[0])
sales_df.head()

Unnamed: 0,ZipCode,YearMonth,Sales,Year
0,29732,2014-08,127.0,2014
1,29732,2014-09,145.0,2014
2,29732,2014-10,124.0,2014
3,29732,2014-11,121.0,2014
4,29732,2014-12,124.0,2014


In [105]:
yearly_sales = sales_df.groupby(['ZipCode', 'Year']).Sales.sum()

Zip to Lat/Long xref

In [107]:
zip_xref = pd.read_csv('us-zip-code-latitude-and-longitude.csv')

In [108]:
zip_xref

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint,Unnamed: 8
0,29607,Greenville,SC,34.825592,-82.34099,-5,1,34.825592,-82.34099
1,29164,Wagener,SC,33.659078,-81.40845,-5,1,33.659078,-81.40845
2,29325,Clinton,SC,34.470115,-81.86761,-5,1,34.470115,-81.86761
3,29520,Cheraw,SC,34.688620,-79.92315,-5,1,34.688620,-79.92315
4,29615,Greenville,SC,34.866801,-82.31739,-5,1,34.866801,-82.31739
...,...,...,...,...,...,...,...,...,...
549,29592,Sellers,SC,34.283207,-79.47272,-5,1,34.283207,-79.47272
550,29646,Greenwood,SC,34.169781,-82.15474,-5,1,34.169781,-82.15474
551,29142,Santee,SC,33.462378,-80.50903,-5,1,33.462378,-80.50903
552,29449,Hollywood,SC,32.715745,-80.26738,-5,1,32.715745,-80.26738


In [111]:
merged_sales = yearly_sales.reset_index().merge(zip_xref, how='left', left_on='ZipCode', right_on='Zip')

In [116]:
merged_sales = merged_sales.loc[merged_sales.ZipCode.apply(lambda x: len(str(x)) >= 5)]

In [119]:
merged_sales = merged_sales.drop(['Zip', 'Timezone', 'Daylight savings time flag', 'geopoint', 'Unnamed: 8'], axis=1)

In [123]:
merged_sales = merged_sales.loc[merged_sales.Year != 'seasAdj']

In [124]:
merged_sales

Unnamed: 0,ZipCode,Year,Sales,City,State,Latitude,Longitude
13,29001,2008,10.0,Alcolu,SC,33.769930,-80.17278
14,29001,2009,10.0,Alcolu,SC,33.769930,-80.17278
15,29001,2010,11.0,Alcolu,SC,33.769930,-80.17278
16,29001,2011,8.0,Alcolu,SC,33.769930,-80.17278
17,29001,2012,9.0,Alcolu,SC,33.769930,-80.17278
...,...,...,...,...,...,...,...
4902,29945,2015,26.0,Yemassee,SC,32.681058,-80.83348
4903,29945,2016,14.0,Yemassee,SC,32.681058,-80.83348
4904,29945,2017,23.0,Yemassee,SC,32.681058,-80.83348
4905,29945,2018,17.0,Yemassee,SC,32.681058,-80.83348


In [None]:
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km