In [133]:
import pandas as pd
import altair as alt
from sklearn.cluster import DBSCAN

In [134]:
alt.themes.enable('fivethirtyeight')

ThemeRegistry.enable('fivethirtyeight')

In [135]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [136]:
# this goes at the top of the notebook
def to_altair(x):    
    x.to_json('chart.json', orient='records', date_format='iso')
    return('chart.json')
pd.DataFrame.to_altair = to_altair # attach to DataFrame objects

In [137]:
# load the PLD for Q1
q1_df = pd.read_csv('data/202302 BIE Technical Interview PLD1.csv')

In [138]:
# initial look at data
q1_df.head()

Unnamed: 0,OrderCount,OriginZip,DestinationZip,Length,Width,Height,WeightOunces
0,1,12409,60197,12.0,9.0,1.0,6.0
1,2,49782,13144,9.0,8.0,6.0,24.0
2,1,42323,80920,12.0,9.0,1.0,6.0
3,1,17072,9499,12.0,9.0,5.0,67.0
4,1,49782,72068,9.0,6.0,4.0,11.0


In [139]:
# see shape of data
q1_df.shape

(1580937, 7)

In [140]:
# view datatypes
q1_df.dtypes

OrderCount          int64
OriginZip           int64
DestinationZip      int64
Length            float64
Width             float64
Height            float64
WeightOunces      float64
dtype: object

In [141]:
# convert zip codes to string so matplotlib recognizes it as categorical instead of numerical
q1_df['OriginZip'] = q1_df['OriginZip'].astype(str)
q1_df['DestinationZip'] = q1_df['DestinationZip'].astype(str)

In [142]:
# looks like leading 0s were dropped from zip codes at some point in exporting or loading the csv, let's fill in these leading 0s. Some are missing 2 leadings 0s and others only one.

print(len(q1_df[q1_df['DestinationZip'].str.len() == 3]), len(q1_df[q1_df['DestinationZip'].str.len() == 4]))

87 127817


In [143]:
# create function to apply to leading 0s to columns
def fill_leading_zeros(x):
    x = x.zfill(5)
    return x

In [144]:
q1_df['OriginZip'] = q1_df['OriginZip'].apply(fill_leading_zeros)
q1_df['DestinationZip'] = q1_df['DestinationZip'].apply(fill_leading_zeros)

In [145]:
q1_df

Unnamed: 0,OrderCount,OriginZip,DestinationZip,Length,Width,Height,WeightOunces
0,1,12409,60197,12.0,9.0,1.0,6.0
1,2,49782,13144,9.0,8.0,6.0,24.0
2,1,42323,80920,12.0,9.0,1.0,6.0
3,1,17072,09499,12.0,9.0,5.0,67.0
4,1,49782,72068,9.0,6.0,4.0,11.0
...,...,...,...,...,...,...,...
1580932,1,12409,40964,11.0,4.0,4.0,10.0
1580933,2,27152,14174,12.0,9.0,5.0,55.0
1580934,1,90061,07963,8.0,7.0,5.0,13.0
1580935,1,36039,93274,14.0,12.0,3.0,42.0


In [146]:
# view the distribution of origin zip codes
origin_zips = q1_df['OriginZip'].value_counts(ascending=False).reset_index()

In [147]:
origin_zips

Unnamed: 0,OriginZip,count
0,17072,214441
1,28159,180970
2,36039,179331
3,42323,113596
4,12409,78708
5,90061,71642
6,49782,69506
7,2072,65397
8,27045,58380
9,27152,57055


In [148]:
alt.Chart(origin_zips).mark_bar().encode(
    x=alt.X('OriginZip:N', axis=alt.Axis(title='Origin Zip Code'), sort=None),
    y=alt.Y('count:Q', axis=alt.Axis(title='Count')),
    tooltip=['OriginZip', 'count']
).configure_axisX(
    labels=False
).properties(
    width=2500,
    height=500,
    title='Distribution of Origin Zip Codes'
).configure_title(
    anchor='middle'
)

In [149]:
# view the distribution of destination zip codes. Convert Zip code to string so matplotlib recognizes it as categorical
destination_zips = q1_df['DestinationZip'].value_counts(ascending=False).reset_index()

In [150]:
destination_zips

Unnamed: 0,DestinationZip,count
0,27529,1089
1,70645,819
2,21015,776
3,32702,775
4,80128,746
...,...,...
22375,29653,1
22376,48886,1
22377,77963,1
22378,30277,1


In [151]:
alt.Chart(destination_zips).mark_bar().encode(
    x=alt.X('DestinationZip:N', axis=alt.Axis(title='Destination Zip Code'), sort=None),
    y=alt.Y('count:Q', axis=alt.Axis(title='Count')),
    tooltip=['DestinationZip', 'count']
).configure_axisX(
    labels=False
).properties(
    width=2500,
    height=500,
    title='Distribution of Destination Zip Codes'
).configure_title(
    anchor='middle'
)

In [152]:
# group by orign an destination zip and take sum of total packages
grouped_q1_df = q1_df.groupby(by=['OriginZip', 'DestinationZip']).sum().reset_index()

In [153]:
grouped_q1_df

Unnamed: 0,OriginZip,DestinationZip,OrderCount,Length,Width,Height,WeightOunces
0,02072,01003,2,29.0,19.0,5.0,42.0
1,02072,01005,3,10.0,8.0,3.0,30.0
2,02072,01008,3,31.0,16.0,6.0,17.0
3,02072,01010,3,46.0,26.0,7.0,87.0
4,02072,01012,1,10.0,6.0,1.0,7.0
...,...,...,...,...,...,...,...
323933,98253,99783,6,43.0,56.0,6.0,60.0
323934,98253,99803,1,11.0,14.0,1.0,20.0
323935,98253,99824,2,27.0,34.0,2.0,63.0
323936,98253,99825,3,26.0,35.0,4.0,68.0


In [154]:
grouped_q1_df['Route'] = 'Origin: ' + grouped_q1_df['OriginZip'] + '- Destination: ' + grouped_q1_df['DestinationZip']

In [155]:
# since this is for September, 2022, just create a variable for # of days. This could be dynamic if we had more months, or likely would already have a feature denoting the month.
days_in_september = 30

In [156]:
grouped_q1_df['AveragePerDay'] = (grouped_q1_df['OrderCount'] / days_in_september).round().astype(int)

In [157]:
grouped_q1_df.sort_values('AveragePerDay',ascending=False)

Unnamed: 0,OriginZip,DestinationZip,OrderCount,Length,Width,Height,WeightOunces,Route,AveragePerDay
186778,36039,71322,430,319.0,244.0,125.0,1355.0,Origin: 36039- Destination: 71322,14
63923,17072,27529,270,2956.0,2194.0,831.0,8469.0,Origin: 17072- Destination: 27529,9
60769,17072,06359,239,2550.0,1913.0,648.0,8331.0,Origin: 17072- Destination: 06359,8
61639,17072,12964,215,2162.0,1691.0,648.0,7265.0,Origin: 17072- Destination: 12964,7
64317,17072,29372,205,2193.0,1666.0,505.0,5462.0,Origin: 17072- Destination: 29372,7
...,...,...,...,...,...,...,...,...,...
115095,28159,47346,2,27.0,20.0,2.0,30.0,Origin: 28159- Destination: 47346,0
115094,28159,47345,4,30.0,23.0,10.0,71.0,Origin: 28159- Destination: 47345,0
115093,28159,47342,5,60.0,36.0,12.0,79.0,Origin: 28159- Destination: 47342,0
115092,28159,47341,2,20.0,12.0,2.0,9.0,Origin: 28159- Destination: 47341,0


In [158]:
top_10000 = grouped_q1_df.sort_values('OrderCount', ascending=False)[:10000]

In [159]:
alt.Chart(top_10000).mark_bar().encode(
    x=alt.X('Route:N', axis=alt.Axis(title='Route'), sort=None),
    y=alt.Y('OrderCount:Q', axis=alt.Axis(title='Order Count')),
    tooltip=['Route:N', 'OrderCount']
).configure_axisX(
    labels=False
).properties(
    width=2500,
    height=500,
    title='Distribution of Zip Code Routes by Order Count - Top 10,000'
).configure_title(
    anchor='middle'
)

In [160]:
top_10000_avg = grouped_q1_df.sort_values('AveragePerDay', ascending=False)[:10000]

In [161]:
alt.Chart(top_10000_avg).mark_bar().encode(
    x=alt.X('Route:N', axis=alt.Axis(title='Route'), sort=None),
    y=alt.Y('AveragePerDay:Q', axis=alt.Axis(title='Average Per Day Count', tickMinStep=1)),
    tooltip=['Route', 'AveragePerDay', 'OrderCount']
).configure_axisX(
    labels=False
).properties(
    width=2500,
    height=500,
    title='Distribution of Average Packages per Day - Top 10,000'
).configure_title(
    anchor='middle'
)

In [162]:
# depends what we're trying to accomplish here

In [163]:
destination_monthly_total = grouped_q1_df.groupby('DestinationZip')['OrderCount'].sum().reset_index()

In [164]:
destination_monthly_total.sort_values('OrderCount', ascending=False)

Unnamed: 0,DestinationZip,OrderCount
5893,27529,1266
16157,70645,987
7374,32702,926
4500,21015,895
7937,34770,871
...,...,...
11626,50361,1
7850,34249,1
2224,12188,1
567,03905,1


In [165]:
# https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_zcta_national.zip

In [166]:
zip_code_df = pd.read_csv('data/2020_Gaz_zcta_national.txt', sep='\t')

In [167]:
zip_code_df

Unnamed: 0,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,601,166659744,799292,64.348,0.309,18.180555,-66.749961
1,602,79307538,4428428,30.621,1.710,18.361945,-67.175597
2,603,81887203,181412,31.617,0.070,18.455183,-67.119887
3,606,109579950,12487,42.309,0.005,18.158327,-66.932928
4,610,93013430,4172059,35.913,1.611,18.294032,-67.127156
...,...,...,...,...,...,...,...
33139,99923,42495197,2117,16.407,0.001,56.000518,-130.037474
33140,99925,144071036,34333408,55.626,13.256,55.550203,-132.945947
33141,99926,343944586,292859017,132.798,113.074,55.138352,-131.470425
33142,99927,589650055,18041593,227.665,6.966,56.239062,-133.457924


In [168]:
zip_code_df['DestinationZip'] = zip_code_df['GEOID'].astype(str).apply(fill_leading_zeros)

In [169]:
zip_code_df.head()

Unnamed: 0,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG,DestinationZip
0,601,166659744,799292,64.348,0.309,18.180555,-66.749961,601
1,602,79307538,4428428,30.621,1.71,18.361945,-67.175597,602
2,603,81887203,181412,31.617,0.07,18.455183,-67.119887,603
3,606,109579950,12487,42.309,0.005,18.158327,-66.932928,606
4,610,93013430,4172059,35.913,1.611,18.294032,-67.127156,610


In [170]:
merged_df = pd.merge(destination_monthly_total, zip_code_df, on='DestinationZip', how='left')

In [171]:
merged_df.columns

Index(['DestinationZip', 'OrderCount', 'GEOID', 'ALAND', 'AWATER',
       'ALAND_SQMI', 'AWATER_SQMI', 'INTPTLAT',
       'INTPTLONG                                                                                                                                  '],
      dtype='object')

In [172]:
merged_df = merged_df.rename(columns=lambda x: x.replace(' ', ''))

In [173]:
merged_df.columns

Index(['DestinationZip', 'OrderCount', 'GEOID', 'ALAND', 'AWATER',
       'ALAND_SQMI', 'AWATER_SQMI', 'INTPTLAT', 'INTPTLONG'],
      dtype='object')

In [174]:
merged_df = merged_df.dropna()

In [175]:
merged_df.head()

Unnamed: 0,DestinationZip,OrderCount,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
2,1001,14,1001.0,29797658.0,2121390.0,11.505,0.819,42.062368,-72.625754
3,1003,17,1003.0,1842387.0,12788.0,0.711,0.005,42.389698,-72.524009
4,1005,25,1005.0,114638390.0,666424.0,44.262,0.257,42.418884,-72.112077
5,1008,46,1008.0,139349132.0,5109088.0,53.803,1.973,42.190191,-72.954263
6,1010,214,1010.0,90055966.0,1421379.0,34.771,0.549,42.128176,-72.205352


In [176]:
coords = merged_df[['INTPTLAT', 'INTPTLONG']].values

In [177]:
coords

array([[  42.062368,  -72.625754],
       [  42.389698,  -72.524009],
       [  42.418884,  -72.112077],
       ...,
       [  55.138352, -131.470425],
       [  56.239062, -133.457924],
       [  56.370538, -131.693453]])

In [178]:
dbscan_model = DBSCAN(eps=.5, min_samples=100)

In [179]:
labels = dbscan_model.fit_predict(coords)

In [180]:
merged_df['Cluster'] = labels

In [181]:
merged_df.head()

Unnamed: 0,DestinationZip,OrderCount,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG,Cluster
2,1001,14,1001.0,29797658.0,2121390.0,11.505,0.819,42.062368,-72.625754,-1
3,1003,17,1003.0,1842387.0,12788.0,0.711,0.005,42.389698,-72.524009,-1
4,1005,25,1005.0,114638390.0,666424.0,44.262,0.257,42.418884,-72.112077,0
5,1008,46,1008.0,139349132.0,5109088.0,53.803,1.973,42.190191,-72.954263,-1
6,1010,214,1010.0,90055966.0,1421379.0,34.771,0.549,42.128176,-72.205352,0


In [182]:
merged_df.groupby('Cluster').sum().sort_values('OrderCount', ascending=False)

Unnamed: 0_level_0,DestinationZip,OrderCount,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1,0100101003010080101201020010260102701030010320...,1252643,797182080.0,3961544000000.0,85149100000.0,1529560.629,32876.252,585135.843442,-1392493.0
1,0646806473064770651006512065130651406515065170...,77969,11337365.0,26588330000.0,920238300.0,10265.821,355.298,37961.965796,-69580.73
2,1500115003150041500515006150071500915012150141...,27721,5714783.0,14226440000.0,158147000.0,5492.845,61.059,12393.74667,-24536.51
0,0100501010010740108101094014200143801440014520...,26531,855306.0,12129010000.0,700582800.0,4683.034,270.5,13347.707926,-22555.41
3,1725017311173201732117322173251732917340173441...,23817,6161093.0,11517380000.0,562210400.0,4446.888,217.074,11473.051129,-22636.14
7,9000390008900129001490015900169001790018900199...,17721,20198837.0,8090379000.0,717898300.0,3123.714,277.18,7517.578994,-26106.74
4,4630346304463074631046319463204632146322463234...,16154,12087993.0,8882688000.0,228116100.0,3429.627,88.08,8617.459856,-18107.18
5,4800248003480054800948015480174802248025480264...,14105,7137745.0,7277033000.0,234813000.0,2809.68,90.662,6299.684242,-12322.79
6,7500275009750107501375019750227502475025750287...,13032,9737210.0,7601701000.0,325017400.0,2935.043,125.493,4238.259562,-12501.96
8,9400294005940109401494015940209402294025940289...,9747,12487324.0,4989675000.0,146408800.0,1926.521,56.531,4966.940926,-16122.85


In [190]:
alt.Chart(merged_df).mark_point(size=60).encode(
    x=alt.X('INTPTLONG', axis=alt.Axis(title='Longitude')),
    y=alt.Y('INTPTLAT', axis=alt.Axis(title='Latitude')),
    color=alt.Color('Cluster:N'),
    tooltip=['DestinationZip', 'OrderCount', 'Cluster']
).properties(
    height=800,
    width=1600,
    title='Potential Sortation Facilities via DBSCAN - eps=0.5, min_samples=100'
).configure_legend(
    padding=10,
    cornerRadius=10,
    orient='top-right'
).interactive()