In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')


df = pd.read_csv('lyft_weight.csv')

In [2]:
df_aggs = df.groupby(['top_20_markets', 'segment', 'tag_c'], as_index=False).agg({'case_count':['mean','std']})
df_aggs.columns = ['top_20_markets', 'segment','tag_c', 'case_mean', 'case_std']
df_aggs.reindex(columns=sorted(df_aggs.columns))
df_aggs['case_std'] = df_aggs['case_std'].fillna(0)
print(df_aggs)

     top_20_markets          segment              tag_c  case_mean  case_std
0           Atlanta          Airport  Booked By Mistake   3.000000  2.954196
1           Atlanta          Airport  Booked by Mistake   1.000000  0.000000
2           Atlanta          Airport            Booking   6.428571  6.559952
3           Atlanta          Airport                CCR   1.000000  0.000000
4           Atlanta          Airport       CCR Feedback   1.000000  0.000000
...             ...              ...                ...        ...       ...
3396     Washington  Other Transient     Update Profile   1.000000  0.000000
3397     Washington  Other Transient     Vehicle Damage   1.000000  0.000000
3398     Washington  Other Transient    Vehicle Details   1.469388  1.002124
3399     Washington  Other Transient     Wasn't Present   2.000000  1.745743
3400     Washington  Other Transient       Wrong Number   1.000000  0.000000

[3401 rows x 5 columns]


In [72]:
def is_upper_outlier(x):
    IQR = np.percentile(x, 75) - np.percentile(x, 25)
    upper_fence = np.percentile(x, 75) + (IQR * 1)
    return (x > upper_fence)
df['case_outlier'] = df.groupby(['top_20_markets','segment','tag_c'])['case_count'].apply(is_upper_outlier)

print(df.head(100))

            operator_name  parking_spot_id                tag_c  \
0                     SP+            14764      Equipment Error   
1                     ABM            10329      Vehicle Details   
2             LAZ Parking             5018             Lot Full   
3   Millennium Park Plaza             5956           Redemption   
4                     SP+             6008           Redemption   
..                    ...              ...                  ...   
95              InterPark             2180   Change Date / Time   
96            LAZ Parking              620             Lot Full   
97         Legacy Parking            14429      Equipment Error   
98          REEF - Impark            17976             Lot Full   
99         Legacy Parking             8861  Outside Reservation   

           tag_category top_20_markets          segment  case_count  \
0              Facility        Chicago  Other Transient         193   
1   Reservation Actions        Chicago            Eve

In [4]:
def two_stdevs(x):
    upper_limit = np.mean(x) + (np.std(x) * 2)
    return x > upper_limit
df['case_two_stdevs'] = df.groupby(['top_20_markets','segment','tag_c'])['case_count'].apply(two_stdevs)
print(df.head(100))

            operator_name  parking_spot_id                tag_c  \
0                     SP+            14764      Equipment Error   
1                     ABM            10329      Vehicle Details   
2             LAZ Parking             5018             Lot Full   
3   Millennium Park Plaza             5956           Redemption   
4                     SP+             6008           Redemption   
..                    ...              ...                  ...   
95              InterPark             2180   Change Date / Time   
96            LAZ Parking              620             Lot Full   
97         Legacy Parking            14429      Equipment Error   
98          REEF - Impark            17976             Lot Full   
99         Legacy Parking             8861  Outside Reservation   

           tag_category top_20_markets          segment  case_count  \
0              Facility        Chicago  Other Transient         193   
1   Reservation Actions        Chicago            Eve

In [5]:
print(df[df['case_outlier'] == False].iloc[0])

operator_name                  SP+
parking_spot_id              12729
tag_c              Equipment Error
tag_category              Facility
top_20_markets             Chicago
segment            Other Transient
case_count                      62
case_outlier                 False
case_two_stdevs              False
Name: 100, dtype: object


In [6]:
df['duplicates'] = df.duplicated(subset=['parking_spot_id'],keep='first')
print(df.head(100))

            operator_name  parking_spot_id                tag_c  \
0                     SP+            14764      Equipment Error   
1                     ABM            10329      Vehicle Details   
2             LAZ Parking             5018             Lot Full   
3   Millennium Park Plaza             5956           Redemption   
4                     SP+             6008           Redemption   
..                    ...              ...                  ...   
95              InterPark             2180   Change Date / Time   
96            LAZ Parking              620             Lot Full   
97         Legacy Parking            14429      Equipment Error   
98          REEF - Impark            17976             Lot Full   
99         Legacy Parking             8861  Outside Reservation   

           tag_category top_20_markets          segment  case_count  \
0              Facility        Chicago  Other Transient         193   
1   Reservation Actions        Chicago            Eve

In [7]:
df['duplicates'].value_counts()

True     66338
False     6109
Name: duplicates, dtype: int64

In [8]:
df[df['duplicates'] == False]['case_two_stdevs'].value_counts()

False    5272
True      815
Name: case_two_stdevs, dtype: int64

In [9]:
df[df['duplicates'] == False]['case_outlier'].value_counts()

False    4786
True     1301
Name: case_outlier, dtype: int64

In [10]:
df['market_two_stdevs'] = df.groupby(['top_20_markets','segment'])['case_count'].apply(two_stdevs)
df['market_outlier'] = df.groupby(['top_20_markets','segment'])['case_count'].apply(is_upper_outlier)
print(df[df['duplicates'] == False]['market_two_stdevs'].value_counts())
print(df[df['duplicates'] == False]['market_outlier'].value_counts())

False    4982
True     1127
Name: market_two_stdevs, dtype: int64
False    3673
True     2436
Name: market_outlier, dtype: int64


In [11]:
df['segment_two_stdevs'] = df.groupby(['segment'])['case_count'].apply(two_stdevs)
df['segment_outlier'] = df.groupby(['segment'])['case_count'].apply(is_upper_outlier)
print(df[df['duplicates'] == False]['segment_two_stdevs'].value_counts())
print(df[df['duplicates'] == False]['segment_outlier'].value_counts())

False    5151
True      958
Name: segment_two_stdevs, dtype: int64
False    3884
True     2225
Name: segment_outlier, dtype: int64


In [12]:
df.to_csv('lyft_weight_filters.csv',index=False)

In [13]:
num_quantiles = 10
df_case_quantiles = df.groupby(['top_20_markets', 'segment','tag_c'])[['case_count']].quantile([i * (1.0 / num_quantiles) for i in range(num_quantiles)])
df_case_quantiles.index = df_case_quantiles.index.set_names(['top_20_markets','segment','tag_c','quantile'])
df_case_quantiles = df_case_quantiles.reset_index()

In [14]:
print(df_case_quantiles.head())

  top_20_markets  segment              tag_c  quantile  case_count
0        Atlanta  Airport  Booked By Mistake       0.0         1.0
1        Atlanta  Airport  Booked By Mistake       0.1         1.0
2        Atlanta  Airport  Booked By Mistake       0.2         1.0
3        Atlanta  Airport  Booked By Mistake       0.3         1.0
4        Atlanta  Airport  Booked By Mistake       0.4         1.4


In [15]:
df_case_quantiles_tenth = df_case_quantiles[df_case_quantiles['quantile'] == 0.9]
print(df_case_quantiles_tenth.head(100))

    top_20_markets  segment              tag_c  quantile  case_count
9          Atlanta  Airport  Booked By Mistake       0.9         5.8
19         Atlanta  Airport  Booked by Mistake       0.9         1.0
29         Atlanta  Airport            Booking       0.9        15.3
39         Atlanta  Airport                CCR       0.9         1.0
49         Atlanta  Airport       CCR Feedback       0.9         1.0
..             ...      ...                ...       ...         ...
959        Atlanta    Event         Unfamiliar       0.9         1.0
969        Atlanta    Event     Update Profile       0.9         1.0
979        Atlanta    Event     Vehicle Damage       0.9         1.0
989        Atlanta    Event    Vehicle Details       0.9         4.0
999        Atlanta    Event     Wasn't Present       0.9         2.2

[100 rows x 5 columns]


In [16]:
df_case_quantiles_tenth.to_csv('tenth_decile_by_case.csv',index=False)

In [18]:
df_propor = pd.read_csv('segment_case_proportions.csv')
print(df_propor.head())

   parking_spot_id          segment  case_count  rental_count       cpp
0              166  Other Transient          34         958.0  0.035491
1              289         Commuter          13         521.0  0.024952
2              289  Other Transient          12         690.0  0.017391
3              305            Event          11          67.0  0.164179
4              311  Other Transient           1           NaN       NaN


In [21]:
df_propor = df_propor.fillna(0)
print(df_propor.head())

   parking_spot_id          segment  case_count  rental_count       cpp
0              166  Other Transient          34         958.0  0.035491
1              289         Commuter          13         521.0  0.024952
2              289  Other Transient          12         690.0  0.017391
3              305            Event          11          67.0  0.164179
4              311  Other Transient           1           0.0  0.000000


In [24]:
df_propor = df_propor.sort_values(by='rental_count', ascending=False)
df_propor['duplicates'] = df_propor.duplicated(subset=['parking_spot_id'],keep='first')
df_propor['segment_two_stdevs'] = df_propor.groupby(['segment'])['cpp'].apply(two_stdevs)
df_propor['segment_outlier'] = df_propor.groupby(['segment'])['cpp'].apply(is_upper_outlier)
print(df_propor[df_propor['duplicates'] == False]['segment_two_stdevs'].value_counts())
print(df_propor[df_propor['duplicates'] == False]['segment_outlier'].value_counts())

False    5966
True      181
Name: segment_two_stdevs, dtype: int64
False    5557
True      590
Name: segment_outlier, dtype: int64


In [26]:
df_propor.to_csv('segment_case_proportion_filters.csv',index=False)

In [27]:
total_cases = 199737
total_rentals = 4595732

cases_remaining = df_propor[df_propor['duplicates'] == False]['case_count'].sum()
cases_removed = 1-(cases_remaining / total_cases)

rentals_remaining = df_propor[df_propor['duplicates'] == False]['rental_count'].sum()
rentals_removed = 1-(rentals_remaining / total_rentals)

print(cases_remaining)
print(cases_removed)
print(rentals_remaining)
print(rentals_removed)

132900
0.3346250319169708
3167544.0
0.31076398710803854


In [29]:
df_propor['two_stdevs'] = df_propor['cpp'].apply(two_stdevs)
df_propor['outlier'] = df_propor['cpp'].apply(is_upper_outlier)
print(df_propor['two_stdevs'].value_counts())
print(df_propor['outlier'].value_counts())

False    12209
Name: two_stdevs, dtype: int64
False    12209
Name: outlier, dtype: int64


In [73]:
def upper_fence_actual(x):
    IQR = np.percentile(x, 75) - np.percentile(x, 25)
    return np.percentile(x, 75) + (IQR * 1)
#df_propor['upper_fence_actual'] = df_propor.groupby(['segment'])['cpp'].apply(upper_fence_actual)
print(df_propor.groupby(['segment'])['cpp'].apply(upper_fence_actual))

segment
Airport            0.253545
Commuter           0.206349
Event              0.400392
Monthly            1.788968
Other Transient    0.203194
Name: cpp, dtype: float64


In [84]:
final_index = df_propor[df_propor['duplicates'] == False]
final_index = final_index[final_index['outlier'] == False]
final_index = final_index[['parking_spot_id']].set_index('parking_spot_id')
final_index['dupes'] = final_index.duplicated(subset=['parking_spot_id'],keep='first')
print(final_index['dupes'].value_counts())

Series([], Name: dupes, dtype: int64)


In [56]:
print(df)

               operator_name  parking_spot_id              tag_c  \
0                        SP+            14764    Equipment Error   
1                        ABM            10329    Vehicle Details   
2                LAZ Parking             5018           Lot Full   
3      Millennium Park Plaza             5956         Redemption   
4                        SP+             6008         Redemption   
...                      ...              ...                ...   
72442              InterPark             9246  Booked By Mistake   
72443            LAZ Parking             9752     Wasn't Present   
72444           City Parking            13380    Dropped Contact   
72445                    SP+             7527         Redemption   
72446                    ABM             1323           Feedback   

              tag_category top_20_markets          segment  case_count  \
0                 Facility        Chicago  Other Transient         193   
1      Reservation Actions        C

In [58]:
df = df.assign(filter_index=df['parking_spot_id'].isin(final_index['parking_spot_id']).astype(int))
print(df)

               operator_name  parking_spot_id              tag_c  \
0                        SP+            14764    Equipment Error   
1                        ABM            10329    Vehicle Details   
2                LAZ Parking             5018           Lot Full   
3      Millennium Park Plaza             5956         Redemption   
4                        SP+             6008         Redemption   
...                      ...              ...                ...   
72442              InterPark             9246  Booked By Mistake   
72443            LAZ Parking             9752     Wasn't Present   
72444           City Parking            13380    Dropped Contact   
72445                    SP+             7527         Redemption   
72446                    ABM             1323           Feedback   

              tag_category top_20_markets          segment  case_count  \
0                 Facility        Chicago  Other Transient         193   
1      Reservation Actions        C

In [59]:
df[df['filter_index'] == 1]['case_count'].sum()

196429

In [61]:
test = pd.read_csv('segment_case_proportions.csv')
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12209 entries, 0 to 12208
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   parking_spot_id  12209 non-null  int64  
 1   segment          12209 non-null  object 
 2   case_count       12209 non-null  int64  
 3   rental_count     12156 non-null  float64
 4   cpp              11818 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 477.0+ KB
None


In [62]:
test = test.dropna()
print(test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11818 entries, 0 to 12208
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   parking_spot_id  11818 non-null  int64  
 1   segment          11818 non-null  object 
 2   case_count       11818 non-null  int64  
 3   rental_count     11818 non-null  float64
 4   cpp              11818 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 554.0+ KB
None


In [63]:
test = test.sort_values(by='rental_count', ascending=False)
df_propor['duplicates'] = df_propor.duplicated(subset=['parking_spot_id'],keep='first')
test['segment_two_stdevs'] = test.groupby(['segment'])['cpp'].apply(two_stdevs)
test['segment_outlier'] = test.groupby(['segment'])['cpp'].apply(is_upper_outlier)
print(test[test['duplicates'] == False]['segment_two_stdevs'].value_counts())
print(test[test['duplicates'] == False]['segment_outlier'].value_counts())

KeyError: 'duplicates'

In [74]:
facility_propor = pd.read_csv('facility_case_proportions.csv')
sum_cases = facility_propor['case_count'].sum()
sum_rentals = facility_propor['rental_count'].sum()
facility_propor = facility_propor.dropna()
avg_cpp = facility_propor['cpp'].mean()
facility_propor['two_stdevs'] = two_stdevs(facility_propor['cpp'])
facility_propor['outlier'] = is_upper_outlier(facility_propor['cpp'])
filter_cases = facility_propor[facility_propor['outlier'] == False]['case_count'].sum()
filter_rentals = facility_propor[facility_propor['outlier'] == False]['rental_count'].sum()

print(filter_cases / sum_cases)
print(filter_rentals / sum_rentals)

0.944578014008246
0.9950462967930217


In [75]:
print(upper_fence_actual(facility_propor['cpp']))

0.2614732641866278


In [76]:
facility_propor_filtered = facility_propor[facility_propor['outlier'] == False]
df = df.assign(filter_index2=df['parking_spot_id'].isin(facility_propor_filtered['parking_spot_id']).astype(int))
df.to_csv('total_facility_filter.csv')

In [78]:
df_propor = df_propor.assign(filter_index=df_propor['parking_spot_id'].isin(facility_propor_filtered['parking_spot_id']).astype(int))
df_propor.to_csv('segment_case_proportion_filters2.csv',index=False)