### Station Data

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
stations = pd.read_csv('station_availability_df', index_col=0).reset_index()

#combine hour and date to create timestamp
stations['date'] = pd.to_datetime(stations[['date', 'hour']].astype(str).apply(lambda r: ' '.join(r), axis=1) + ':00:00', format = '%Y-%m-%d %H:%M:%S') 
station_availabilities = stations.loc[stations['in_service']==1,['dock_id','date','avail_bikes','avail_docks']]
station_availabilities.head(1)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks
0,72,2017-01-01 01:00:00,26,11


### Bike Data

In [3]:
bike_trips = pd.read_csv('bike_trip_df',index_col = 0).reset_index()

In [4]:
import datetime
bike_trips['starttime'] = pd.to_datetime(bike_trips['starttime'], format = '%Y-%m-%d %H:%M:%S')
bike_trips['stoptime'] = pd.to_datetime(bike_trips['stoptime'], format = '%Y-%m-%d %H:%M:%S')

#round all times to earliest hour
bike_trips['starttime'] = bike_trips['starttime'].apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))
bike_trips['stoptime'] = bike_trips['stoptime'].apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

#create two dataframes counting bike trips aggregated by timestamp and station id
outgoing_trips = bike_trips[['starttime','start station id']]
incoming_trips = bike_trips[['stoptime','end station id']]
incoming_bike_trips = incoming_trips.groupby(['stoptime','end station id']).size().reset_index()
outgoing_bike_trips = outgoing_trips.groupby(['starttime','start station id']).size().reset_index()

In [5]:
#merge two dataframes to find net bikes being added to the station
all_trips = pd.merge(incoming_bike_trips,outgoing_bike_trips, left_on = ['stoptime','end station id'], right_on = ['starttime','start station id'], copy = False)
all_trips = all_trips.rename(index = str, columns = {"stoptime": "date", "end station id":"dock_id", "0_x":"incoming_bikes","0_y":"outgoing_bikes"}).drop(columns = ["starttime","start station id"])
all_trips['net_bikes'] = all_trips.incoming_bikes-all_trips.outgoing_bikes
all_trips = all_trips.drop(columns = ['incoming_bikes','outgoing_bikes'])

all_trips.head(1)

Unnamed: 0,date,dock_id,net_bikes
0,2017-01-01,128,0


### Rebalancing Calculations

In [6]:
#combine station availability data to net incoming bike data
df = pd.merge(station_availabilities,all_trips, on = ['dock_id','date'], how = 'left', copy = False).sort_values(by = ['dock_id', 'date'], ascending = True)
#replace all instances when no bikes left or arrived with 0
df.net_bikes.fillna(0,inplace=True)
df.head(1)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes
10,72,2017-01-01,25,12,0.0


In [7]:
#create column calculating how many bikes should be available at the next hour
df['theoretical_eoh_avail_bikes'] = df.avail_bikes+df.net_bikes
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,theoretical_eoh_avail_bikes
0,72,2017-01-01 00:00:00,25,12,0.0,25.0
1,72,2017-01-01 01:00:00,26,11,0.0,26.0
2,72,2017-01-01 02:00:00,26,11,0.0,26.0
3,72,2017-01-01 03:00:00,26,11,0.0,26.0
4,72,2017-01-01 04:00:00,26,11,0.0,26.0


In [8]:
#shift inventory numbers back one hour to compare between how many bikes should be available 
#and how many bikes actually are
df['actual_eoh_avail_bikes'] = df.groupby(['dock_id'])['avail_bikes'].shift(-1)

In [9]:
df.actual_eoh_avail_bikes = df.actual_eoh_avail_bikes.fillna(df.theoretical_eoh_avail_bikes)
df.head(5)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,theoretical_eoh_avail_bikes,actual_eoh_avail_bikes
0,72,2017-01-01 00:00:00,25,12,0.0,25.0,26.0
1,72,2017-01-01 01:00:00,26,11,0.0,26.0,26.0
2,72,2017-01-01 02:00:00,26,11,0.0,26.0,26.0
3,72,2017-01-01 03:00:00,26,11,0.0,26.0,26.0
4,72,2017-01-01 04:00:00,26,11,0.0,26.0,26.0


In [10]:
df['bikes_added_by_citibike'] = df.actual_eoh_avail_bikes - df.theoretical_eoh_avail_bikes
df.head(5)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,theoretical_eoh_avail_bikes,actual_eoh_avail_bikes,bikes_added_by_citibike
0,72,2017-01-01 00:00:00,25,12,0.0,25.0,26.0,1.0
1,72,2017-01-01 01:00:00,26,11,0.0,26.0,26.0,0.0
2,72,2017-01-01 02:00:00,26,11,0.0,26.0,26.0,0.0
3,72,2017-01-01 03:00:00,26,11,0.0,26.0,26.0,0.0
4,72,2017-01-01 04:00:00,26,11,0.0,26.0,26.0,0.0


### Labeling

In [88]:
no_rebalancing_df = df[['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()

In [43]:
no_rebalancing_df.head(1)

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,bikes_added_by_citibike
0,72,2017-01-01,25,12,0.0,1.0


In [83]:
### this is to get indexes of full or empty stations

#loop through each row
no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()

full_stations = []
empty_stations = []
start_point = 0

#how do i get the for loop to go back one step
for i in no_rebalancing_df.index:
    #if available bikes hits 0, save index and set the start point to that row index
    if no_rebalancing_df.iloc[i].avail_bikes<=0:
        empty_stations.append(no_rebalancing_df.iloc[i].name)
        start_point = i
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
    #if available bikes hits 0, save index and set the start point to that row index
    elif no_rebalancing_df.iloc[i].avail_docks<=0:
        full_stations.append(no_rebalancing_df.iloc[i].name)
        start_point = i
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
    #if bikes_added_by_citibike is not 0
    elif no_rebalancing_df.bikes_added_by_citibike[i]!=0:
        #subtract that number from all avail_bikes and add that number to all avail_docks for that dock_id after that time
        #save the dock_id of the row we're looking at
        dock_id = no_rebalancing_df.iloc[i].dock_id
        rebalancing_int = no_rebalancing_df.iloc[i].bikes_added_by_citibike
        #find the last row with the same dock_id, add 1, and subtract rebalanced bike number from slice from index + 1 to last row
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_bikes'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_bikes - rebalancing_int
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_docks'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_docks + rebalancing_int
        
        
        
        

In [84]:
### this is to check the code


#loop through each row
no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()

#how do i get the for loop to go back one step
for i in no_rebalancing_df.index:
    #if available bikes hits 0, save index and set the start point to that row index
    if no_rebalancing_df.bikes_added_by_citibike[i]!=0:
        #subtract that number from all avail_bikes and add that number to all avail_docks for that dock_id after that time
        #save the dock_id of the row we're looking at
        dock_id = no_rebalancing_df.iloc[i].dock_id
        rebalancing_int = no_rebalancing_df.iloc[i].bikes_added_by_citibike
        #find the last row with the same dock_id, add 1, and subtract rebalanced bike number from slice from index + 1 to last row
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_bikes'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_bikes - rebalancing_int
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_docks'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_docks + rebalancing_int
        
        
        
        

In [85]:
print(full_stations)
print(empty_stations)
print(start_point)

[1089, 1737, 1892, 2197, 2330, 2755, 2878, 2981, 3185, 3224, 3676, 3777, 3880, 4034, 4106, 4174, 4236, 4280, 4337, 4488, 4513, 4633, 4705, 4821, 4893, 4908, 4924, 5081, 5122, 5127, 5261, 5328, 5415, 5519, 5578, 5685, 5712, 5784, 5809, 5838, 5892, 5930, 6087, 6091, 6144, 6169, 6206, 6352, 6436, 6562, 6572, 6577, 6580, 6581, 6582, 6703, 6706, 7008, 7126, 7147, 7148, 7149, 7179, 7324]
[81, 173, 440, 443, 448, 463, 464, 476, 502, 503, 516, 535, 557, 595, 604, 607, 608, 609, 663, 707, 710, 711, 732, 737, 781, 789, 897, 900, 920, 922, 1123, 1163, 1164, 1277, 1279, 1336, 1368, 1370, 1419, 1444, 1750, 1752, 1780, 1794, 1814, 1930, 2075, 2106, 2192, 2212, 2213, 2229, 2233, 2295, 2297, 2298, 2334, 2419, 2455, 2476, 2477, 2478, 2511, 2585, 2621, 2724, 2813, 2815, 2816, 2817, 2819, 2820, 2821, 2896, 2897, 2903, 2905, 2929, 3003, 3007, 3039, 3040, 3041, 3134, 3205, 3210, 3259, 3350, 3384, 3389, 3421, 3452, 3455, 3456, 3465, 3492, 3493, 3494, 3504, 3506, 3527, 3531, 3679, 3682, 3687, 3688, 3690, 380

In [86]:
pd.set_option('display.max_rows', 10000)
no_rebalancing_df.loc[no_rebalancing_df['dock_id']==72,:]

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,bikes_added_by_citibike
0,72,2017-01-01 00:00:00,25.0,12.0,0.0,1.0
1,72,2017-01-01 01:00:00,25.0,12.0,0.0,0.0
2,72,2017-01-01 02:00:00,25.0,12.0,0.0,0.0
3,72,2017-01-01 03:00:00,25.0,12.0,0.0,0.0
4,72,2017-01-01 04:00:00,25.0,12.0,0.0,0.0
5,72,2017-01-01 05:00:00,25.0,12.0,0.0,-2.0
6,72,2017-01-01 06:00:00,25.0,12.0,0.0,0.0
7,72,2017-01-01 07:00:00,25.0,12.0,0.0,0.0
8,72,2017-01-01 08:00:00,25.0,12.0,0.0,0.0
9,72,2017-01-01 09:00:00,25.0,12.0,0.0,-2.0


In [89]:
no_rebalancing_df.head()

Unnamed: 0,dock_id,date,avail_bikes,avail_docks,net_bikes,bikes_added_by_citibike
0,72,2017-01-01 00:00:00,25,12,0.0,1.0
1,72,2017-01-01 01:00:00,26,11,0.0,0.0
2,72,2017-01-01 02:00:00,26,11,0.0,0.0
3,72,2017-01-01 03:00:00,26,11,0.0,0.0
4,72,2017-01-01 04:00:00,26,11,0.0,0.0


In [None]:
#loop through each row
no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()

full_stations = []
empty_stations = []
i = 0


#to do: how do i get the for loop to go back one step
while i < len(no_rebalancing_df):
    #if available bikes hits 0, save index and set the start point to that row index
    if no_rebalancing_df.iloc[i].avail_bikes==0:
        empty_stations.append(no_rebalancing_df.iloc[i].name)
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
        i += 1
    #if available bikes hits 0, save index and set the start point to that row index
    elif no_rebalancing_df.iloc[i].avail_docks==0:
        full_stations.append(no_rebalancing_df.iloc[i].name)
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
        i += 1
    #if bikes_added_by_citibike is not 0
    elif no_rebalancing_df.bikes_added_by_citibike[i]!=0:
        #subtract that number from all avail_bikes and add that number to all avail_docks for that dock_id after that time
        #save the dock_id of the row we're looking at
        dock_id = no_rebalancing_df.iloc[i].dock_id
        rebalancing_int = no_rebalancing_df.iloc[i].bikes_added_by_citibike
        #find the last row with the same dock_id, add 1, and subtract rebalanced bike number from slice from index + 1 to last row
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_bikes'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_bikes - rebalancing_int
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_docks'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_docks + rebalancing_int
        i += 1

In [None]:
### this is to get indexes of full or empty stations

#loop through each row
no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()

full_stations = []
empty_stations = []
i = 0

#how do i get the for loop to go back one step
while i < len(no_rebalancing_df):
    #if available bikes hits 0, save index and set the start point to that row index
    if no_rebalancing_df.iloc[i].avail_bikes<=0 and no_rebalancing_df.iloc[i - 1].bikes_added_by_citibike != 0:
        empty_stations.append(no_rebalancing_df.iloc[i].name)
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
    #if available bikes hits 0, save index and set the start point to that row index
    elif no_rebalancing_df.iloc[i].avail_docks<=0 and no_rebalancing_df.iloc[i - 1].bikes_added_by_citibike != 0:
        full_stations.append(no_rebalancing_df.iloc[i].name)
        no_rebalancing_df = df.loc[df['dock_id']==72,['dock_id','date','avail_bikes','avail_docks','net_bikes','bikes_added_by_citibike']].copy()
    #if bikes_added_by_citibike is not 0
    elif no_rebalancing_df.bikes_added_by_citibike[i]!=0:
        #subtract that number from all avail_bikes and add that number to all avail_docks for that dock_id after that time
        #save the dock_id of the row we're looking at
        dock_id = no_rebalancing_df.iloc[i].dock_id
        rebalancing_int = no_rebalancing_df.iloc[i].bikes_added_by_citibike
        #find the last row with the same dock_id, add 1, and subtract rebalanced bike number from slice from index + 1 to last row
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_bikes'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_bikes - rebalancing_int
        no_rebalancing_df.loc[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1,'avail_docks'] = no_rebalancing_df[i+1:no_rebalancing_df[no_rebalancing_df.dock_id == dock_id].index.max() + 1].avail_docks + rebalancing_int
        i+=1
    else:
        i+=1

In [None]:
print(full_stations)
print(empty_stations)
print(start_point)