### Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the full dataset: the first CSV column becomes the index and the
# columns at file positions 2 and 3 are parsed as datetimes
# (presumably started_at / ended_at -- confirm against the file header).
df = pd.read_csv(
    'data/full_data.csv',
    index_col=0,
    parse_dates=[2, 3],
)
df.head()  # preview the first few rows

In [None]:
# Inspect the dataset dimensions as (rows, columns)
df.shape

### Modify Dataset for Use

In [None]:
# The combined lat/lng columns were only needed for earlier manipulation,
# so remove them before persisting the dataset.
df.drop(
    columns=['start_lat_lng', 'end_lat_lng'],
    inplace=True,
)

# NOTE(review): this writes back to the same path the dataset was loaded
# from, now without the index column -- re-running the notebook from the top
# would read a differently laid-out file. Confirm this is intended.
df.to_csv('data/full_data.csv', index=False)

In [None]:
# Split the dataset into one CSV file per calendar year (2010 through 2020),
# based on the year of each row's `started_at` timestamp.
years = np.arange(2010, 2021, 1)

# Extract the start year once instead of recomputing it over the whole
# frame on every loop iteration.
start_year = df.started_at.dt.year

for year in years:
    # A file is written for every year in `years`, even if no rows match.
    df[start_year == year].to_csv(f'data/{year}_data.csv', index = False)

### Create New Dataset
Instead of loading the whole dataset during the modeling stage just to calculate total rides per day, separate datasets containing the daily ride counts (overall, central, and outskirts) are created here.

In [53]:
def get_daily_counts(df):
    """Count the rows of ``df`` per calendar day of ``started_at``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``started_at`` column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``date`` (``started_at`` floored to the day) and
        ``count`` (number of rows falling on that day). Rows come back in
        ``value_counts`` order (most frequent day first), not sorted by date.
    """
    # 'D' is the canonical day alias; the lowercase 'd' spelling is
    # deprecated in newer pandas releases.
    return (df['started_at']
            .dt.floor('D')
            .value_counts()
            .rename_axis('date')
            .reset_index(name='count'))

In [54]:
import glob

# Bounds of the rectangle treated as the "central" area. Both comparisons
# below are strict, so a point exactly on an edge counts as outskirts.
CENTRAL_LAT_MIN, CENTRAL_LAT_MAX = 38.88, 38.92
CENTRAL_LNG_MIN, CENTRAL_LNG_MAX = -77.05, -76.97

# glob gives no ordering guarantee, so sort for a deterministic file order.
datafiles = sorted(glob.glob('data/20*.csv'))

daily_trips = []
central_trips = []
outskirts_trips = []

for file in datafiles:
    # Columns at file positions 1 and 2 are parsed as datetimes.
    tmp = pd.read_csv(file, parse_dates = [1,2])

    lat_bool = (tmp['start_lat'] > CENTRAL_LAT_MIN) & (tmp['start_lat'] < CENTRAL_LAT_MAX)
    lng_bool = (tmp['start_lng'] > CENTRAL_LNG_MIN) & (tmp['start_lng'] < CENTRAL_LNG_MAX)

    # Build the combined mask once and reuse it for both subsets. Rows with
    # missing coordinates fail the comparisons and therefore land in
    # `outskirts`.
    is_central = lat_bool & lng_bool

    central = tmp[is_central]
    outskirts = tmp[~is_central]

    # Per-day counts for all rows and for each of the two areas.
    daily_count = get_daily_counts(tmp)
    central_count = get_daily_counts(central)
    outskirts_count = get_daily_counts(outskirts)

    daily_trips.append(daily_count)
    central_trips.append(central_count)
    outskirts_trips.append(outskirts_count)

# Stack the per-file results into one table per category.
daily_trips_df = pd.concat(daily_trips, axis = 0, ignore_index = True)
central_trips_df = pd.concat(central_trips, axis = 0, ignore_index = True)
outskirts_trips_df = pd.concat(outskirts_trips, axis = 0, ignore_index = True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [55]:
# Put each daily-count table in date order, in place, with a fresh
# 0..n-1 index.
for trips in (daily_trips_df, central_trips_df, outskirts_trips_df):
    trips.sort_values(by='date', ignore_index=True, inplace=True)

In [58]:
# Write each daily-count table to its own CSV file, leaving out the index.
for trips, path in (
    (daily_trips_df, 'data/trips_by_day.csv'),
    (central_trips_df, 'data/central_trips.csv'),
    (outskirts_trips_df, 'data/outskirts_trips.csv'),
):
    trips.to_csv(path, index=False)