In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
import math
import glob
import os
import seaborn as sns

In [2]:
path = r"/Users/Oh/Documents/CodeAcademyBerlin/bike_analysis/data/years_2010_to_2017"
all_files = glob.glob(os.path.join(path, "*.csv"))
all_files.sort() # Files are arranged already in correct alphabetical order in the folder, so we want to preserve the order.

df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)



Run time: 40 s

<h3> Dataframe with daily values </h3>
First, create a dataframe for keeping track of daily values. <br>
Further below, will create a df for hourly values.

In [3]:
# Create a dataframe for the daily values

df_all_years = concatenated_df.copy(deep = True)        # Want to preserve the original "concatenated_df"

In [4]:
df_all_years.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member


In [5]:
df_all_years['Start date'] = pd.to_datetime(df_all_years['Start date'])

In [6]:
df_all_years['End date'] = pd.to_datetime(df_all_years['End date'])

In [7]:

"""
# Remove the time. In this dataframe, we only want to keep track of daily values. 
df_all_years['Start date'] = df_all_years['Start date'].dt.date
"""

"\n# Remove the time. In this dataframe, we only want to keep track of daily values. \ndf_all_years['Start date'] = df_all_years['Start date'].dt.date\n"

In [8]:
df_all_years.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member


In [9]:
df_all_years.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19117643 entries, 0 to 19117642
Data columns (total 9 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Duration              int64         
 1   Start date            datetime64[ns]
 2   End date              datetime64[ns]
 3   Start station number  int64         
 4   Start station         object        
 5   End station number    int64         
 6   End station           object        
 7   Bike number           object        
 8   Member type           object        
dtypes: datetime64[ns](2), int64(3), object(4)
memory usage: 1.3+ GB


Select rows where Member type == 'Member' (i.e. 'registered') <br>
Then do groupby + agg to find total number of registered users per day

In [10]:
df_agg_registered = df_all_years [df_all_years['Member type'] == 'Member'].groupby(pd.Grouper(key = 'Start date', freq = 'D')).count().rename(columns={'Member type': 'Registered'})

In [11]:
df_agg_registered.head()

Unnamed: 0_level_0,Duration,End date,Start station number,Start station,End station number,End station,Bike number,Registered
Start date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-09-20,178,178,178,178,178,178,178,178
2010-09-21,215,215,215,215,215,215,215,215
2010-09-22,260,260,260,260,260,260,260,260
2010-09-23,249,249,249,249,249,249,249,249
2010-09-24,206,206,206,206,206,206,206,206


In [12]:
df_agg_registered.reset_index(inplace = True)

In [13]:
df_agg_registered.head()

Unnamed: 0,Start date,Duration,End date,Start station number,Start station,End station number,End station,Bike number,Registered
0,2010-09-20,178,178,178,178,178,178,178,178
1,2010-09-21,215,215,215,215,215,215,215,215
2,2010-09-22,260,260,260,260,260,260,260,260
3,2010-09-23,249,249,249,249,249,249,249,249
4,2010-09-24,206,206,206,206,206,206,206,206


In [14]:
df_agg_registered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2660 entries, 0 to 2659
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Start date            2660 non-null   datetime64[ns]
 1   Duration              2660 non-null   int64         
 2   End date              2660 non-null   int64         
 3   Start station number  2660 non-null   int64         
 4   Start station         2660 non-null   int64         
 5   End station number    2660 non-null   int64         
 6   End station           2660 non-null   int64         
 7   Bike number           2660 non-null   int64         
 8   Registered            2660 non-null   int64         
dtypes: datetime64[ns](1), int64(8)
memory usage: 187.2 KB


In [15]:
df_agg_registered = df_agg_registered[['Start date', 'Registered']]

In [16]:
# This contains number of registered users for each day.
df_agg_registered

Unnamed: 0,Start date,Registered
0,2010-09-20,178
1,2010-09-21,215
2,2010-09-22,260
3,2010-09-23,249
4,2010-09-24,206
...,...,...
2655,2017-12-27,2934
2656,2017-12-28,2618
2657,2017-12-29,2744
2658,2017-12-30,1633


Select rows where Member type == 'Casual' <br>
Then do groupby + agg to find total number of casual users per day

In [17]:
df_agg_casual = df_all_years [df_all_years['Member type'] == 'Casual'].groupby(pd.Grouper(key = 'Start date', freq = 'D')).count().rename(columns={'Member type': 'Casual'})

In [18]:
df_agg_casual = df_agg_casual.reset_index()

In [19]:
df_agg_casual = df_agg_casual [['Start date', 'Casual']]

In [20]:
df_agg_casual

Unnamed: 0,Start date,Casual
0,2010-09-20,34
1,2010-09-21,109
2,2010-09-22,117
3,2010-09-23,124
4,2010-09-24,156
...,...,...
2655,2017-12-27,387
2656,2017-12-28,248
2657,2017-12-29,344
2658,2017-12-30,243


In [21]:
df_all_agg_daily = pd.merge(df_agg_registered, df_agg_casual, how = 'outer', on = ['Start date'])

In [22]:
df_all_agg_daily = df_all_agg_daily.rename(columns = {'Start date': 'date', 
                                            'Registered': 'registered',
                                            'Casual': 'casual'})

In [23]:
df_all_agg_daily.head()

Unnamed: 0,date,registered,casual
0,2010-09-20,178,34
1,2010-09-21,215,109
2,2010-09-22,260,117
3,2010-09-23,249,124
4,2010-09-24,206,156


In [24]:
df_all_agg_daily.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2660 entries, 0 to 2659
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        2660 non-null   datetime64[ns]
 1   registered  2660 non-null   int64         
 2   casual      2660 non-null   int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 83.1 KB


In [25]:
df_all_agg_daily.isna().sum()

date          0
registered    0
casual        0
dtype: int64

In [26]:
df_all_agg_daily['year'] = df_all_agg_daily['date'].dt.year
df_all_agg_daily['month'] = df_all_agg_daily['date'].dt.month
df_all_agg_daily['day'] = df_all_agg_daily['date'].dt.day_name()

In [27]:
df_all_agg_daily['day'] = df_all_agg_daily['day'].apply(lambda x: x[0:3])

In [28]:
df_all_agg_daily.head()

Unnamed: 0,date,registered,casual,year,month,day
0,2010-09-20,178,34,2010,9,Mon
1,2010-09-21,215,109,2010,9,Tue
2,2010-09-22,260,117,2010,9,Wed
3,2010-09-23,249,124,2010,9,Thu
4,2010-09-24,206,156,2010,9,Fri


Great. So now we have a dataframe with the following columns: <br>
date, total registered users, total casual users, year, month, day

Now we want to create the same kind of data, but on an hourly basis.

In [29]:
df_all_years_hourly = concatenated_df.copy(deep = True)

In [30]:
df_all_years_hourly.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member


In [31]:
df_all_years_hourly['Start date'] = pd.to_datetime(df_all_years_hourly['Start date'])

In [32]:
df_all_years_hourly['End date'] = pd.to_datetime(df_all_years_hourly['End date'])

In [33]:
df_all_years_hourly['hour'] = df_all_years_hourly['Start date'].dt.hour

In [34]:
df_all_years_hourly.head(5)

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type,hour
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member,11
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member,11
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member,12
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member,12
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member,12


In [35]:
#df_all_years_hourly['Start date'] = df_all_years_hourly['Start date'].dt.date

Select rows where Member type == 'Member' (i.e. 'registered') <br>
Then do groupby + agg to find total number of registered users per hour

In [36]:
df_agg_registered_hourly = df_all_years_hourly [df_all_years_hourly['Member type'] == 'Member'].groupby(pd.Grouper(key = 'Start date', freq = 'H')).count().rename(columns={'Member type': 'Registered'})

In [37]:
df_agg_registered_hourly = df_agg_registered_hourly[['Registered']]

In [38]:
df_agg_registered_hourly.head()

Unnamed: 0_level_0,Registered
Start date,Unnamed: 1_level_1
2010-09-20 11:00:00,2
2010-09-20 12:00:00,17
2010-09-20 13:00:00,11
2010-09-20 14:00:00,4
2010-09-20 15:00:00,10


In [39]:
df_agg_registered_hourly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 63829 entries, 2010-09-20 11:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Registered  63829 non-null  int64
dtypes: int64(1)
memory usage: 997.3 KB


In [40]:
# This contains number of registered users for each day.
df_agg_registered_hourly

Unnamed: 0_level_0,Registered
Start date,Unnamed: 1_level_1
2010-09-20 11:00:00,2
2010-09-20 12:00:00,17
2010-09-20 13:00:00,11
2010-09-20 14:00:00,4
2010-09-20 15:00:00,10
...,...
2017-12-31 19:00:00,55
2017-12-31 20:00:00,30
2017-12-31 21:00:00,44
2017-12-31 22:00:00,21


Select rows where Member type == 'Casual'  <br>
Then do groupby + agg to find total number of casual users per hour

In [41]:
df_agg_casual_hourly = df_all_years_hourly [df_all_years_hourly['Member type'] == 'Casual'].groupby(pd.Grouper(key = 'Start date', freq = 'H')).count().rename(columns={'Member type': 'Casual'})


In [42]:
df_agg_casual_hourly = df_agg_casual_hourly [['Casual']]

In [43]:
df_agg_casual_hourly.head()

Unnamed: 0_level_0,Casual
Start date,Unnamed: 1_level_1
2010-09-20 14:00:00,2
2010-09-20 15:00:00,2
2010-09-20 16:00:00,5
2010-09-20 17:00:00,2
2010-09-20 18:00:00,3


In [44]:
df_agg_casual_hourly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 63826 entries, 2010-09-20 14:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Casual  63826 non-null  int64
dtypes: int64(1)
memory usage: 997.3 KB


In [45]:
# This contains number of casual users for each day.
df_agg_casual_hourly

Unnamed: 0_level_0,Casual
Start date,Unnamed: 1_level_1
2010-09-20 14:00:00,2
2010-09-20 15:00:00,2
2010-09-20 16:00:00,5
2010-09-20 17:00:00,2
2010-09-20 18:00:00,3
...,...
2017-12-31 19:00:00,4
2017-12-31 20:00:00,0
2017-12-31 21:00:00,2
2017-12-31 22:00:00,4


In [46]:
df_all_agg_hourly = pd.merge(df_agg_registered_hourly, df_agg_casual_hourly, how = 'outer', on = ['Start date'])

In [47]:
df_all_agg_hourly.head()
#df_all_agg_hourly[df_all_agg_hourly['Start date'] == '2011-01-01'].head(50)

Unnamed: 0_level_0,Registered,Casual
Start date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-09-20 11:00:00,2,
2010-09-20 12:00:00,17,
2010-09-20 13:00:00,11,
2010-09-20 14:00:00,4,2.0
2010-09-20 15:00:00,10,2.0


In [48]:
df_all_agg_hourly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 63829 entries, 2010-09-20 11:00:00 to 2017-12-31 23:00:00
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Registered  63829 non-null  int64  
 1   Casual      63826 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [49]:
df_all_agg_hourly.reset_index(inplace = True)

In [50]:
df_all_agg_hourly.head()

Unnamed: 0,Start date,Registered,Casual
0,2010-09-20 11:00:00,2,
1,2010-09-20 12:00:00,17,
2,2010-09-20 13:00:00,11,
3,2010-09-20 14:00:00,4,2.0
4,2010-09-20 15:00:00,10,2.0


In [51]:
df_all_agg_hourly['hour'] = df_all_agg_hourly['Start date'].dt.hour

In [52]:
df_all_agg_hourly.head()

Unnamed: 0,Start date,Registered,Casual,hour
0,2010-09-20 11:00:00,2,,11
1,2010-09-20 12:00:00,17,,12
2,2010-09-20 13:00:00,11,,13
3,2010-09-20 14:00:00,4,2.0,14
4,2010-09-20 15:00:00,10,2.0,15


In [53]:
df_all_agg_hourly['Start date'] = df_all_agg_hourly['Start date'].dt.date

In [54]:
df_all_agg_hourly.head()

Unnamed: 0,Start date,Registered,Casual,hour
0,2010-09-20,2,,11
1,2010-09-20,17,,12
2,2010-09-20,11,,13
3,2010-09-20,4,2.0,14
4,2010-09-20,10,2.0,15


In [55]:
# But we know that NaN are all zeros. So:

df_all_agg_hourly['Registered'] = df_all_agg_hourly['Registered'].fillna(0)
df_all_agg_hourly['Casual'] = df_all_agg_hourly['Casual'].fillna(0)

In [56]:
df_all_agg_hourly.head()

Unnamed: 0,Start date,Registered,Casual,hour
0,2010-09-20,2,0.0,11
1,2010-09-20,17,0.0,12
2,2010-09-20,11,0.0,13
3,2010-09-20,4,2.0,14
4,2010-09-20,10,2.0,15


In [57]:
df_all_agg_hourly['Start date'] = pd.to_datetime(df_all_agg_hourly['Start date'])

In [58]:
df_all_agg_hourly['year'] = df_all_agg_hourly['Start date'].dt.year
df_all_agg_hourly['month'] = df_all_agg_hourly['Start date'].dt.month
df_all_agg_hourly['day'] = df_all_agg_hourly['Start date'].dt.day_name()
df_all_agg_hourly['day'] = df_all_agg_hourly['day'].apply(lambda x: x[0:3])

In [59]:
df_all_agg_hourly.head()

Unnamed: 0,Start date,Registered,Casual,hour,year,month,day
0,2010-09-20,2,0.0,11,2010,9,Mon
1,2010-09-20,17,0.0,12,2010,9,Mon
2,2010-09-20,11,0.0,13,2010,9,Mon
3,2010-09-20,4,2.0,14,2010,9,Mon
4,2010-09-20,10,2.0,15,2010,9,Mon


In [60]:
df_all_agg_hourly = df_all_agg_hourly.rename(columns = {'Start date': 'date',
                                    'Registered': 'registered',
                                    'Casual': 'casual'})

In [61]:
df_all_agg_daily.head()

Unnamed: 0,date,registered,casual,year,month,day
0,2010-09-20,178,34,2010,9,Mon
1,2010-09-21,215,109,2010,9,Tue
2,2010-09-22,260,117,2010,9,Wed
3,2010-09-23,249,124,2010,9,Thu
4,2010-09-24,206,156,2010,9,Fri


In [62]:
df_all_agg_daily ['year_month'] = df_all_agg_daily['date'].dt.to_period('M')
df_all_agg_hourly ['year_month'] = df_all_agg_hourly['date'].dt.to_period('M')

In [63]:
df_all_agg_daily.head()
df_all_agg_hourly.head()

Unnamed: 0,date,registered,casual,hour,year,month,day,year_month
0,2010-09-20,2,0.0,11,2010,9,Mon,2010-09
1,2010-09-20,17,0.0,12,2010,9,Mon,2010-09
2,2010-09-20,11,0.0,13,2010,9,Mon,2010-09
3,2010-09-20,4,2.0,14,2010,9,Mon,2010-09
4,2010-09-20,10,2.0,15,2010,9,Mon,2010-09


In [64]:
df_all_agg_daily.info()
df_all_agg_hourly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2660 entries, 0 to 2659
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        2660 non-null   datetime64[ns]
 1   registered  2660 non-null   int64         
 2   casual      2660 non-null   int64         
 3   year        2660 non-null   int64         
 4   month       2660 non-null   int64         
 5   day         2660 non-null   object        
 6   year_month  2660 non-null   period[M]     
dtypes: datetime64[ns](1), int64(4), object(1), period[M](1)
memory usage: 166.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63829 entries, 0 to 63828
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        63829 non-null  datetime64[ns]
 1   registered  63829 non-null  int64         
 2   casual      63829 non-null  float64       
 3   hour        63829 non-nu

In [65]:
# Export dataframes

df_all_agg_daily.to_csv('cleaned_data/df_all_agg_daily.csv', index = False)
df_all_agg_hourly.to_csv('cleaned_data/df_all_agg_hourly.csv', index = False)



In [66]:
df_all_years.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member


In [67]:
df_all_years = df_all_years.rename(columns = {'Duration': 'duration',
                                'Start date': 'start_date',
                                'End date': 'end_date',
                                'Start station number': 'start_station_number',
                                'Start station': 'start_station',
                                'End station number': 'end_station_number',
                                'End station': 'end_station',
                                'Bike number': 'bike_number',
                                'Member type': 'member_type'
                                })

Want to export df_all_years (full data set) <br>
But the export / import is taking so long. <br>
Maybe work with just 3 years' data first. 

In [76]:
df_all_years.tail()

Unnamed: 0,duration,start_date,end_date,start_station_number,start_station,end_station_number,end_station,bike_number,member_type
19117638,277,2017-12-31 23:43:17,2017-12-31 23:47:54,31109,7th & T St NW,31118,3rd & Elm St NW,W21036,Member
19117639,399,2017-12-31 23:51:55,2017-12-31 23:58:35,31125,15th & W St NW,31110,20th St & Florida Ave NW,W23147,Casual
19117640,393,2017-12-31 23:55:19,2018-01-01 00:01:52,31209,1st & N St SE,31609,Maine Ave & 7th St SW,W20144,Member
19117641,1319,2017-12-31 23:57:42,2018-01-01 00:19:42,31102,11th & Kenyon St NW,31102,11th & Kenyon St NW,W20860,Member
19117642,266,2017-12-31 23:58:21,2018-01-01 00:02:48,31229,New Hampshire Ave & T St NW,31119,14th & Belmont St NW,W01459,Member


<h3> Quick comparison of missing addresses with unique addresses from 2017 </h3>

<h5> Starts here </h5>

In [77]:
df_2017 = df_all_years[df_all_years['start_date'].dt.year == 2017]

In [78]:
df_2017_stations = df_2017['start_station'].unique()

In [None]:
print(list(df_2017_stations))

In [91]:
missing_addresses = ['Crystal City Metro / 18th & Bell St', '21st & M St NW', 'Eastern Market Metro / Pennsylvania Ave & 7th St SE', 
'Connecticut Ave & Newark St NW / Cleveland Park',
'18th & Eads St.',
'19th & L St NW',
'23rd & Crystal Dr',
'Aurora Hills Community Ctr/18th & Hayes St',
'S Joyce & Army Navy Dr',
'Georgia Ave and Fairmont St NW',
'20th & Crystal Dr', 
'S Glebe & Potomac Ave',
'USDA / 12th & Independence Ave SW',
'27th & Crystal Dr',
'Pentagon City Metro / 12th & S Hayes St',
'12th & Army Navy Dr',
'26th & S Clark St',
'15th & Crystal Dr',
'Eads & 22nd St S',
'1st & N St  SE',
'Lynn & 19th St North',
'N Rhodes & 16th St N',
'Rosslyn Metro / Wilson Blvd & Ft Myer Dr',
'Wilson Blvd & Franklin Rd',
'11th & H St NE']

included = 0
missing = 0
for station in missing_addresses:
    if station in list(df_2017_stations):
        included += 1
    else:
        missing +=1

included, missing


(25, 0)

<h5> Ends here </h5>

In [69]:
df_all_years_shortened = df_all_years.copy(deep = True)

In [70]:
df_all_years_shortened = df_all_years_shortened [(df_all_years_shortened['start_date'].dt.year >= 2011) &(df_all_years_shortened['start_date'].dt.year <= 2013)]

In [71]:
df_all_years_shortened.head()
#df_all_years_shortened.tail()

Unnamed: 0,duration,start_date,end_date,start_station_number,start_station,end_station_number,end_station,bike_number,member_type
115597,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member
115598,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual
115599,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member
115600,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member
115601,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual


In [72]:
df_all_years_shortened.to_csv('cleaned_data/df_all_years.csv', encoding = 'utf-8', index = False)

Run time: 32 s

In [74]:
df_2011 = df_all_years_shortened[df_all_years_shortened['start_date'].dt.year == 2011]
df_2012 = df_all_years_shortened[df_all_years_shortened['start_date'].dt.year == 2012]
df_2013 = df_all_years_shortened[df_all_years_shortened['start_date'].dt.year == 2013]

In [75]:
df_2011.to_csv('cleaned_data/df_2011.csv', encoding = 'utf-8', index = False)
df_2012.to_csv('cleaned_data/df_2012.csv', encoding = 'utf-8', index = False)
df_2013.to_csv('cleaned_data/df_2013.csv', encoding = 'utf-8', index = False)