<h2> Overview </h2>

- We want to create a dataframe for each of the years from 2011 to 2019.
- Each dataframe will contain the following columns:
    - Name of active station
    - Region
    - Longitude
    - Latitude

In [6]:
import pandas as pd
import numpy as np

In [4]:
# load dataframes
# Lookup table of stations; later cells read its 'name', 'region', 'lat' and 'lon' columns.
station_loc = pd.read_csv('cleaned_data/station_loc.csv')


def load_trips(year):
    """Read the cleaned trip file for `year`, parsing both date columns as datetimes."""
    return pd.read_csv(f'cleaned_data/df_{year}.csv', parse_dates=['start_date', 'end_date'])


# One trip dataframe per year, 2011 through 2019 (range end is exclusive).
# The per-year names stay bound because later cells refer to them directly.
(
    df_2011, df_2012, df_2013, df_2014, df_2015,
    df_2016, df_2017, df_2018, df_2019,
) = (load_trips(year) for year in range(2011, 2020))

- run time: 42 s

- For each year, we want to create a separate dataframe with the columns "station", "region", "lat" and "lon".

In [9]:
# create a list of all dataframes per year
dataframes = [df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019]


def active_stations(trips):
    """Return the sorted, de-duplicated station names used in one year of trips.

    A station counts as active when it appears in either the 'start_station'
    or the 'end_station' column of `trips`.
    """
    return np.union1d(trips['start_station'].unique(), trips['end_station'].unique())


def attach_locations(station_names, locations):
    """Build a dataframe with columns 'station', 'region', 'lat', 'lon'.

    Parameters
    ----------
    station_names : array-like
        Station names, one output row per entry, order preserved.
    locations : pd.DataFrame
        Lookup table with columns 'name', 'region', 'lat' and 'lon'.

    Returns
    -------
    pd.DataFrame
        One row per station name. Stations that have no row in `locations`
        get NaN for 'region', 'lat' and 'lon' (instead of a 0.0 placeholder
        that could be mistaken for a real coordinate).
    """
    lookup = (
        locations[['name', 'region', 'lat', 'lon']]
        # If a name occurs more than once, keep the last row for it so the
        # join cannot multiply station rows.
        .drop_duplicates(subset='name', keep='last')
    )
    return (
        pd.DataFrame({'station': station_names})
        # A left join keeps every active station, in the original order.
        .merge(lookup, how='left', left_on='station', right_on='name')
        .drop(columns='name')
    )


# For each dataframe df_20xx, collect all docking stations that were active that year
stations = [active_stations(trips) for trips in dataframes]

# Now for every year, get the "region", "lat" and "lon" columns from the station_loc dataframe
stations_dataframes = [attach_locations(names, station_loc) for names in stations]

# Assign each dataframe to variable (same order as `dataframes`: 2011 ... 2019)
(
    stations_2011, stations_2012, stations_2013, stations_2014, stations_2015,
    stations_2016, stations_2017, stations_2018, stations_2019,
) = stations_dataframes


- run time: 26 s

In [10]:
# Preview the first five rows of the 2019 station table
stations_2019.head(n=5)

Unnamed: 0,station,region,lat,lon
0,10th & E St NW,"Washington, DC",38.895914,-77.026064
1,10th & Florida Ave NW,"Washington, DC",38.920387,-77.025672
2,10th & G St NW,"Washington, DC",38.898243,-77.026235
3,10th & K St NW,"Washington, DC",38.9024,-77.02622
4,10th & Monroe St NE,"Washington, DC",38.932457,-76.993534


In [11]:
# Save dataframes as csv files
# Explicit year -> dataframe pairing so each table lands in the file named after its year.
stations_by_year = {
    2011: stations_2011,
    2012: stations_2012,
    2013: stations_2013,
    2014: stations_2014,
    2015: stations_2015,
    2016: stations_2016,
    2017: stations_2017,
    2018: stations_2018,
    2019: stations_2019,
}

for year, station_table in stations_by_year.items():
    # index=False: the row index is left out of the saved file.
    station_table.to_csv(f'cleaned_data/stations_{year}.csv', index=False)
