# Creating aggregate bike data

In [None]:
import pandas as pd
from datetime import datetime, date
import os

In [None]:
# Looks at monthly bike data and selects relevant columns

def get_columns(df):
    """Return the names of the columns to keep from one monthly table.

    `df` only needs to support `in` and iteration over its column names
    (a pandas DataFrame does both).

    The returned list is ordered as: 'Departure', then 'Electric bike' if
    present, then the distance and duration columns, then every column whose
    name starts with 'Departure temperature' or 'Return temperature', in the
    order they appear in `df`.
    """
    temperature_prefixes = ('Departure temperature', 'Return temperature')

    electric = ['Electric bike'] if 'Electric bike' in df else []
    temperatures = [name for name in df if name.startswith(temperature_prefixes)]

    return ['Departure', *electric,
            'Covered distance (m)', 'Duration (sec.)',
            *temperatures]

In [None]:
# Associates an operation to perform on each column

def operations(cols):
    """Build the column -> aggregation mapping used when grouping by day.

    'Departure' is counted and distance and duration are summed.
    'Electric bike' is summed when it is listed in `cols`. Every column in
    `cols` whose name starts with 'Departure temperature' or
    'Return temperature' is averaged.
    """
    temperature_prefixes = ('Departure temperature', 'Return temperature')

    totals = {'Departure': 'count',
              'Covered distance (m)': 'sum',
              'Duration (sec.)': 'sum'}
    electric = {'Electric bike': 'sum'} if 'Electric bike' in cols else {}
    means = {name: 'mean' for name in cols if name.startswith(temperature_prefixes)}

    # Merge in this order so the keys come out as totals, electric, means
    return {**totals, **electric, **means}

In [None]:
# Specifies new column names to be used after
# operations have been applied

def rename(cols):
    """Build the old-name -> new-name mapping applied after aggregation.

    The three core columns always get an entry. 'Electric bike' gets one only
    when it is listed in `cols`. Any column in `cols` whose name starts with
    'Departure temperature' / 'Return temperature' maps to the corresponding
    'Mean ... temperature (C)' label.
    """
    # (column-name prefix, label after aggregation)
    temperature_labels = (
        ('Departure temperature', 'Mean departure temperature (C)'),
        ('Return temperature', 'Mean return temperature (C)'),
    )

    renamed = {'Departure': 'Bike trips',
               'Covered distance (m)': 'Total distance (m)',
               'Duration (sec.)': 'Total duration (sec)'}

    if 'Electric bike' in cols:
        renamed['Electric bike'] = 'Electric bike trips'

    for column in cols:
        for prefix, label in temperature_labels:
            if column.startswith(prefix):
                renamed[column] = label

    return renamed

In [None]:
def get_date(d):
    """Return the result of `d.date()`, i.e. the calendar-day part of `d`."""
    calendar_day = d.date()
    return calendar_day

In [None]:
# Build `aggregate_data`: one row per (file, calendar day) holding the daily
# trip count, distance/duration totals and mean temperatures.

RAW_DATA_DIR = 'raw data'

# names of monthly data files (sorted so the processing order is reproducible)
files = sorted(file_name for file_name in os.listdir(RAW_DATA_DIR)
               if file_name.endswith('.csv'))

# Daily summaries are collected here and concatenated once after the loop;
# concatenating inside the loop would re-copy the growing frame every time.
daily_frames = []

for file_name in files:
    # Load a monthly data file
    raw_data = pd.read_csv(os.path.join(RAW_DATA_DIR, file_name))
    print(file_name)

    # Identify relevant columns and keep only complete rows
    cols = get_columns(raw_data)
    proc_data = raw_data[cols].dropna().copy()

    # Nothing usable in this file: there is no example date to inspect
    if proc_data.empty:
        continue

    # Convert 'Departure' strings into datetime objects.
    # The format is detected from the first remaining row. It is read by
    # position because dropna() may have removed the row labelled 0.
    example_date = proc_data['Departure'].iloc[0]
    if '-' in example_date:
        date_format = '%Y-%m-%d %H:%M'
    elif len(example_date.split()[0].split('/')[2]) == 2:
        # Two-digit year, e.g. 4/1/24
        date_format = '%m/%d/%y %H:%M'
    else:
        date_format = '%m/%d/%Y %H:%M'
    proc_data['Departure'] = proc_data['Departure'].apply(
        lambda x: datetime.strptime(x, date_format))

    # Group data by date (in 'Departure' column)
    # and apply an operation (e.g. sum, mean) to each column
    proc_data = proc_data.groupby(proc_data['Departure'].apply(get_date)).agg(operations(cols))
    proc_data.index.name = 'Date'

    # Rename columns after grouping and transforming
    proc_data = proc_data.rename(columns=rename(cols))

    # Keep the processed monthly data for the final concatenation
    daily_frames.append(proc_data)

if daily_frames:
    aggregate_data = pd.concat(daily_frames).sort_index(ascending=True)
    # Later cells address the index as 'Date'; set it explicitly
    aggregate_data.index.name = 'Date'
else:
    # No usable files: same empty result the loop would have left behind
    aggregate_data = pd.DataFrame({})

We now have a DataFrame containing all bike data compressed to a daily scale. Note that the first day of each month appears in the raw data file for that month *and* in the file for the preceding month (e.g., there are April 1, 2024 bike trips logged in both the April 2024 and March 2024 files). As a result, our aggregate DataFrame has a repeated index entry on the first of each month. For example:

In [None]:
# Show everything stored under April 1, 2024
aggregate_data.loc[date(year=2024, month=4, day=1)]

We can combine these rows by summing the `Bike trips`, `Total distance (m)`, `Total duration (sec)`, and `Electric bike trips` columns and taking an average, weighted by the number of bike trips in each row, of the `Mean departure temperature (C)` and `Mean return temperature (C)` columns.

In [None]:
# Merge rows that share a date into a single row per date.
#
# Totals are added together; mean temperatures are combined as an average
# weighted by each row's 'Bike trips'. Any number of rows per date is handled,
# and columns that are absent from the data are simply skipped. Rows whose
# date is not repeated are carried over unchanged.

# Switch to integer indexing so repeated dates can be told apart
daily = aggregate_data.rename_axis('Date').reset_index()

# Columns holding daily totals vs. per-trip means (only those actually present)
total_columns = [name for name in ('Bike trips', 'Total distance (m)',
                                   'Total duration (sec)', 'Electric bike trips')
                 if name in daily.columns]
mean_columns = [name for name in ('Mean departure temperature (C)',
                                  'Mean return temperature (C)')
                if name in daily.columns]

# True for every row whose date occurs more than once
is_repeated = daily['Date'].duplicated(keep=False)

if is_repeated.any():
    repeated = daily[is_repeated].copy()

    # Turn each mean into a trip-weighted total so it can be summed like the rest
    for name in mean_columns:
        repeated[name] = repeated[name] * repeated['Bike trips']

    def _sum_keep_missing(values):
        # Like a + b: a missing value in any row leaves the result missing
        return values.sum(skipna=False)

    combined = (repeated.groupby('Date')[total_columns + mean_columns]
                        .agg(_sum_keep_missing))

    # Back from weighted totals to weighted means
    for name in mean_columns:
        combined[name] = combined[name] / combined['Bike trips']

    # Restore the original column order
    combined = combined.reindex(columns=[name for name in daily.columns
                                         if name != 'Date'])

    # Switch back to date indexing and put the merged rows next to the rest
    singles = daily[~is_repeated].set_index('Date')
    parts = [part for part in (singles, combined) if not part.empty]
    aggregate_data = pd.concat(parts).sort_index()
else:
    # Nothing to merge; just switch back to date indexing
    aggregate_data = daily.set_index('Date')

Now we have:

In [None]:
# Look up April 1, 2024 again to see the entry after the cell above has run
aggregate_data.loc[date(year=2024, month=4, day=1)]

In [None]:
# True if any date still appears more than once in the index
not aggregate_data.index.is_unique