In [None]:
import pandas as pd
from datetime import datetime
import os

In [None]:
def get_columns(df):
    """Pick the columns of ``df`` that the daily aggregation works on.

    The returned list is ordered as follows:

    1. ``'Departure'`` (always),
    2. ``'Electric bike'`` (only if ``df`` contains it),
    3. ``'Covered distance (m)'`` and ``'Duration (sec.)'`` (always),
    4. every name in ``df`` that starts with ``'Departure temperature'``
       or ``'Return temperature'``, in the order ``df`` yields them.

    Parameters
    ----------
    df : iterable of str supporting ``in``
        Iterating it must yield column names (as a DataFrame does).

    Returns
    -------
    list of str
    """
    temperature_prefixes = ('Departure temperature', 'Return temperature')

    selected = ['Departure']
    if 'Electric bike' in df:
        selected.append('Electric bike')
    selected.extend(['Covered distance (m)', 'Duration (sec.)'])
    # str.startswith accepts a tuple and matches if any prefix matches.
    selected.extend(name for name in df if name.startswith(temperature_prefixes))
    return selected

In [None]:
def operations(cols):
    """Build the ``column -> aggregation`` mapping passed to ``agg``.

    ``'Departure'`` is counted, distance and duration are summed,
    ``'Electric bike'`` is summed when it is present in ``cols``, and every
    column whose name starts with ``'Departure temperature'`` or
    ``'Return temperature'`` is averaged.

    Parameters
    ----------
    cols : iterable of str
        Column names to build the mapping for.

    Returns
    -------
    dict
        Maps column name to aggregation name, in the insertion order
        described above.
    """
    temperature_prefixes = ('Departure temperature', 'Return temperature')

    aggregations = {
        'Departure': 'count',
        'Covered distance (m)': 'sum',
        'Duration (sec.)': 'sum',
    }
    if 'Electric bike' in cols:
        aggregations['Electric bike'] = 'sum'
    # Temperature columns go last, in the order they appear in ``cols``.
    aggregations.update(
        (name, 'mean') for name in cols if name.startswith(temperature_prefixes)
    )
    return aggregations

In [None]:
def rename(cols):
    """Map raw column names to the names used in the aggregated table.

    The three base columns are always mapped. ``'Electric bike'`` is mapped
    only when it is present in ``cols``. Every column starting with
    ``'Departure temperature'`` or ``'Return temperature'`` is mapped to the
    corresponding "Mean ... temperature (C)" name.

    Parameters
    ----------
    cols : iterable of str
        Column names present in the data being renamed.

    Returns
    -------
    dict
        Maps old column name to new column name.
    """
    temperature_labels = {
        'Departure temperature': 'Mean departure temperature (C)',
        'Return temperature': 'Mean return temperature (C)',
    }

    renamed = {
        'Departure': 'Bike trips',
        'Covered distance (m)': 'Total distance (m)',
        'Duration (sec.)': 'Total duration (sec)',
    }
    if 'Electric bike' in cols:
        renamed['Electric bike'] = 'Electric bike trips'
    for name in cols:
        for prefix, label in temperature_labels.items():
            if name.startswith(prefix):
                renamed[name] = label
    return renamed

In [None]:
def get_date(d):
    """Return the calendar-date part of ``d`` by calling its ``date()`` method.

    Parameters
    ----------
    d : object with a ``date()`` method
        For example a ``datetime.datetime``.

    Returns
    -------
    Whatever ``d.date()`` returns.
    """
    day = d.date()
    return day

In [None]:
# Build one table of daily trip statistics from every CSV file in 'raw data'.
files = [file_name for file_name in os.listdir('raw data') if file_name.endswith('.csv')]

# Per-file daily summaries are collected here and concatenated once after the
# loop. Calling pd.concat on every iteration re-copies all rows accumulated so
# far each time.
daily_frames = []

for file_name in files:
    raw_data = pd.read_csv(os.path.join('raw data', file_name))
    print(file_name)
    cols = get_columns(raw_data)
    # Keep only the needed columns and drop rows with any missing value.
    proc_data = raw_data[cols].copy().dropna()

    if proc_data.empty:
        # No complete rows: there is no sample timestamp to detect the date
        # format from, and the file would contribute nothing anyway.
        print('  skipped ' + file_name + ': no complete rows')
        continue

    # The timestamp layout is detected from the first remaining row.
    # Positional access is required here: dropna() keeps the original row
    # labels, so label 0 no longer exists when the first CSV row was dropped.
    example_date = proc_data['Departure'].iloc[0]
    if '-' in example_date:
        date_format = '%Y-%m-%d %H:%M'
    elif len(example_date.split()[0].split('/')[2]) == 2:
        # Two-digit year after the second slash.
        date_format = '%m/%d/%y %H:%M'
    else:
        date_format = '%m/%d/%Y %H:%M'

    # Parse the whole column at once, then group the trips by calendar day.
    proc_data['Departure'] = pd.to_datetime(proc_data['Departure'], format=date_format)
    proc_data = proc_data.groupby(proc_data['Departure'].dt.date).agg(operations(cols))
    proc_data.index.name = 'Date'
    proc_data = proc_data.rename(columns=rename(cols))
    daily_frames.append(proc_data)

# NOTE(review): each file is summarised on its own, so a date that occurs in
# more than one file produces more than one row here. Confirm whether such
# rows should be merged.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# file contributed any rows.
aggregate_data = pd.concat(daily_frames) if daily_frames else pd.DataFrame({})
aggregate_data = aggregate_data.sort_index(ascending=True)

In [None]:
# Show the combined daily summary table as the cell output.
aggregate_data

In [None]:
# Print the index type, column dtypes and non-null counts of the combined table.
aggregate_data.info()

In [None]:
# Show summary statistics for the columns of the combined table.
aggregate_data.describe()