In [1]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join

In [7]:
# Directory that holds the raw per-period Mobi export files.
DATA_FOLDER = "../data/raw_mobibikes_data/"
# Common filename prefix used when naming the consolidated output files.
PREFIX = "Mobi_System_Data_"
# When True, the loading loop prints the name of each file as it is read.
VERBOSE = True

In [58]:
# One-off conversion input: the 2017 workbook is read from directly under
# ../data/ (not from DATA_FOLDER).
test_file = "2017.xlsx"
workbook_path = "../data/" + PREFIX + test_file
df = pd.read_excel(workbook_path)

In [60]:
# Write the 2017 frame to ../data/ as a CSV.
# NOTE(review): compression="zip" makes the written file a ZIP archive even
# though its name ends in ".csv", so anything reading it back must pass
# compression="zip" explicitly. The default index=True also writes the row
# index as an unnamed first column. Confirm both are intended.
csv_2017_path = "../data/" + PREFIX + "2017.csv"
df.to_csv(csv_2017_path, compression="zip")

In [3]:
# Collect the names (not full paths) of the regular files that sit directly in
# DATA_FOLDER; anything that is not a file, such as a sub-directory, is skipped.
raw_files = []
for entry in listdir(DATA_FOLDER):
    if isfile(join(DATA_FOLDER, entry)):
        raw_files.append(entry)

In [50]:
# Year to consolidate. The author's note lists 2018, 2019, 2020 and 2021 as the
# values this cell is meant to be run with.
y = 2021 # yrs = [2018, 2019, 2020, 2021]

# Keep the raw files whose name contains the year and no '~' (presumably
# editor/Office temporary files -- confirm), in sorted name order.
year_tag = str(y)
files = sorted(
    name for name in raw_files
    if year_tag in name and '~' not in name
)

# Name of the consolidated file written for this year.
output_file_name = PREFIX + year_tag + ".csv"

In [51]:
# Load every raw file selected for this year into its own DataFrame.
dfs = []

# Reader to use for each supported extension (extensions are lower-cased
# before lookup, so ".CSV" and ".csv" are treated alike).
readers = {"csv": pd.read_csv, "xlsx": pd.read_excel}

# Read dataframes of this year
for f in files:
    # Take the text after the LAST dot as the extension. Splitting on every dot
    # and taking element [1] picks the wrong token for names that contain more
    # than one dot and raises IndexError for names with none; with rsplit a
    # dot-less name yields the whole name and is rejected below.
    ext = f.rsplit('.', 1)[-1].lower()

    if VERBOSE:
        print("Processing " + f + " ...")

    if ext not in readers:
        raise ValueError(
            "Unsupported file type for " + repr(f) + "; expected .csv or .xlsx"
        )

    # join() builds the path the same way the directory listing cell does and
    # does not depend on DATA_FOLDER ending with a separator.
    dfs.append(readers[ext](join(DATA_FOLDER, f)))

Processing Mobi_System_Data_2021-01.csv ...
Processing Mobi_System_Data_2021-02.csv ...
Processing Mobi_System_Data_2021-03.csv ...
Processing Mobi_System_Data_2021-04.csv ...
Processing Mobi_System_Data_2021-05.csv ...
Processing Mobi_System_Data_2021-06.csv ...
Processing Mobi_System_Data_2021-07.csv ...


In [52]:
# Sanity check: print the number of columns of each loaded frame, one per line.
column_counts = [len(frame.columns) for frame in dfs]
for count in column_counts:
    print(count)

14
14
14
14
14
14
14


In [36]:
# Show the column labels of the second-to-last loaded frame.
dfs[-2].columns

Index(['Departure', 'Return', 'Bike', 'Departure station', 'Return station',
       'Membership type', 'Covered distance (m)', 'Duration (sec.)',
       'Departure temperature (C)', 'Return temperature (C)', 'Departure slot',
       'Return slot', 'Stopover duration (sec.)', 'Number of stopovers'],
      dtype='object')

In [53]:
# Reference header: the (cleaned) columns of the first frame of the year.
columns = None


def _header_key(name):
    """Reduce a column label to lowercase alphanumerics for tolerant matching."""
    return "".join(ch for ch in str(name).lower() if ch.isalnum())


# Clean column names
for i, df in enumerate(dfs):
    # Drop the columns that are not kept: any label containing "slot" or
    # "Manager". A new frame is built instead of mutating in place, so
    # re-running this cell is safe.
    unwanted = [n for n in df.columns if "slot" in n or "Manager" in n]
    df = df.drop(columns=unwanted).reset_index(drop=True)

    if columns is None:
        columns = df.columns
    elif len(df.columns) > len(columns):
        # A frame wider than the reference cannot be mapped onto it.
        raise ValueError(
            "Frame " + str(i) + " has " + str(len(df.columns))
            + " columns " + str(list(df.columns))
            + " but the reference has only " + str(len(columns))
            + " " + str(list(columns))
        )
    elif len(df.columns) == len(columns):
        # Same width: adopt the reference labels position by position.
        col_mapper = {a: b for a, b in zip(df.columns, columns)}
        df = df.rename(columns=col_mapper)
    else:
        # Fewer columns than the reference: a positional rename would shift
        # labels whenever the missing column is not the last one (e.g. a
        # temperature column would be relabelled as a voltage column). Match
        # on normalised names instead; columns absent from this frame are then
        # filled with NaN by the later concat rather than with wrong data.
        reference_by_key = {_header_key(c): c for c in columns}
        col_mapper = {
            c: reference_by_key[_header_key(c)]
            for c in df.columns
            if _header_key(c) in reference_by_key
        }
        unmatched = [c for c in df.columns if c not in col_mapper]
        if unmatched:
            print("Warning: frame " + str(i)
                  + " has columns with no match in the reference header: "
                  + str(unmatched))
        df = df.rename(columns=col_mapper)

    # Store the cleaned frame back so the concatenation cell picks it up.
    dfs[i] = df

In [54]:
# Stack all frames of the year row-wise into a single frame, discarding the
# per-file indexes in favour of one fresh running index.
res_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [55]:
# Display the concatenated frame for a visual check (the rendered output is
# truncated to the first and last rows).
res_df

Unnamed: 0,Departure,Return,Bike,Departure station,Return station,Membership type,Covered distance (m),Duration (sec.),Departure battery voltage (mV),Return battery voltage (mV),Departure temperature (C),Return temperature (C),Stopover duration (sec.),Number of stopovers
0,2021-02-01 0:00,2021-02-01 0:00,1274.0,0001 10th & Cambie,0074 10th & Main,365 Plus,951.0,259,3590,3790,7,9,0,0
1,2021-02-01 0:00,2021-02-01 0:00,1149.0,0063 Robson & Granville,0063 Robson & Granville,30 Day Pass,2726.0,1200,3636,3764,7,5,0,0
2,2021-02-01 0:00,2021-02-01 0:00,1950.0,0002 Burrard Station (Melville & Dunsmuir),0096 Nicola & Robson,365 Standard,1139.0,274,3911,4047,8,10,0,0
3,2021-02-01 0:00,2021-02-01 0:00,304.0,0096 Nicola & Robson,0032 Comox & Denman,365 Day Founding Plus,2527.0,870,3985,4140,8,10,194,1
4,2021-01-31 23:00,2021-01-31 23:00,591.0,0045 Beach & Hornby,0161 Alexander & Dunlevy,365 Standard,4406.0,882,3905,4206,9,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399767,2021-07-01 0:00,2021-07-01 0:00,1169.0,0014 Canada Place,0063 Robson & Granville,24 Hour,1196.0,823,3997,4064,22,22,0,0
399768,2021-07-01 0:00,2021-07-01 0:00,1892.0,0033 Robson & Denman,0083 Aquatic Centre,365 Standard,1890.0,477,3924,4085,23,23,0,0
399769,2021-07-01 0:00,2021-07-01 0:00,162.0,0014 Canada Place,0063 Robson & Granville,24 Hour,1208.0,870,3840,3953,22,21,0,0
399770,2021-07-01 0:00,2021-07-01 0:00,1997.0,0103 Stanley Park - Third Beach Parking Lot,0207 Alberni & Jervis,24 Hour,4418.0,1671,4045,4177,22,24,0,0


In [56]:
# Write the consolidated frame for the selected year to ../data/.
# NOTE(review): compression="zip" makes the written file a ZIP archive even
# though output_file_name ends in ".csv", so readers must pass
# compression="zip" explicitly. The default index=True also writes the row
# index as an unnamed first column. Confirm both are intended.
output_path = "../data/" + output_file_name
res_df.to_csv(output_path, compression="zip")