In [146]:
!pip install pandas



In [183]:
import pandas as pd

## Bike data validation

def get_data(data_path: str):
    """Read the CSV found at ``data_path`` and hand it back as a DataFrame."""
    loaded_frame = pd.read_csv(data_path)
    return loaded_frame


def drop_columns(dataframe: pd.DataFrame, columns_to_drop: list):
    """Remove the listed columns from ``dataframe``.

    The frame is modified in place; the very same object is returned so the
    call can be nested inside other pipeline steps.
    """
    dataframe.drop(labels=columns_to_drop, axis="columns", inplace=True)
    return dataframe


def replace_chars_on_column(dataframe: pd.DataFrame):
    """Normalise the column names of ``dataframe`` in place.

    Spaces become underscores, names are lower-cased, and the characters
    ".", "(" and ")" are removed, e.g. "Covered distance (m)" becomes
    "covered_distance_m".

    Returns the same DataFrame object, with its ``columns`` replaced.
    """
    chars_to_strip = [".", "(", ")"]

    # All replacements are literal text, so regex=False is required: as a
    # regular expression "." matches every character and a lone "(" or ")"
    # is not even a valid pattern (pandas >= 2.0 no longer special-cases
    # single-character patterns as literals).
    normalised = dataframe.columns.str.replace(' ', '_', regex=False).str.lower()
    for char in chars_to_strip:
        normalised = normalised.str.replace(char, '', regex=False)

    dataframe.columns = normalised
    return dataframe


def change_column_data_type(dataframe: pd.DataFrame):
    """Cast ``covered_distance_m`` and ``duration_sec`` to ``int``.

    Both columns are overwritten on the frame that was passed in, and that
    same frame is returned.
    """
    distance_as_int = dataframe.covered_distance_m.astype(int)
    dataframe.covered_distance_m = distance_as_int

    duration_as_int = dataframe.duration_sec.astype(int)
    dataframe.duration_sec = duration_as_int

    return dataframe


def change_column_names(dataframe, columns_to_change: dict):
    """Rename columns of ``dataframe`` using an ``{old_name: new_name}`` mapping.

    The rename is applied in place and the same frame object is returned.
    """
    dataframe.rename(mapper=columns_to_change, axis="columns", inplace=True)
    return dataframe


def drop_distances_and_durations_under_10(dataframe: pd.DataFrame):
    """Keep only journeys with distance >= 10 and duration >= 10.

    Prints the shape of the incoming frame and how many rows were removed,
    then returns a filtered copy whose ``covered_distance_m`` and
    ``duration_sec`` columns have been converted by
    ``change_column_data_type``. The frame passed in is left unchanged.

    Rows for which either comparison is false are dropped; with pandas this
    also covers missing values, since a NaN comparison evaluates to False.
    """
    print("Original dataframe shape: ", dataframe.shape)

    # Build the boolean mask once and reuse it for both the count and the
    # filter, instead of evaluating and applying the condition twice.
    rows_to_keep = (dataframe.covered_distance_m >= 10) & (dataframe.duration_sec >= 10)
    # .copy() so the dtype conversion below works on an independent frame
    # rather than on a slice of the caller's data.
    filtered = dataframe[rows_to_keep].copy()
    print(f"Number of rows dropped: {len(dataframe) - len(filtered)}")

    return change_column_data_type(filtered)


def concat_dataframes(array_of_dataframes: list):
    """Stack the given DataFrames row-wise into a single frame.

    The original row labels are discarded; the result is indexed 0..n-1.
    """
    # ignore_index=True renumbers the rows during concatenation, which makes
    # a separate reset_index pass afterwards unnecessary.
    return pd.concat(array_of_dataframes, ignore_index=True)


def dataframe_to_csv(dataframe: pd.DataFrame, filename: str):
    """Write ``dataframe`` to ``<filename>_new.csv`` without the index column.

    ``filename`` is the path without extension; "_new.csv" is appended to it.
    """
    output_path = "{}_new.csv".format(filename)
    dataframe.to_csv(output_path, index=False)


def sort_by_column(dataframe: pd.DataFrame, sort_by: str):
    """Return the rows of ``dataframe`` ordered ascending by column ``sort_by``.

    The result is a new frame with a fresh 0..n-1 index.
    """
    ordered_rows = dataframe.sort_values(by=[sort_by], ascending=True)
    renumbered = ordered_rows.reset_index(drop=True)
    return renumbered

In [180]:
# Monthly journey exports, listed in the order they are merged.
journey_csv_paths = ['./data/2021-05.csv', './data/2021-06.csv', './data/2021-07.csv']

# Every month goes through the same steps: load, normalise the column names,
# drop journeys with distance or duration under 10, then remove the station
# name columns. One loop replaces three copy-pasted pipelines.
monthly_dataframes = [
    drop_columns(
        dataframe=drop_distances_and_durations_under_10(replace_chars_on_column(get_data(csv_path))),
        columns_to_drop=['departure_station_name', 'return_station_name'])
    for csv_path in journey_csv_paths
]
# Keep the per-month names bound in case other cells refer to them.
dataframe_1, dataframe_2, dataframe_3 = monthly_dataframes

merged_dataframe = concat_dataframes(monthly_dataframes)

merged_dataframe = change_column_names(merged_dataframe,
                                       columns_to_change={'covered_distance_m': 'distance_m',
                                                          'duration_sec': 'duration_s',
                                                          "departure_station_id": "start_station_id",
                                                          "return_station_id": "end_station_id",
                                                          "departure": "start_time",
                                                          "return": "end_time"})
merged_dataframe = sort_by_column(merged_dataframe, 'start_time')

# NOTE(review): the preview shown under this cell contains identical
# consecutive rows (e.g. the first two rows), so the input appears to hold
# duplicate journeys. They are not removed here; confirm whether they should be.
# Written to './data/helsinki_bike_data_new.csv' (dataframe_to_csv appends "_new.csv").
dataframe_to_csv(merged_dataframe, './data/helsinki_bike_data')

Original dataframe shape:  (814676, 8)
Number of rows dropped: 29378
Original dataframe shape:  (1223482, 8)
Number of rows dropped: 44642
Original dataframe shape:  (1208844, 8)
Number of rows dropped: 44224


Unnamed: 0,start_time,end_time,start_station_id,end_station_id,distance_m,duration_s
0,2021-05-01T00:00:11,2021-05-01T00:04:34,138,138,1057,259
1,2021-05-01T00:00:11,2021-05-01T00:04:34,138,138,1057,259
2,2021-05-01T00:00:30,2021-05-01T00:09:53,17,45,1688,558
3,2021-05-01T00:00:30,2021-05-01T00:09:53,17,45,1688,558
4,2021-05-01T00:00:30,2021-05-01T00:11:55,25,123,2088,679
...,...,...,...,...,...,...
3128753,2021-07-31T23:59:55,2021-08-01T00:08:45,135,115,1307,532
3128754,2021-07-31T23:59:55,2021-08-01T00:08:45,135,115,1307,532
3128755,2021-07-31T23:59:55,2021-08-01T00:03:24,258,260,820,205
3128756,2021-07-31T23:59:59,2021-08-01T00:09:15,113,78,1602,553


In [190]:

# Clean the station list: normalise the column names, discard the columns
# that are not needed, rename the remaining ones and order the rows by
# station id. Each .pipe step passes the frame as the first argument.
station_data = (
    get_data('./data/station_data.csv')
    .pipe(replace_chars_on_column)
    .pipe(drop_columns,
          columns_to_drop=["nimi", "fid", "stad", "operaattor", "namn", "kapasiteet", "adress", "kaupunki"])
    # "x" is renamed to longitude and "y" to latitude.
    .pipe(change_column_names,
          {"id": "station_id", "osoite": "address", "y": "latitude", "x": "longitude"})
    .pipe(sort_by_column, sort_by="station_id")
)
# Written to './data/station_data_new.csv' (dataframe_to_csv appends "_new.csv").
dataframe_to_csv(station_data, "./data/station_data")
station_data


Unnamed: 0,station_id,name,address,longitude,latitude
0,1,Kaivopuisto,Meritori 1,24.950211,60.155370
1,2,Laivasillankatu,Laivasillankatu 14,24.956510,60.160989
2,3,Kapteeninpuistikko,Tehtaankatu 13,24.945018,60.158177
3,4,Viiskulma,Fredrikinkatu 19,24.941776,60.160986
4,5,Sepänkatu,Tehtaankatu 25,24.936285,60.157948
...,...,...,...,...,...
452,767,Ruutikatu,Ruutikatu 10,24.820635,60.223377
453,769,Tiurintie,Kurkijoentie 1,24.814562,60.204951
454,900,Orionintie,Orionintie 3-5,24.771826,60.180118
455,901,O'Bike Station,Rauhalanpuisto 11,24.742219,60.163412
