In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under that path (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import required packages
import numpy as np
import pandas as pd
from plotnine import *
from datetime import datetime

In [None]:
def clean_csv(file_path):
    """Load a raw arrival/departure CSV and keep only trip start and end rows.

    Only the columns that are actually needed are parsed from the file, so
    the unused columns never occupy memory.

    Parameters
    ----------
    file_path
        Location of the raw CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``route_id, stop_id, direction_id, half_trip_id, point_type,
        scheduled, actual`` (in that order), restricted to rows whose
        ``point_type`` is ``'Startpoint'`` or ``'Endpoint'`` and sorted by
        ``half_trip_id`` with the ``'Startpoint'`` row ahead of the
        ``'Endpoint'`` row. ``scheduled`` and ``actual`` are reformatted as
        ``HH:MM:SS`` strings.

    Raises
    ------
    KeyError
        If any of the required columns is absent from the file.
    """
    required_columns = ['route_id', 'stop_id', 'direction_id', 'half_trip_id',
                        'point_type', 'scheduled', 'actual']

    # Parse only the required columns. A callable is used instead of a list so
    # that a missing column still surfaces as the KeyError raised by the
    # selection below rather than as a ValueError from read_csv.
    df = pd.read_csv(file_path, usecols=lambda name: name in required_columns)

    # read_csv keeps the file's own column order; enforce the fixed order here
    df = df[required_columns]

    # Keep only rows where 'point_type' is 'Startpoint' or 'Endpoint'
    df = df[df['point_type'].isin(['Startpoint', 'Endpoint'])]

    # Order rows by 'half_trip_id' first; 'point_type' is sorted descending
    # because 'Startpoint' comes after 'Endpoint' alphabetically
    df = df.sort_values(by=['half_trip_id', 'point_type'], ascending=[True, False])

    # Keep only the time of day from 'scheduled' and 'actual' (the date part is dropped)
    df['scheduled'] = pd.to_datetime(df['scheduled']).dt.strftime('%H:%M:%S')
    df['actual'] = pd.to_datetime(df['actual']).dt.strftime('%H:%M:%S')

    return df


In [None]:
def save_cleaned_csv(df, month, output_dir='/content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets'):
    """Write a cleaned DataFrame to ``clean_MBTA_<MM>.csv`` and report the path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write; the index is not written.
    month : int
        Month number used in the file name, zero-padded to two digits.
    output_dir : str, optional
        Directory that receives the file. Defaults to the project's Datasets
        folder on the mounted Drive, so existing two-argument calls write to
        the same location as before.
    """
    output_file = f'{output_dir}/clean_MBTA_{month:02d}.csv'
    df.to_csv(output_file, index=False)
    print(f'Cleaned file saved as: {output_file}')

In [None]:
# Entry point: clean every monthly raw file and write its cleaned counterpart
def process_all_files():
    """Clean and save the 2022 raw CSV for each month from 01 through 12.

    For every month the raw file path is printed, the file is cleaned with
    ``clean_csv`` and the result is handed to ``save_cleaned_csv``.
    """
    raw_template = "/content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-{:02d}.csv"
    for month_no in range(1, 13):
        raw_path = raw_template.format(month_no)
        print('Cleaning file: ' + raw_path)
        save_cleaned_csv(clean_csv(raw_path), month_no)

# Process all files. The input and output paths live under /content/drive,
# so the Drive mount cell must have run first; this reads twelve raw CSVs
# and writes twelve cleaned CSVs.
process_all_files()

Cleaning file: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-01.csv
Cleaned file saved as: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/clean_MBTA_01.csv
Cleaning file: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-02.csv
Cleaned file saved as: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/clean_MBTA_02.csv
Cleaning file: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-03.csv
Cleaned file saved as: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/clean_MBTA_03.csv
Cleaning file: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-04.csv
Cleaned file saved as: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/clean_MBTA_04.csv
Cleaning file: /content/drive/My Drive/WORK/SI_BusTransit_Project/Datasets/MBTA-Bus-Arrival-Departure-Times_2022-05.