In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_columns', None)

Load, clean, and write back to disk.

In [2]:
# Read files from directory.
files = [x for x in os.listdir('data') if '_data_only' in x]

# Sort by earlier year first.
files = list(sorted(files, key=lambda x: int(x[:4])))

print('File names:', files)

# Load into memory.
dfs = [
    pd.read_csv('data/' + file_name, delimiter=',', quotechar='"', encoding='ISO-8859-1')
    for file_name in files
]

# Clean up column names for consistency. 
for i, dframe in enumerate(dfs):
    dfs[i].columns = [
        x \
            .strip() \
            .lower() \
            .replace(' ', '_') \
            .replace('\n', '_') \
            .replace('ï»¿agency', 'agency')
        for x in dframe.columns
    ]

# Check for columns that are not shared between dataframes.
cols = [set(dframe.columns) for dframe in dfs]

cols_not_shared = []
common_columns_across_dfs = set.intersection(*cols)
for i, col_set in enumerate(cols):
    other_cols = set.union(*(cols[: i] + cols[i + 1 :])) 
    unique_to_this_df = col_set - other_cols  
    missing_from_this_df = other_cols - col_set  
    cols_not_shared.extend(unique_to_this_df)

for i, df in enumerate(dfs):
    for col in cols_not_shared:
        if col in df.columns:
            print(f'Dropping {col} because col is not shared.')
            dfs[i] = df.drop(col, axis=1)

# Add year to each dataframe.
for i, year in enumerate(range(2019, 2023)):
    dfs[i]['year'] = year

# Combine into one dataframe.
df = pd.DataFrame()
for dframe in dfs:
    df = pd.concat([df, dframe], axis=0)

# Drop cols with many NaN values.
for col in df.columns:
    if df[col].isna().sum() / len(df) > 0.9:
        print(f'Dropping {col} because % NaN is: {df[col].isna().sum() / len(df)}')
        df = df.drop(col, axis=1)

# Remove extra characters from numeric columns and cast to float.
numeric_cols = [
    'cost_per_hour',
    'primary_uza_population',
    'vehicle_revenue_miles',
    'passenger_miles',
    'vehicle_revenue_hours',
    'unlinked_passenger_trips',
    'total_operating_expenses',
    'fare_revenues_earned',
    'cost_per_passenger_mile',
    'cost_per_passenger',
    'passengers_per_hour',
    'cost_per__hour',
    'fare_revenues_per_total_operating_expense_(recovery_ratio)',
    'primary_uza__population',
    'agency_voms',
    'mode_voms',
    'fare_revenues_per_unlinked_passenger_trip'
]

for col in numeric_cols:
    df[col] = df[col].apply(lambda x: (
        str(x) \
            .replace('$', '') \
            .replace('(', '') \
            .replace(')', '') \
            .replace(',', '')
            .strip()
    ))
    df[col] = df[col].astype(float)

display(df.sample(1, random_state=538))

df.to_csv('data/clean_metrics_19_22.csv', index=False)

File names: ['2019_Metrics_data_only.csv', '2020-Metrics_data_only.csv', '2021 Metrics_static_data_only.csv', '2022_NTD_Annual_Data_-_Metrics_20231102_data_only.csv']
Dropping 0 because col is not shared.
Dropping uace_code because col is not shared.
Dropping cost_per_hour because col is not shared.
Dropping primary_uza_population because col is not shared.
Dropping uza_name because col is not shared.
Dropping passengers_per_vehicle_revenue_hour because col is not shared.
Dropping ratios: because % NaN is: 1.0
Dropping fare_revenues_per_unlinked_passenger_trip_questionable because % NaN is: 0.985341799513645
Dropping fare_revenues_per_total_operating_expense_(recovery_ratio)_questionable because % NaN is: 1.0
Dropping cost_per_hour_questionable because % NaN is: 0.9909483923263983
Dropping passengers_per_hour_questionable because % NaN is: 0.9800054039448798
Dropping cost_per_passenger_questionable because % NaN is: 0.985341799513645
Dropping cost_per_passenger_mile_questionable becaus

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza__population,agency_voms,mode,tos,mode_voms,fare_revenues_per_unlinked_passenger_trip,fare_revenues_per_total_operating_expense_(recovery_ratio),cost_per__hour,passengers_per_hour,cost_per_passenger,cost_per_passenger_mile,fare_revenues_earned,total_operating_expenses,unlinked_passenger_trips,vehicle_revenue_hours,passenger_miles,vehicle_revenue_miles,any_data_questionable?,year,uace_code,uza_name,primary_uza_population,cost_per_hour
94,"Pace-Suburban Bus Division, ADA Paratransit Se...",Arlington Heights,IL,5182,50182,Independent Public Agency or Authority of Tran...,Full Reporter,8608208.0,1217.0,DR,TX,370.0,2.56,0.05,94.96,2.0,47.56,3.96,425141.0,7888853.0,165883.0,83073.0,1991800.0,1535075.0,No,2020,,,,
