# 1.2 Data Profiling Transit Data

### This script contains the following:

#### 1. Importing libraries and data
#### 2. Data Cleaning
#### 3. Reduce column years to 2018 to 2023
#### 4. Export Dataframes

## 1. Importing libraries and data

In [2]:
# import libraries

import pandas as pd
import numpy as np
import os

In [3]:
# set up path variable for easy import and export of data

path = '/Users/matthewmacbook/Documents/CareerFoundry/Data Immersion/Achievement 6 - Advanced Analytics and Dashboard Design/COVID-19 Public Transit Project'


In [4]:
low_memory = False

In [5]:
# import transit agency datasets

df_trips = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Yearly UPT Aug 2023.csv'))
df_vehicle_miles = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Yearly VRM Aug 2023.csv'))

In [6]:
df_trips.head()

Unnamed: 0,NTD ID,Legacy NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,PT,Bus,...,1012714.0,902626.0,870776.0,854155.0,883312.0,887915.0,455391.0,468104.0,552215.0,400306.0
1,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,TN,Bus,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150541.0
2,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,TX,Bus,...,96244.0,110018.0,110665.0,103861.0,143747.0,177791.0,86460.0,88944.0,110794.0,82622.0
3,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",FB,DO,Ferry,...,0.0,0.0,601942.0,599954.0,664365.0,701608.0,146930.0,286843.0,400407.0,286634.0
4,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",LR,DO,Rail,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_vehicle_miles

Unnamed: 0,NTD ID,Legacy NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,PT,Bus,...,8.675204e+06,6.705641e+06,7.934196e+06,7.802103e+06,8.335875e+06,8.511613e+06,3.913186e+06,4.022799e+06,4.968632e+06,3.582380e+06
1,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,TN,Bus,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,5.293350e+05
2,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",DR,TX,Bus,...,8.056930e+05,1.257248e+06,1.336929e+06,1.411445e+06,1.675832e+06,2.052404e+06,9.052500e+05,9.349570e+05,1.461592e+06,1.089948e+06
3,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",FB,DO,Ferry,...,0.000000e+00,0.000000e+00,5.170400e+04,4.876200e+04,4.970600e+04,5.236200e+04,3.594800e+04,4.960600e+04,5.123600e+04,3.542600e+04
4,1.0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389.0,"Seattle--Tacoma, WA",LR,DO,Rail,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,,,,,,,,,Total,,...,4.008937e+09,4.061266e+09,4.109345e+09,4.137649e+09,4.156137e+09,4.212179e+09,3.367170e+09,3.499593e+09,3.635148e+09,2.519544e+09
2268,,,,,,,,,Reduced Reporters,,...,1.238944e+08,1.353223e+08,1.498180e+08,1.505530e+08,1.556585e+08,1.687136e+08,1.413450e+08,1.355173e+08,1.355173e+08,9.034485e+07
2269,,,,,,,,,Subtotal with Reduced Reporters,,...,4.132831e+09,4.196588e+09,4.259163e+09,4.288202e+09,4.311795e+09,4.380893e+09,3.508515e+09,3.635110e+09,3.770666e+09,2.609889e+09
2270,,,,,,,,,Rural Reporters,,...,4.825635e+08,4.901302e+08,5.248468e+08,5.340822e+08,5.351335e+08,5.196862e+08,4.341634e+08,4.095603e+08,4.095603e+08,2.730402e+08


## 2. Data Cleaning

In [8]:
df_trips.shape

(2272, 32)

In [9]:
df_vehicle_miles.shape

(2272, 32)

In [10]:
# Check for null values within each column
df_trips.isnull().sum()

NTD ID            6
Legacy NTD ID    78
Agency            6
Status            6
Reporter Type     6
UACE CD          12
UZA Name         12
Mode              6
TOS               1
3 Mode            6
2002              3
2003              3
2004              3
2005              3
2006              3
2007              2
2008              2
2009              2
2010              2
2011              2
2012              1
2013              1
2014              1
2015              1
2016              1
2017              1
2018              1
2019              1
2020              1
2021              1
2022              1
2023              1
dtype: int64

In [11]:
# Check for null values within each column
df_vehicle_miles.isnull().sum()

NTD ID            6
Legacy NTD ID    78
Agency            6
Status            6
Reporter Type     6
UACE CD          12
UZA Name         12
Mode              6
TOS               1
3 Mode            6
2002              3
2003              3
2004              3
2005              3
2006              3
2007              2
2008              2
2009              2
2010              2
2011              2
2012              1
2013              1
2014              1
2015              1
2016              1
2017              1
2018              1
2019              1
2020              1
2021              1
2022              1
2023              1
dtype: int64

In [12]:
df_trips_dropped = df_trips.dropna(subset=['UZA Name'], inplace=False)

In [13]:
df_vehicle_miles_dropped = df_vehicle_miles.dropna(subset=['UZA Name'], inplace=False)

In [14]:
df_trips_dropped.shape

(2260, 32)

In [15]:
df_vehicle_miles_dropped.shape

(2260, 32)

In [16]:
# Check for any additional null values
df_trips_dropped.isnull().sum()

NTD ID            0
Legacy NTD ID    72
Agency            0
Status            0
Reporter Type     0
UACE CD           0
UZA Name          0
Mode              0
TOS               0
3 Mode            0
2002              0
2003              0
2004              0
2005              0
2006              0
2007              0
2008              0
2009              0
2010              0
2011              0
2012              0
2013              0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
dtype: int64

In [17]:
# Check for any additional null values
df_vehicle_miles_dropped.isnull().sum()

NTD ID            0
Legacy NTD ID    72
Agency            0
Status            0
Reporter Type     0
UACE CD           0
UZA Name          0
Mode              0
TOS               0
3 Mode            0
2002              0
2003              0
2004              0
2005              0
2006              0
2007              0
2008              0
2009              0
2010              0
2011              0
2012              0
2013              0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
dtype: int64

In [18]:
df_trips_dropped.dtypes

NTD ID           float64
Legacy NTD ID     object
Agency            object
Status            object
Reporter Type     object
UACE CD          float64
UZA Name          object
Mode              object
TOS               object
3 Mode            object
2002             float64
2003             float64
2004             float64
2005             float64
2006             float64
2007             float64
2008             float64
2009             float64
2010             float64
2011             float64
2012             float64
2013             float64
2014             float64
2015             float64
2016             float64
2017             float64
2018             float64
2019             float64
2020             float64
2021             float64
2022             float64
2023             float64
dtype: object

In [19]:
# List of columns to convert to integers
columns_to_convert = ['NTD ID', 'UACE CD', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

# Iterate through the columns of both dataframes and convert them to integers
for column in columns_to_convert:
    df_trips_dropped[column] = pd.to_numeric(df_trips_dropped[column], errors='coerce', downcast='integer')
    df_vehicle_miles_dropped[column] = pd.to_numeric(df_vehicle_miles_dropped[column], errors='coerce', downcast='integer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trips_dropped[column] = pd.to_numeric(df_trips_dropped[column], errors='coerce', downcast='integer')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vehicle_miles_dropped[column] = pd.to_numeric(df_vehicle_miles_dropped[column], errors='coerce', downcast='integer')


In [20]:
df_trips_dropped

Unnamed: 0,NTD ID,Legacy NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,PT,Bus,...,1012714,902626,870776,854155,883312,887915,455391,468104,552215,400306
1,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TN,Bus,...,0,0,0,0,0,0,0,0,0,150541
2,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TX,Bus,...,96244,110018,110665,103861,143747,177791,86460,88944,110794,82622
3,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",FB,DO,Ferry,...,0,0,601942,599954,664365,701608,146930,286843,400407,286634
4,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",LR,DO,Rail,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2261,99423,,City of Glendale,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,...,0,930377,1738122,1637300,1431136,1414940,529112,418715,624155,509462
2262,99424,,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,...,0,0,0,38773,86218,90957,38581,35436,38412,37337
2263,99424,,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,...,0,0,0,781755,1508413,1526844,957115,931575,1139100,710151
2264,99425,,Pomona Valley Transportation Authority,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,...,0,50926,99135,91053,84088,84110,30654,41567,57862,45972


In [21]:
df_vehicle_miles_dropped

Unnamed: 0,NTD ID,Legacy NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,PT,Bus,...,8675204,6705641,7934196,7802103,8335875,8511613,3913186,4022799,4968632,3582380
1,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TN,Bus,...,0,0,0,0,0,0,0,0,0,529335
2,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TX,Bus,...,805693,1257248,1336929,1411445,1675832,2052404,905250,934957,1461592,1089948
3,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",FB,DO,Ferry,...,0,0,51704,48762,49706,52362,35948,49606,51236,35426
4,1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",LR,DO,Rail,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2261,99423,,City of Glendale,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,...,0,351753,699532,707528,720541,720907,735647,864334,868128,581234
2262,99424,,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,...,0,0,0,113379,242653,251734,157970,150265,136655,122088
2263,99424,,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,...,0,0,0,331203,708046,722336,734910,723915,701730,445970
2264,99425,,Pomona Valley Transportation Authority,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,...,0,147143,294035,265517,254522,255161,108487,241948,259825,162906


In [35]:
df_trips_dropped['UACE CD'] = df_trips_dropped['UACE CD'].astype(str).str.zfill(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trips_dropped['UACE CD'] = df_trips_dropped['UACE CD'].astype(str).str.zfill(5)


In [36]:
df_vehicle_miles_dropped['UACE CD'] = df_vehicle_miles_dropped['UACE CD'].astype(str).str.zfill(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vehicle_miles_dropped['UACE CD'] = df_vehicle_miles_dropped['UACE CD'].astype(str).str.zfill(5)


## 3. Reduce column years to 2018 to 2023

In [40]:
# Get list of column headers of trips dataframe
column_headers_trips = df_trips.columns.tolist()
print(column_headers_trips)

['NTD ID', 'Legacy NTD ID', 'Agency', 'Status', 'Reporter Type', 'UACE CD', 'UZA Name', 'Mode', 'TOS', '3 Mode', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']


In [41]:
# Get list of column headers of vehicle miles dataframe
column_headers_vehicle_miles = df_vehicle_miles.columns.tolist()
print(column_headers_vehicle_miles)

['NTD ID', 'Legacy NTD ID', 'Agency', 'Status', 'Reporter Type', 'UACE CD', 'UZA Name', 'Mode', 'TOS', '3 Mode', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']


In [42]:
# Define keep_columns as all columns excluding 2002-2017 columns
keep_columns = ['NTD ID', 'Agency', 'Status', 'Reporter Type', 'UACE CD', 'UZA Name', 'Mode', 'TOS', '3 Mode','2018', '2019', '2020', '2021', '2022', '2023']

# Set new dataframes to include on keep_columns
df_trips_new_headers = df_trips_dropped[keep_columns]
df_vehicle_miles_new_headers = df_vehicle_miles_dropped[keep_columns]

In [43]:
# reset indecies
df_trips_new_headers.reset_index(inplace = True, drop = True)
df_vehicle_miles_new_headers.reset_index(inplace = True, drop = True)

In [44]:
df_trips_new_headers.dtypes

NTD ID            int32
Agency           object
Status           object
Reporter Type    object
UACE CD          object
UZA Name         object
Mode             object
TOS              object
3 Mode           object
2018              int64
2019              int64
2020              int32
2021              int32
2022              int32
2023              int32
dtype: object

In [45]:
df_vehicle_miles_new_headers.dtypes

NTD ID            int32
Agency           object
Status           object
Reporter Type    object
UACE CD          object
UZA Name         object
Mode             object
TOS              object
3 Mode           object
2018              int32
2019              int32
2020              int32
2021              int32
2022              int32
2023              int32
dtype: object

## 4. Export Dataframes

In [46]:
df_trips_new_headers

Unnamed: 0,NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,2018,2019,2020,2021,2022,2023
0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,PT,Bus,883312,887915,455391,468104,552215,400306
1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TN,Bus,0,0,0,0,0,150541
2,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TX,Bus,143747,177791,86460,88944,110794,82622
3,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",FB,DO,Ferry,664365,701608,146930,286843,400407,286634
4,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",LR,DO,Rail,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2255,99423,City of Glendale,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,1431136,1414940,529112,418715,624155,509462
2256,99424,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,86218,90957,38581,35436,38412,37337
2257,99424,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,1508413,1526844,957115,931575,1139100,710151
2258,99425,Pomona Valley Transportation Authority,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,84088,84110,30654,41567,57862,45972


In [30]:
df_trips_new_headers.shape

(2260, 15)

In [47]:
df_vehicle_miles_new_headers

Unnamed: 0,NTD ID,Agency,Status,Reporter Type,UACE CD,UZA Name,Mode,TOS,3 Mode,2018,2019,2020,2021,2022,2023
0,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,PT,Bus,8335875,8511613,3913186,4022799,4968632,3582380
1,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TN,Bus,0,0,0,0,0,529335
2,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",DR,TX,Bus,1675832,2052404,905250,934957,1461592,1089948
3,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",FB,DO,Ferry,49706,52362,35948,49606,51236,35426
4,1,King County Department of Metro Transit,Active,Full Reporter: Operating,80389,"Seattle--Tacoma, WA",LR,DO,Rail,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2255,99423,City of Glendale,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,720541,720907,735647,864334,868128,581234
2256,99424,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,242653,251734,157970,150265,136655,122088
2257,99424,City of Pasadena,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",MB,PT,Bus,708046,722336,734910,723915,701730,445970
2258,99425,Pomona Valley Transportation Authority,Active,Full Reporter: Operating,51445,"Los Angeles--Long Beach--Anaheim, CA",DR,PT,Bus,254522,255161,108487,241948,259825,162906


In [32]:
df_vehicle_miles_new_headers.shape

(2260, 15)

In [48]:
# Export trips data frame into csv file in clean data folder as 'trips_2018_to_2023.csv'
df_trips_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'trips_2018_to_2023.csv'), index=False)

In [49]:
# Export vehicle miles data frame into csv file in clean data folder as 'vehicle_miles_2018_to_2023.csv'
df_vehicle_miles_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'vehicle_miles_2018_to_2023.csv'), index=False)