# 1.1 Data Profiling Commuter Datasets

### This script contains the following:

#### 1. Importing libraries and data
#### 2. Combine all Commute Dataframes Together
#### 3. Split Dataframe into Different Category Dataframes
#### 4. Reformat Means of Transportation Dataframe Columns
#### 5. Reformat Place of Work Dataframe 
#### 6. Reformat Departure Time Dataframes
#### 7. Reformat Travel Time Dataframe
#### 8. Reformat Household Vehicles 
#### 9. Export Categories Dataframes

## 1. Importing libraries and data

In [1]:
# import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# set up path variable for easy import and export of data

path = '/Users/matthewmacbook/Documents/CareerFoundry/Data Immersion/Achievement 6 - Advanced Analytics and Dashboard Design/COVID-19 Public Transit Project'


In [3]:
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)


In [4]:
low_memory = False

In [5]:
# import commuter datasets and transit agency datasets

df_commute_2018 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2018 Commute 1 year estimate.csv'))
df_commute_2019 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2019 Commute 1 year estimate.csv'))
df_commute_2020 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2020 Commute 5 year estimate.csv'))
df_commute_2021 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2021 Commute 1 year estimate.csv'))
df_commute_2022 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2022 Commute 1 year estimate.csv'))


  df_commute_2020 = pd.read_csv(os.path.join(path, 'Datasets', 'Raw Data', 'Census Transport Means Data', '2020 Commute 5 year estimate.csv'))


## 2. Combine all Commute Dataframes Together

Add a 'Year' column to each Dataframe to indicate what year the data corresponds to when combining all Dataframes.

In [6]:
# add a 'Year' column with corresponding year

df_commute_2018['Year']=2018
df_commute_2019['Year']=2019
df_commute_2020['Year']=2020
df_commute_2021['Year']=2021
df_commute_2022['Year']=2022

In [7]:
# Combine DataFrames 
frames = [df_commute_2018, df_commute_2019, df_commute_2020, df_commute_2021, df_commute_2022]
df_commute_all = pd.concat(frames, ignore_index=True)

In [8]:
df_commute_all.shape

(5323, 1167)

#### Remove columns with 'Annotation', 'Male', 'Female' and 'Percent Allocated'

In [9]:
# Define the words to check for in column headers
words_to_check = ['Annotation', 'Male', 'Female','PERCENT ALLOCATED', 'Margin of Error']

# Get a list of columns to remove
columns_to_remove = [col for col in df_commute_all.columns if any(word in col for word in words_to_check)]

# Remove the columns
df_commute_all_dropped = df_commute_all.drop(columns=columns_to_remove)

In [10]:
df_commute_all_dropped.shape

(5323, 76)

In [11]:
df_commute_all_dropped.isnull().sum()

Geography                                                                                                                                       0
Geographic Area Name                                                                                                                            0
Estimate!!Total!!Workers 16 years and over                                                                                                      0
Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van                                                 0
Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Drove alone                                    0
Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled                                      0
Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled!!In 2-person carp

In [12]:
df_commute_all_dropped

Unnamed: 0,Geography,Geographic Area Name,Estimate!!Total!!Workers 16 years and over,"Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Drove alone","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled!!In 2-person carpool","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled!!In 3-person carpool","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Carpooled!!In 4-or-more person carpool","Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Car, truck, or van!!Workers per car, truck, or van",Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Public transportation (excluding taxicab),Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Walked,Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Bicycle,"Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Taxicab, motorcycle, or other means",Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Worked at home,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence!!Worked in county of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence!!Worked outside county of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked outside state of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place!!Worked in place of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place!!Worked outside place of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Not living in a place,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states!!Worked in minor civil division of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states!!Worked outside minor civil division of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Not living in 12 selected states,Estimate!!Total!!Workers 16 years and over who did not work at home,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!12 00 a.m. to 4 59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!5 00 a.m. to 5 29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!5 30 a.m. to 5 59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!6 00 a.m. to 6 29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!6 30 a.m. to 6 59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!7 00 a.m. to 7 29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!7 30 a.m. to 7 59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!8 00 a.m. to 8 29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!8 30 a.m. to 8 59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TIME LEAVING HOME TO GO TO WORK!!9 00 a.m. to 11 59 p.m.,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!Less than 10 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!10 to 14 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!15 to 19 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!20 to 24 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!25 to 29 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!30 to 34 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!35 to 44 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!45 to 59 minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!60 or more minutes,Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!Mean travel time to work (minutes),Estimate!!Total!!VEHICLES AVAILABLE!!Workers 16 years and over in households,Estimate!!Total!!VEHICLES AVAILABLE!!Workers 16 years and over in households!!No vehicle available,Estimate!!Total!!VEHICLES AVAILABLE!!Workers 16 years and over in households!!1 vehicle available,Estimate!!Total!!VEHICLES AVAILABLE!!Workers 16 years and over in households!!2 vehicles available,Estimate!!Total!!VEHICLES AVAILABLE!!Workers 16 years and over in households!!3 or more vehicles available,Year,Estimate!!Total!!Workers 16 years and over!!MEANS OF TRANSPORTATION TO WORK!!Worked from home,Estimate!!Total!!Workers 16 years and over who did not work from home,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!12:00 a.m. to 4:59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!5:00 a.m. to 5:29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!5:30 a.m. to 5:59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!6:00 a.m. to 6:29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!6:30 a.m. to 6:59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!7:00 a.m. to 7:29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!7:30 a.m. to 7:59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!8:00 a.m. to 8:29 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!8:30 a.m. to 8:59 a.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TIME OF DEPARTURE TO GO TO WORK!!9:00 a.m. to 11:59 p.m.,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!Less than 10 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!10 to 14 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!15 to 19 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!20 to 24 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!25 to 29 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!30 to 34 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!35 to 44 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!45 to 59 minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!60 or more minutes,Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!Mean travel time to work (minutes)
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",115448,93,85.6,7.4,6.2,0.6,0.6,1.04,0.9,0.9,0.1,0.6,4.5,96.4,52.9,43.5,3.6,85.1,13.1,72.0,14.9,N,N,N,N,110196.0,5.7,4.1,5.7,11.6,10.2,14.4,12.9,9.2,6.1,20.3,9.5,12.0,11.7,10.7,5.3,14.8,9.4,13.5,13.2,31.5,115253,1.2,14.3,40.8,43.6,2018,,,,,,,,,,,,,,,,,,,,,,
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",55767,91.3,76.7,14.6,10.3,2.3,2,1.1,0.1,1.6,0,1.2,5.7,99.8,94,5.8,0.2,99.6,88.7,10.9,0.4,N,N,N,N,52568.0,5.5,2.4,5.0,4.1,9.1,13.7,20.0,8.4,6.2,25.4,24.8,29.7,26.2,11.0,0.5,3.4,0.7,1.2,2.6,15.2,54675,2.9,27.3,46.2,23.6,2018,,,,,,,,,,,,,,,,,,,,,,
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",72351,93.6,85.6,8,6.1,1,0.8,1.05,0.2,2,1,1.2,2,99.4,51.2,48.2,0.6,22.8,8.7,14.0,77.2,N,N,N,N,70902.0,6.7,4.3,3.4,13.2,12.2,22.7,8.8,8.9,1.7,18.1,16.4,12.1,17.4,12.5,7.3,9.5,7.0,6.2,11.5,26.3,71908,4.2,29.2,42.5,24.2,2018,,,,,,,,,,,,,,,,,,,,,,
3,400C100US00766,"Akron, OH Urbanized Area (2010)",274933,90.1,82.4,7.7,5.5,1.3,0.8,1.05,2,2,0.3,0.6,5.1,99.4,70.9,28.5,0.6,84.9,26.4,58.5,15.1,N,N,N,N,260894.0,3.4,2.9,5.3,9.1,11.5,14.6,11.6,10.2,5.0,26.4,12.5,15.2,16.4,17.9,9.4,12.1,6.3,6.1,4.2,23.1,272448,4.0,19.6,45.2,31.2,2018,,,,,,,,,,,,,,,,,,,,,,
4,400C100US00901,"Albany, GA Urbanized Area (2010)",N,N,N,N,N,N,N,N,N,N,N,N,N,99.9,67.9,31.9,0.1,72.0,53.6,18.4,28.0,N,N,N,N,36740.0,3.9,4.0,5.3,8.2,13.9,11.8,15.2,11.0,3.9,22.7,13.6,31.2,24.2,11.0,3.8,4.5,1.3,4.4,5.9,19.3,36970,5.9,31.5,38.3,24.3,2018,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",56507,89.5,80.1,9.4,7.4,0.6,1.4,1.06,0.2,1.2,0.3,0.9,,99.9,96.3,3.7,0.1,93.7,49.8,44.0,6.3,N,N,N,N,,,,,,,,,,,,,,,,,,,,,,56289,1.7,17.3,34.7,46.3,2022,7.9,52044,6.6,5.9,7.7,10.8,10.8,13.8,14.0,6.1,2.3,21.8,20.3,20.8,22.5,14.4,5.0,7.4,3.7,2.1,3.9,18.4
5319,400C200US97750,"York, PA Urban Area (2020)",121166,83.4,75.6,7.8,6.5,1,0.2,1.05,0.8,1.2,0.2,1.7,,94.4,82,12.4,5.6,60.7,12.8,47.9,39.3,100,23.9,76.1,0,,,,,,,,,,,,,,,,,,,,,,120099,2.4,17.6,44.7,35.2,2022,12.6,105849,7.5,3.6,6.8,11.5,10.4,13.0,10.9,7.4,4.6,24.5,10.5,18.7,17.5,15.0,6.0,10.8,5.8,8.1,7.6,25.3
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",142170,89.3,81.5,7.8,5.9,1,0.9,1.05,0.5,1.7,0.2,1.1,,94.1,69.9,24.2,5.9,69.2,17.0,52.2,30.8,N,N,N,N,,,,,,,,,,,,,,,,,,,,,,141374,2.3,23.8,43.0,30.9,2022,7.2,131977,4.9,4.2,4.8,6.9,8.1,13.0,11.6,11.3,5.7,29.5,16.2,17.0,20.0,17.0,6.9,9.7,4.3,3.8,5.1,21.6
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",53239,89.6,75.6,14,8.2,1.2,4.6,1.1,0.3,1.1,0.8,1.6,,99.4,48.1,51.3,0.6,93.4,30.0,63.3,6.6,N,N,N,N,,,,,,,,,,,,,,,,,,,,,,53027,4.0,12.6,34.2,49.1,2022,6.6,49739,9.5,10.6,5.4,9.3,10.3,11.2,11.3,9.7,3.0,19.7,14.0,15.3,12.0,8.5,2.2,10.5,9.1,15.9,12.4,31.0


Null values in columns are discussed in the project brief.

## 3. Split Dataframe into Different Category Dataframes

In [13]:
# Create a list of categories
categories = [
    'Means of Transportation',
    'Place of Work',
    'Time Leaving Home to Go to Work',
    'Time of Departure to Go to Work',
    'Travel Time to Work',
    'Vehicles Available'
]

# Create a dictionary to store the data frames for each category
category_data_frames = {}

# Loop through each category and create a data frame for it
for category in categories:
    # Select columns that match the capitalized category and the geographic columns
    selected_columns = ['Geography', 'Geographic Area Name','Year'] + [col for col in df_commute_all_dropped.columns if category.upper() in col]
    
    # Create a new data frame with the selected columns
    category_df = df_commute_all_dropped[selected_columns].copy()
    
    # Store the data frame in the dictionary with the category name as the key
    category_data_frames[category] = category_df

# Access the separate data frames as needed
means_of_transportation_df = category_data_frames['Means of Transportation']
place_of_work_df = category_data_frames['Place of Work']
time_leaving_home_df = category_data_frames['Time Leaving Home to Go to Work']
time_of_departure_df = category_data_frames['Time of Departure to Go to Work']
travel_time_df = category_data_frames['Travel Time to Work']
vehicles_available_df = category_data_frames['Vehicles Available']

In [14]:
print('Dataframe sizes before reformatting')

print('\nmeans_of_transportation_df')
print(means_of_transportation_df.shape)


print('\nplace_of_work_df')
print(place_of_work_df.shape)

print('\ntime_leaving_home_df')
print(time_leaving_home_df.shape)

print('\ntime_of_departure_df')
print(time_of_departure_df.shape)

print('\ntravel_time_df')
print(travel_time_df.shape)

print('\nvehicles_available_df')
print(vehicles_available_df.shape)

Dataframe sizes before reformatting

means_of_transportation_df
(5323, 16)

place_of_work_df
(5323, 15)

time_leaving_home_df
(5323, 13)

time_of_departure_df
(5323, 13)

travel_time_df
(5323, 23)

vehicles_available_df
(5323, 8)


## 4. Reformat Means of Transportation Dataframe Columns

In [15]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'Commute by {parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in means_of_transportation_df.columns}

# Rename the columns
means_of_transportation_df_new_headers = means_of_transportation_df.rename(columns=column_mapping)

In [16]:
means_of_transportation_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,"Commute by Car, truck, or van",Commute by Drove alone,Commute by Carpooled,Commute by In 2-person carpool,Commute by In 3-person carpool,Commute by In 4-or-more person carpool,"Commute by Workers per car, truck, or van",Commute by Public transportation (excluding taxicab),Commute by Walked,Commute by Bicycle,"Commute by Taxicab, motorcycle, or other means",Commute by Worked at home,Commute by Worked from home
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,93,85.6,7.4,6.2,0.6,0.6,1.04,0.9,0.9,0.1,0.6,4.5,
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,91.3,76.7,14.6,10.3,2.3,2,1.1,0.1,1.6,0,1.2,5.7,
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,93.6,85.6,8,6.1,1,0.8,1.05,0.2,2,1,1.2,2,
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,90.1,82.4,7.7,5.5,1.3,0.8,1.05,2,2,0.3,0.6,5.1,
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,N,N,N,N,N,N,N,N,N,N,N,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,89.5,80.1,9.4,7.4,0.6,1.4,1.06,0.2,1.2,0.3,0.9,,7.9
5319,400C200US97750,"York, PA Urban Area (2020)",2022,83.4,75.6,7.8,6.5,1,0.2,1.05,0.8,1.2,0.2,1.7,,12.6
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,89.3,81.5,7.8,5.9,1,0.9,1.05,0.5,1.7,0.2,1.1,,7.2
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,89.6,75.6,14,8.2,1.2,4.6,1.1,0.3,1.1,0.8,1.6,,6.6


In [17]:
# Create a column mapping to rename and reformat means_of_transportation_df column headers
column_mapping = {
    'Commute by Workers per car, truck, or van': 'Average Number of Workers per Car',
    'Commute by Drove Alone': 'Commute by Driving Alone',
    'Commute by In 2-person carpool': 'Commute by 2-person Carpool',
    'Commute by In 3-person carpool': 'Commute by 3-person Carpool',
    'Commute by In 4-or-more person carpool': 'Commute by 4-or-more-person Carpool',
    'Commute by Car, truck, or van': 'Commute by Car, Truck, or Van',
    'Commute by Public transportation (excluding taxicab)': 'Commute by Public Transportation',
    'Commute by Walked': 'Commute by Walking',
    'Commute by Taxicab, motorcycle, or other means': 'Commute by Taxicab, Motorcycle, or Other means',
    'Commute by Worked at home': 'Commute by Worked at Home',
    'Commute by Worked from home': 'Commute by Work from Home'

}

# Use column mapping to rename means_of_transportation_df_new_headers column headers
means_of_transportation_df_new_headers = means_of_transportation_df_new_headers.rename(columns=column_mapping)

In [18]:
# Merge "Commute by Worked at Home" into "Commute by Work from Home" and fill null values
means_of_transportation_df_new_headers['Commute by Work from Home'] = means_of_transportation_df_new_headers['Commute by Work from Home'].fillna(means_of_transportation_df_new_headers['Commute by Worked at Home'])

# Drop the "Commute by Worked at Home" column if no longer needed
means_of_transportation_df_new_headers.drop(columns=['Commute by Worked at Home'], inplace=True)

In [19]:
# Convert "Commute by Work from Home" to numeric
means_of_transportation_df_new_headers['Commute by Work from Home'] = pd.to_numeric(means_of_transportation_df_new_headers['Commute by Work from Home'], errors='coerce')

# Remove rows with non-numeric values in "Commute by Work from Home"
means_of_transportation_df_new_headers = means_of_transportation_df_new_headers.dropna(subset=['Commute by Work from Home'])

In [20]:
means_of_transportation_df_new_headers.shape

(5024, 15)

In [21]:
means_of_transportation_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,"Commute by Car, Truck, or Van",Commute by Drove alone,Commute by Carpooled,Commute by 2-person Carpool,Commute by 3-person Carpool,Commute by 4-or-more-person Carpool,Average Number of Workers per Car,Commute by Public Transportation,Commute by Walking,Commute by Bicycle,"Commute by Taxicab, Motorcycle, or Other means",Commute by Work from Home
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,93,85.6,7.4,6.2,0.6,0.6,1.04,0.9,0.9,0.1,0.6,4.5
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,91.3,76.7,14.6,10.3,2.3,2,1.1,0.1,1.6,0,1.2,5.7
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,93.6,85.6,8,6.1,1,0.8,1.05,0.2,2,1,1.2,2.0
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,90.1,82.4,7.7,5.5,1.3,0.8,1.05,2,2,0.3,0.6,5.1
5,400C100US00970,"Albany--Schenectady, NY Urbanized Area (2010)",2018,85.4,76.9,8.5,6.9,1,0.5,1.06,5,4.3,0.3,1.1,3.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,89.5,80.1,9.4,7.4,0.6,1.4,1.06,0.2,1.2,0.3,0.9,7.9
5319,400C200US97750,"York, PA Urban Area (2020)",2022,83.4,75.6,7.8,6.5,1,0.2,1.05,0.8,1.2,0.2,1.7,12.6
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,89.3,81.5,7.8,5.9,1,0.9,1.05,0.5,1.7,0.2,1.1,7.2
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,89.6,75.6,14,8.2,1.2,4.6,1.1,0.3,1.1,0.8,1.6,6.6


## 5. Reformat Place of Work Dataframe 

In [22]:
# Drop 'selected regions' columns
place_of_work_df.drop(['Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states', 'Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states!!Worked in minor civil division of residence', 'Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in 12 selected states!!Worked outside minor civil division of residence', 'Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Not living in 12 selected states'], axis = 1, inplace = True)


In [23]:
place_of_work_df

Unnamed: 0,Geography,Geographic Area Name,Year,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence!!Worked in county of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked in state of residence!!Worked outside county of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Worked outside state of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place!!Worked in place of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Living in a place!!Worked outside place of residence,Estimate!!Total!!Workers 16 years and over!!PLACE OF WORK!!Not living in a place
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,96.4,52.9,43.5,3.6,85.1,13.1,72.0,14.9
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,99.8,94,5.8,0.2,99.6,88.7,10.9,0.4
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,99.4,51.2,48.2,0.6,22.8,8.7,14.0,77.2
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,99.4,70.9,28.5,0.6,84.9,26.4,58.5,15.1
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,99.9,67.9,31.9,0.1,72.0,53.6,18.4,28.0
...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,99.9,96.3,3.7,0.1,93.7,49.8,44.0,6.3
5319,400C200US97750,"York, PA Urban Area (2020)",2022,94.4,82,12.4,5.6,60.7,12.8,47.9,39.3
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,94.1,69.9,24.2,5.9,69.2,17.0,52.2,30.8
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,99.4,48.1,51.3,0.6,93.4,30.0,63.3,6.6


In [24]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'{parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in place_of_work_df.columns}

# Rename the columns
place_of_work_df_new_headers = place_of_work_df.rename(columns=column_mapping)

In [25]:
place_of_work_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Worked in state of residence,Worked in county of residence,Worked outside county of residence,Worked outside state of residence,Living in a place,Worked in place of residence,Worked outside place of residence,Not living in a place
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,96.4,52.9,43.5,3.6,85.1,13.1,72.0,14.9
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,99.8,94,5.8,0.2,99.6,88.7,10.9,0.4
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,99.4,51.2,48.2,0.6,22.8,8.7,14.0,77.2
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,99.4,70.9,28.5,0.6,84.9,26.4,58.5,15.1
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,99.9,67.9,31.9,0.1,72.0,53.6,18.4,28.0
...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,99.9,96.3,3.7,0.1,93.7,49.8,44.0,6.3
5319,400C200US97750,"York, PA Urban Area (2020)",2022,94.4,82,12.4,5.6,60.7,12.8,47.9,39.3
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,94.1,69.9,24.2,5.9,69.2,17.0,52.2,30.8
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,99.4,48.1,51.3,0.6,93.4,30.0,63.3,6.6


In [26]:
# Create a column mapping to rename and reformat place_of_work_df_new_headers column headers
column_mapping = {
    'Worked in state of residence': 'Worked in State of Residence',
    'Worked in county of residence': 'Worked in County of Residence',
    'Worked outside county of residence': 'Worked Outside County of Residence',
    'Worked outside state of residence': 'Worked Outside State of Residence',
    'Living in a place': 'Living in a Place',
    'Worked in place of residence': 'Worked in Place of Residence',
    'Worked outside place of residence': 'Worked Outside Place of Residence',
    'Not Living in a place': 'Not Living in a Place'

}

# Use column mapping to rename place_of_work_df_new_headers column headers
place_of_work_df_new_headers = place_of_work_df_new_headers.rename(columns=column_mapping)

In [27]:
# Convert "Worked in County of Residence" to numeric
place_of_work_df_new_headers['Worked in County of Residence'] = pd.to_numeric(place_of_work_df_new_headers['Worked in County of Residence'], errors='coerce')

# Remove rows with non-numeric values in "Worked in County of Residence"
place_of_work_df_new_headers = place_of_work_df_new_headers.dropna(subset=['Worked in County of Residence'])

In [28]:
place_of_work_df_new_headers.shape

(5317, 11)

In [29]:
place_of_work_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Worked in State of Residence,Worked in County of Residence,Worked Outside County of Residence,Worked Outside State of Residence,Living in a Place,Worked in Place of Residence,Worked Outside Place of Residence,Not living in a place
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,96.4,52.9,43.5,3.6,85.1,13.1,72.0,14.9
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,99.8,94.0,5.8,0.2,99.6,88.7,10.9,0.4
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,99.4,51.2,48.2,0.6,22.8,8.7,14.0,77.2
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,99.4,70.9,28.5,0.6,84.9,26.4,58.5,15.1
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,99.9,67.9,31.9,0.1,72.0,53.6,18.4,28.0
...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,99.9,96.3,3.7,0.1,93.7,49.8,44.0,6.3
5319,400C200US97750,"York, PA Urban Area (2020)",2022,94.4,82.0,12.4,5.6,60.7,12.8,47.9,39.3
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,94.1,69.9,24.2,5.9,69.2,17.0,52.2,30.8
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,99.4,48.1,51.3,0.6,93.4,30.0,63.3,6.6


## 6. Reformat Departure Time Dataframes

In [30]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'Time Leaving {parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in time_leaving_home_df.columns}

# Rename the columns
time_leaving_home_df_new_headers = time_leaving_home_df.rename(columns=column_mapping)

In [31]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'Depart Time {parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in time_of_departure_df.columns}

# Rename the columns
time_of_departure_df_new_headers = time_of_departure_df.rename(columns=column_mapping)

In [32]:
# Replace NaN values in "Depart Time" columns with values from "Time Leaving" columns

for time_col in range(3, 13):
    depart_col_name = time_of_departure_df_new_headers.columns[time_col]
    leaving_col_name = time_leaving_home_df_new_headers.columns[time_col]
    
    time_of_departure_df_new_headers[depart_col_name] = time_of_departure_df_new_headers[depart_col_name].fillna(time_leaving_home_df_new_headers[leaving_col_name])
    

In [33]:
time_of_departure_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Depart Time 12:00 a.m. to 4:59 a.m.,Depart Time 5:00 a.m. to 5:29 a.m.,Depart Time 5:30 a.m. to 5:59 a.m.,Depart Time 6:00 a.m. to 6:29 a.m.,Depart Time 6:30 a.m. to 6:59 a.m.,Depart Time 7:00 a.m. to 7:29 a.m.,Depart Time 7:30 a.m. to 7:59 a.m.,Depart Time 8:00 a.m. to 8:29 a.m.,Depart Time 8:30 a.m. to 8:59 a.m.,Depart Time 9:00 a.m. to 11:59 p.m.
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,5.7,4.1,5.7,11.6,10.2,14.4,12.9,9.2,6.1,20.3
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,5.5,2.4,5.0,4.1,9.1,13.7,20.0,8.4,6.2,25.4
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,6.7,4.3,3.4,13.2,12.2,22.7,8.8,8.9,1.7,18.1
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,3.4,2.9,5.3,9.1,11.5,14.6,11.6,10.2,5.0,26.4
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,3.9,4.0,5.3,8.2,13.9,11.8,15.2,11.0,3.9,22.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,6.6,5.9,7.7,10.8,10.8,13.8,14.0,6.1,2.3,21.8
5319,400C200US97750,"York, PA Urban Area (2020)",2022,7.5,3.6,6.8,11.5,10.4,13.0,10.9,7.4,4.6,24.5
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,4.9,4.2,4.8,6.9,8.1,13.0,11.6,11.3,5.7,29.5
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,9.5,10.6,5.4,9.3,10.3,11.2,11.3,9.7,3.0,19.7


In [34]:
# Convert "Depart Time 12:00 a.m. to 4:59 a.m." to numeric
time_of_departure_df_new_headers['Depart Time 12:00 a.m. to 4:59 a.m.'] = pd.to_numeric(time_of_departure_df_new_headers['Depart Time 12:00 a.m. to 4:59 a.m.'], errors='coerce')

# Remove rows with non-numeric values in "Worked in County of Residence"
time_of_departure_df_new_headers = time_of_departure_df_new_headers.dropna(subset=['Depart Time 12:00 a.m. to 4:59 a.m.'])


In [35]:
time_of_departure_df_new_headers.shape

(5321, 13)

In [36]:
time_of_departure_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Depart Time 12:00 a.m. to 4:59 a.m.,Depart Time 5:00 a.m. to 5:29 a.m.,Depart Time 5:30 a.m. to 5:59 a.m.,Depart Time 6:00 a.m. to 6:29 a.m.,Depart Time 6:30 a.m. to 6:59 a.m.,Depart Time 7:00 a.m. to 7:29 a.m.,Depart Time 7:30 a.m. to 7:59 a.m.,Depart Time 8:00 a.m. to 8:29 a.m.,Depart Time 8:30 a.m. to 8:59 a.m.,Depart Time 9:00 a.m. to 11:59 p.m.
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,5.7,4.1,5.7,11.6,10.2,14.4,12.9,9.2,6.1,20.3
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,5.5,2.4,5.0,4.1,9.1,13.7,20.0,8.4,6.2,25.4
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,6.7,4.3,3.4,13.2,12.2,22.7,8.8,8.9,1.7,18.1
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,3.4,2.9,5.3,9.1,11.5,14.6,11.6,10.2,5.0,26.4
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,3.9,4.0,5.3,8.2,13.9,11.8,15.2,11.0,3.9,22.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,6.6,5.9,7.7,10.8,10.8,13.8,14.0,6.1,2.3,21.8
5319,400C200US97750,"York, PA Urban Area (2020)",2022,7.5,3.6,6.8,11.5,10.4,13.0,10.9,7.4,4.6,24.5
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,4.9,4.2,4.8,6.9,8.1,13.0,11.6,11.3,5.7,29.5
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,9.5,10.6,5.4,9.3,10.3,11.2,11.3,9.7,3.0,19.7


## 7. Reformat Travel Time Dataframe
Because the US Census changed naming conventions of 'work at home' to 'work from home' starting in 2022, the travel_time_df has null values where the 2018-2021 rows in the 'work from home' columns while the 2022 rows have null values within the 'work at home' columns. <br>

To keep consistency, create a new dataframe that uses only the updated 'work from home' convention

In [37]:
work_at_home_df = travel_time_df[[
    'Geography',
    'Geographic Area Name',
    'Year',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!Less than 10 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!10 to 14 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!15 to 19 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!20 to 24 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!25 to 29 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!30 to 34 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!35 to 44 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!45 to 59 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!60 or more minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work at home!!TRAVEL TIME TO WORK!!Mean travel time to work (minutes)'
]]

work_from_home_df = travel_time_df[[
    'Geography',
    'Geographic Area Name',
    'Year',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!Less than 10 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!10 to 14 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!15 to 19 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!20 to 24 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!25 to 29 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!30 to 34 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!35 to 44 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!45 to 59 minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!60 or more minutes',
    'Estimate!!Total!!Workers 16 years and over who did not work from home!!TRAVEL TIME TO WORK!!Mean travel time to work (minutes)'
]]

In [38]:
# Rename columns of the work_at_home_df to match the work_from_home_df column names
work_at_home_df.columns = work_at_home_df.columns[:3].tolist() + work_from_home_df.columns[3:].str.replace("work at home", "work from home").tolist()


In [39]:
travel_time_df_reduced = pd.concat([work_at_home_df, work_from_home_df], ignore_index=True)

In [40]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'Commute Time {parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in travel_time_df_reduced.columns}

# Rename the columns
travel_time_df_reduced_new_headers = travel_time_df_reduced.rename(columns=column_mapping)

In [41]:
# Use the str.replace method to change 'minutes' to 'Minutes' in column names
travel_time_df_reduced_new_headers.columns = travel_time_df_reduced_new_headers.columns.str.replace('minutes', 'Minutes')

# Create a column mapping to rename and reformat travel_time_df_reduced_new_headers column headers
column_mapping = {
    'Commute Time Less than 10 Minutes': 'Commute Time <10 Minutes',
    'Commute Time Mean travel time to work (Minutes)': 'Mean Commute Time',
    'Commute Time 60 or more Minutes': 'Commute Time >60 Minutes'
    
}

# Use column mapping to rename travel_time_df_reduced_new_headers column headers
travel_time_df_reduced_new_headers = travel_time_df_reduced_new_headers.rename(columns=column_mapping)

In [42]:
# Use the dropna method to remove rows with any null values
travel_time_df_reduced_new_headers = travel_time_df_reduced_new_headers.dropna()

In [43]:
# Convert "Mean Commute Time" to numeric
travel_time_df_reduced_new_headers['Mean Commute Time'] = pd.to_numeric(travel_time_df_reduced_new_headers['Mean Commute Time'], errors='coerce')

# Remove rows with non-numeric values in "Mean Commute Time"
travel_time_df_reduced_new_headers = travel_time_df_reduced_new_headers.dropna(subset=['Mean Commute Time'])


In [44]:
travel_time_df_reduced_new_headers.shape

(5321, 13)

In [45]:
travel_time_df_reduced_new_headers.reset_index(inplace = True)

In [46]:
travel_time_df_reduced_new_headers.drop(columns = ['index'])

Unnamed: 0,Geography,Geographic Area Name,Year,Commute Time <10 Minutes,Commute Time 10 to 14 Minutes,Commute Time 15 to 19 Minutes,Commute Time 20 to 24 Minutes,Commute Time 25 to 29 Minutes,Commute Time 30 to 34 Minutes,Commute Time 35 to 44 Minutes,Commute Time 45 to 59 Minutes,Commute Time >60 Minutes,Mean Commute Time
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,9.5,12.0,11.7,10.7,5.3,14.8,9.4,13.5,13.2,31.5
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,24.8,29.7,26.2,11.0,0.5,3.4,0.7,1.2,2.6,15.2
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,16.4,12.1,17.4,12.5,7.3,9.5,7.0,6.2,11.5,26.3
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,12.5,15.2,16.4,17.9,9.4,12.1,6.3,6.1,4.2,23.1
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,13.6,31.2,24.2,11.0,3.8,4.5,1.3,4.4,5.9,19.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,400C200US97507,"Yakima, WA Urban Area (2020)",2022,20.3,20.8,22.5,14.4,5.0,7.4,3.7,2.1,3.9,18.4
5317,400C200US97750,"York, PA Urban Area (2020)",2022,10.5,18.7,17.5,15.0,6.0,10.8,5.8,8.1,7.6,25.3
5318,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,16.2,17.0,20.0,17.0,6.9,9.7,4.3,3.8,5.1,21.6
5319,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,14.0,15.3,12.0,8.5,2.2,10.5,9.1,15.9,12.4,31.0


In [47]:
travel_time_df_reduced_new_headers.shape

(5321, 14)

In [48]:
travel_time_df_reduced_new_headers

Unnamed: 0,index,Geography,Geographic Area Name,Year,Commute Time <10 Minutes,Commute Time 10 to 14 Minutes,Commute Time 15 to 19 Minutes,Commute Time 20 to 24 Minutes,Commute Time 25 to 29 Minutes,Commute Time 30 to 34 Minutes,Commute Time 35 to 44 Minutes,Commute Time 45 to 59 Minutes,Commute Time >60 Minutes,Mean Commute Time
0,0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,9.5,12.0,11.7,10.7,5.3,14.8,9.4,13.5,13.2,31.5
1,1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,24.8,29.7,26.2,11.0,0.5,3.4,0.7,1.2,2.6,15.2
2,2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,16.4,12.1,17.4,12.5,7.3,9.5,7.0,6.2,11.5,26.3
3,3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,12.5,15.2,16.4,17.9,9.4,12.1,6.3,6.1,4.2,23.1
4,4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,13.6,31.2,24.2,11.0,3.8,4.5,1.3,4.4,5.9,19.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,10641,400C200US97507,"Yakima, WA Urban Area (2020)",2022,20.3,20.8,22.5,14.4,5.0,7.4,3.7,2.1,3.9,18.4
5317,10642,400C200US97750,"York, PA Urban Area (2020)",2022,10.5,18.7,17.5,15.0,6.0,10.8,5.8,8.1,7.6,25.3
5318,10643,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,16.2,17.0,20.0,17.0,6.9,9.7,4.3,3.8,5.1,21.6
5319,10644,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,14.0,15.3,12.0,8.5,2.2,10.5,9.1,15.9,12.4,31.0


## 8. Reformat Household Vehicles 

In [49]:
# Create a function to generate the new column names
def transform_column_name(column_name):
    parts = column_name.split("!!")
    if len(parts) >= 3:
        return f'Households with {parts[-1]}'
    else:
        return column_name  # If it doesn't match the expected pattern, keep the original name

# Use the function to create a mapping for renaming columns
column_mapping = {col: transform_column_name(col) for col in vehicles_available_df.columns}

# Rename the columns
vehicles_available_df_new_headers = vehicles_available_df.rename(columns=column_mapping)

In [50]:
vehicles_available_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Households with Workers 16 years and over in households,Households with No vehicle available,Households with 1 vehicle available,Households with 2 vehicles available,Households with 3 or more vehicles available
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,115253,1.2,14.3,40.8,43.6
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,54675,2.9,27.3,46.2,23.6
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,71908,4.2,29.2,42.5,24.2
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,272448,4.0,19.6,45.2,31.2
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,36970,5.9,31.5,38.3,24.3
...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,56289,1.7,17.3,34.7,46.3
5319,400C200US97750,"York, PA Urban Area (2020)",2022,120099,2.4,17.6,44.7,35.2
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,141374,2.3,23.8,43.0,30.9
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,53027,4.0,12.6,34.2,49.1


In [51]:
# Create a column mapping to rename and reformat vehicles_available_df_new_headers column headers
column_mapping = {
    'Households with Workers 16 years and over in households': 'Workers 16 Years and Over in Households',
    'Households with No vehicle available': 'Households with 0 Vehicles Available',
    'Households with 1 vehicle available': 'Households with 1 Vehicle Available',
    'Households with 2 vehicles available': 'Households with 2 Vehicles Available',
    'Households with 3 or more vehicles available': 'Households with 3 or More Vehicles Available'
}

# Use column mapping to rename vehicles_available_df_new_headers column headers
vehicles_available_df_new_headers = vehicles_available_df_new_headers.rename(columns=column_mapping)

In [53]:
# Convert "Households with 0 Vehicles available" to numeric
vehicles_available_df_new_headers['Households with 0 Vehicles Available'] = pd.to_numeric(vehicles_available_df_new_headers['Households with 0 Vehicles Available'], errors='coerce')

# Remove rows with non-numeric values in "Households with 0 Vehicles available"
vehicles_available_df_new_headers = vehicles_available_df_new_headers.dropna(subset=['Households with 0 Vehicles Available'])


In [54]:
vehicles_available_df_new_headers.shape

(5323, 8)

In [55]:
vehicles_available_df_new_headers

Unnamed: 0,Geography,Geographic Area Name,Year,Workers 16 Years and Over in Households,Households with 0 Vehicles Available,Households with 1 Vehicle Available,Households with 2 Vehicles Available,Households with 3 or More Vehicles Available
0,400C100US00199,"Aberdeen--Bel Air South--Bel Air North, MD Urb...",2018,115253,1.2,14.3,40.8,43.6
1,400C100US00280,"Abilene, TX Urbanized Area (2010)",2018,54675,2.9,27.3,46.2,23.6
2,400C100US00631,"Aguadilla--Isabela--San Sebastián, PR Urbanize...",2018,71908,4.2,29.2,42.5,24.2
3,400C100US00766,"Akron, OH Urbanized Area (2010)",2018,272448,4.0,19.6,45.2,31.2
4,400C100US00901,"Albany, GA Urbanized Area (2010)",2018,36970,5.9,31.5,38.3,24.3
...,...,...,...,...,...,...,...,...
5318,400C200US97507,"Yakima, WA Urban Area (2020)",2022,56289,1.7,17.3,34.7,46.3
5319,400C200US97750,"York, PA Urban Area (2020)",2022,120099,2.4,17.6,44.7,35.2
5320,400C200US97831,"Youngstown, OH Urban Area (2020)",2022,141374,2.3,23.8,43.0,30.9
5321,400C200US97939,"Yuba City, CA Urban Area (2020)",2022,53027,4.0,12.6,34.2,49.1


## 9. Export Categories Dataframes

In [96]:
# Export commute modeshare data frame into csv file in clean data folder as 'commute_modeshare_2018_to_2022.csv'
means_of_transportation_df_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'commute_modeshare_2018_to_2022.csv'), index=False)

In [97]:
# Export place of work data frame into csv file in clean data folder as 'place_of_work_2018_to_2022.csv'
place_of_work_df_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'place_of_work_2018_to_2022.csv'), index=False)

In [98]:
# Export depart time data frame into csv file in clean data folder as 'depart_time_2018_to_2022.csv'
time_of_departure_df_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'depart_time_2018_to_2022.csv'), index=False)

In [99]:
# Export commute time data frame into csv file in clean data folder as 'commute_time_2018_to_2022.csv'
travel_time_df_reduced_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'commute_time_2018_to_2022.csv'), index=False)

In [100]:
# Export household vehicles data frame into csv file in clean data folder as 'household_vehicles_2018_to_2022.csv'
vehicles_available_df_new_headers.to_csv(os.path.join(path, 'Datasets', 'Clean Data', 'household_vehicles_2018_to_2022.csv'), index=False)