In [1]:
import pandas as pd

In [2]:
# Load the two input tables from CSV files in the notebook's working directory.
# NOTE(review): df_pm is read from the population CSV, and the merged table
# displayed further down carries only Population / Z Score columns from it and
# no PM2.5 measurement -- confirm whether a PM2.5 file was meant to be loaded.
df_pm, df_vehicles = (
    pd.read_csv(csv_path)
    for csv_path in ('./California_Population.csv', './vehicle_df')
)

In [3]:
# Keep only the EV rows of the vehicle table and relabel the count column as
# 'Number of EV', so EV and gas counts can later sit in separate columns and
# be compared against the PM2.5 values.
df_ev_vehicles = (
    df_vehicles.loc[df_vehicles['Fuel Category'] == 'EV']
    .rename(columns={'Number of Vehicles': 'Number of EV'})
    .drop(columns=['Fuel Category'])
)
df_ev_vehicles

Unnamed: 0,Data Year,County,Number of EV
0,2010,Alameda,20
2,2010,Alpine,0
4,2010,Amador,1
6,2010,Butte,0
8,2010,Calaveras,0
...,...,...,...
1642,2023,Tulare,3837
1644,2023,Tuolumne,698
1646,2023,Ventura,31980
1648,2023,Yolo,6757


In [4]:
# Keep only the 'Fossil Fuel' rows of the vehicle table, drop the (now
# single-valued) 'Fuel Category' column, and relabel the count column as
# 'Number of Gas Cars'.
df_gas_vehicles = (
    df_vehicles.loc[df_vehicles['Fuel Category'].eq('Fossil Fuel')]
    .drop(columns=['Fuel Category'])
    .rename(columns={'Number of Vehicles': 'Number of Gas Cars'})
)
df_gas_vehicles

Unnamed: 0,Data Year,County,Number of Gas Cars
1,2010,Alameda,885402
3,2010,Alpine,1041
5,2010,Amador,30480
7,2010,Butte,133169
9,2010,Calaveras,39636
...,...,...,...
1643,2023,Tulare,331744
1645,2023,Tuolumne,54369
1647,2023,Ventura,651851
1649,2023,Yolo,152560


In [5]:
# Rebuild df_vehicles in wide form: one row per (Data Year, County) with the
# gas and EV counts side by side. The inner join keeps only year/county pairs
# that appear in both the gas and the EV subset.
# NOTE: this rebinds df_vehicles to a frame without 'Fuel Category', so the
# two filter cells above cannot be re-run after this one without reloading
# the vehicle CSV first.
df_vehicles = df_gas_vehicles.merge(
    df_ev_vehicles,
    how='inner',
    on=['Data Year', 'County'],
)
df_vehicles.shape

(826, 4)

In [6]:
# (rows, columns) of the table loaded as df_pm, to compare against the
# recombined vehicle table before the two are merged.
df_pm.shape

(870, 4)

In [14]:
# Trim surrounding whitespace from df_pm's county labels so they can be
# compared with (and later joined to) the labels in df_vehicles.
df_pm['County'] = df_pm['County'].str.strip()

# Two-column frame holding both County series. pandas lines the two series up
# by row index, not by county name, and the .head() call is not the last
# expression of the cell, so nothing from it is rendered.
counties = pd.DataFrame(
    {'County_df_vehicles': df_vehicles['County'], 'County_df_pm': df_pm['County']}
)
counties.head()

# Distinct county labels per table, then the labels missing from the other side.
counties_vehicles, counties_pm = set(df_vehicles['County']), set(df_pm['County'])
only_in_vehicles = counties_vehicles.difference(counties_pm)
only_in_pm = counties_pm.difference(counties_vehicles)

print("Only in df_vehicles:", only_in_vehicles)
print("Only in df_pm:", only_in_pm)

# Open question: do we want to drop these unmatched labels? They have no
# counterpart in the other table, so they have no PM2.5 data at all.
# NOTE: the pm/vehicle merge further down is written with how='inner' (not a
# left join), so as the notebook stands these labels are dropped.
# Alternative: estimate the missing values from the surrounding counties that
# do report PM2.5 values.

Only in df_vehicles: {'Out Of State', 'Alpine'}
Only in df_pm: {'State Total'}


In [16]:
# Check the shape of df_pm (rows, columns) before merging. Using .shape keeps
# the cell output to a single tuple instead of rendering the whole table, and
# matches what the cell's comment says it checks.
df_pm.shape

(870, 4)

In [17]:
# (rows, columns) of the recombined vehicle table, to compare against df_pm
# before the two are merged.
df_vehicles.shape

(826, 4)

In [18]:
# Join the df_pm table with the vehicle counts on (Data Year, County).
# how='inner' keeps only year/county pairs present in both tables, so the
# labels reported earlier as one-sided ('Alpine', 'Out Of State',
# 'State Total') do not appear in the result.
df_pm_vehicles = df_pm.merge(
    df_vehicles,
    how='inner',
    on=['Data Year', 'County'],
)
df_pm_vehicles

Unnamed: 0,County,Data Year,Population,Z Score,Number of Gas Cars,Number of EV
0,Alameda,2010,1510271,0.033519,885402,20
1,Amador,2010,38091,-0.251576,30480,1
2,Butte,2010,220000,-0.216348,133169,0
3,Calaveras,2010,45578,-0.250126,39636,0
4,Colusa,2010,21419,-0.254805,14610,0
...,...,...,...,...,...,...
793,Tulare,2023,474680,-0.167028,331744,3837
794,Tuolumne,2023,54626,-0.248374,54369,698
795,Ventura,2023,825960,-0.099001,651851,31980
796,Yolo,2023,220454,-0.216260,152560,6757


In [19]:
# Save the merged table to the notebook's working directory, without the row
# index. A relative path is used (like the inputs read at the top and the
# final export) so the notebook also runs on machines other than the original
# author's.
df_pm_vehicles.to_csv('PM2.5_Vehicles.csv', index=False)

In [20]:
# Load the population table and normalise its county labels so they match the
# County column of df_pm_vehicles: trim both ends, title-case, then collapse
# any run of whitespace into a single space.
# The file is read through a relative path, like the other inputs, instead of
# a user-specific absolute path that only exists on one machine.
# NOTE(review): this looks like the same California_Population.csv that was
# already loaded as df_pm at the top -- confirm a second load is really needed.
df_pop = pd.read_csv('./California_Population.csv')
df_pop['County'] = (
    df_pop['County']
    .str.strip()
    .str.title()
    .str.replace(r'\s+', ' ', regex=True)
)

In [21]:
# Distinct county labels in the population table and in the merged
# pm/vehicle table, then the labels that appear on one side only.
counties_pop, counties_pm_vehicles = set(df_pop['County']), set(df_pm_vehicles['County'])
only_in_pop = counties_pop.difference(counties_pm_vehicles)
only_in_pm_vehicles = counties_pm_vehicles.difference(counties_pop)

print("Only in df_pop:", only_in_pop)
print("Only in df_pm_vehicles:", only_in_pm_vehicles)
# NOTE: the merge that follows is written with how='inner', not as a left
# join on df_pm_vehicles; labels found only in df_pop are dropped by it.

Only in df_pop: {'State Total'}
Only in df_pm_vehicles: set()


In [26]:
# Attach the population table to the pm/vehicle table on (County, Data Year).
# Both inputs carry 'Population' and 'Z Score' columns, so pandas suffixes
# them with _x (from df_pm_vehicles) and _y (from df_pop); the _x pair is
# given descriptive names here, the _y pair is left as is.
# NOTE(review): in the rows displayed below, Population_y / Z Score_y repeat
# the renamed columns exactly -- consider dropping them before export.
df_pm_vehicles_pop = df_pm_vehicles.merge(
    df_pop,
    how='inner',
    on=['County', 'Data Year'],
).rename(
    columns={
        'Z Score_x': 'Total Population Z Score',
        'Population_x': 'Total Population',
    }
)
df_pm_vehicles_pop

Unnamed: 0,County,Data Year,Total Population,Total Population Z Score,Number of Gas Cars,Number of EV,Population_y,Z Score_y
0,Alameda,2010,1510271,0.033519,885402,20,1510271,0.033519
1,Amador,2010,38091,-0.251576,30480,1,38091,-0.251576
2,Butte,2010,220000,-0.216348,133169,0,220000,-0.216348
3,Calaveras,2010,45578,-0.250126,39636,0,45578,-0.250126
4,Colusa,2010,21419,-0.254805,14610,0,21419,-0.254805
...,...,...,...,...,...,...,...,...
793,Tulare,2023,474680,-0.167028,331744,3837,474680,-0.167028
794,Tuolumne,2023,54626,-0.248374,54369,698,54626,-0.248374
795,Ventura,2023,825960,-0.099001,651851,31980,825960,-0.099001
796,Yolo,2023,220454,-0.216260,152560,6757,220454,-0.216260


In [27]:
# Write the final merged table to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df_pm_vehicles_pop.to_csv('PM2.5_Vehicles_Population.csv', index=False)