In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.geocoders import Nominatim

Used Nominatim to match geocodes with US cities from the kaggle global air pollution dataset. Then pulled latitude and longitude data from those geocode Location objects to process into shapely Point geometries as part of a geopandas GeoDataFrame. Exported the dataframe into a shp file for later use.

In [12]:
df = pd.read_csv('datasets/global air pollution dataset.csv')
df_us = df[df['Country'] == 'United States of America']
df_us = df_us.drop(['Country'], axis=1).reset_index()
df_us = df_us.drop('index', axis=1)
df_us

Unnamed: 0,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
0,Punta Gorda,54,Moderate,1,Good,14,Good,11,Good,54,Moderate
1,Phoenix,72,Moderate,1,Good,4,Good,23,Good,72,Moderate
2,Pontiac,77,Moderate,2,Good,22,Good,15,Good,77,Moderate
3,Port Neches,34,Good,1,Good,19,Good,7,Good,34,Good
4,Portales,77,Moderate,1,Good,34,Good,0,Good,77,Moderate
...,...,...,...,...,...,...,...,...,...,...,...
2867,Reston,60,Moderate,1,Good,40,Good,5,Good,60,Moderate
2868,Sand Springs,56,Moderate,1,Good,49,Good,2,Good,56,Moderate
2869,El Reno,39,Good,1,Good,39,Good,1,Good,35,Good
2870,Highland Springs,54,Moderate,1,Good,34,Good,5,Good,54,Moderate


In [13]:
geolocator = Nominatim(user_agent="air-pollution")

In [24]:
# New column geocode for geopy Location objects
df_us['geocode'] = df_us.apply(lambda row: geolocator.geocode(row.City + " United States"), axis=1)

In [26]:
# drop rows with missing values
df_us = df_us.dropna(axis=0)

In [27]:
df_us

Unnamed: 0,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,geocode
0,Punta Gorda,54,Moderate,1,Good,14,Good,11,Good,54,Moderate,"(Punta Gorda, Charlotte County, Florida, Unite..."
1,Phoenix,72,Moderate,1,Good,4,Good,23,Good,72,Moderate,"(Phoenix, Maricopa County, Arizona, United Sta..."
2,Pontiac,77,Moderate,2,Good,22,Good,15,Good,77,Moderate,"(Pontiac, Oakland County, Michigan, United Sta..."
3,Port Neches,34,Good,1,Good,19,Good,7,Good,34,Good,"(Port Neches, Jefferson County, Texas, 77651, ..."
4,Portales,77,Moderate,1,Good,34,Good,0,Good,77,Moderate,"(Portales, Roosevelt County, New Mexico, 88130..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2867,Reston,60,Moderate,1,Good,40,Good,5,Good,60,Moderate,"(Reston, Fairfax County, Virginia, United Stat..."
2868,Sand Springs,56,Moderate,1,Good,49,Good,2,Good,56,Moderate,"(Sand Springs, Tulsa County, Oklahoma, United ..."
2869,El Reno,39,Good,1,Good,39,Good,1,Good,35,Good,"(El Reno, Canadian County, Oklahoma, United St..."
2870,Highland Springs,54,Moderate,1,Good,34,Good,5,Good,54,Moderate,"(Highland Springs, Lake County, California, Un..."


In [28]:
geometry = []
for index, row in df_us.iterrows():
    location = row.geocode
    geometry.append(Point(location.longitude, location.latitude))
gs = gpd.GeoSeries(geometry)
gdf_us = gpd.GeoDataFrame(df_us, geometry=gs, crs="EPSG:4326")
for index, row in gdf_us[gdf_us.isnull().any(axis=1)].iterrows():
    print(row.geocode)

Highland Springs, Lake County, California, United States
Westerville, Franklin County, Ohio, 43081, United States


In [29]:
gdf_us = gdf_us.dropna(axis=0)
gdf_us[gdf_us.isnull().any(axis=1)]

Unnamed: 0,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,geocode,geometry


In [30]:
gdf_us = gdf_us.drop('geocode', axis=1)

In [31]:
gdf_us.head(5)

Unnamed: 0,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,geometry
0,Punta Gorda,54,Moderate,1,Good,14,Good,11,Good,54,Moderate,POINT (-82.04537 26.92978)
1,Phoenix,72,Moderate,1,Good,4,Good,23,Good,72,Moderate,POINT (-112.07414 33.44844)
2,Pontiac,77,Moderate,2,Good,22,Good,15,Good,77,Moderate,POINT (-83.29105 42.63892)
3,Port Neches,34,Good,1,Good,19,Good,7,Good,34,Good,POINT (-93.95851 29.99132)
4,Portales,77,Moderate,1,Good,34,Good,0,Good,77,Moderate,POINT (-103.33440 34.18619)


In [32]:
gdf_us.to_file("us-cities-air-pollution.shp")

  gdf_us.to_file("us-cities-air-pollution.shp")
