In [1]:
import pandas as pd

# Load city-level data
city_path = "temperatureData/GlobalLandTemperaturesByCity.csv"
df_city = pd.read_csv(city_path, parse_dates=['dt'])

# Filter for US cities from 1998–2019 and drop rows with missing temperature
df_city_us = df_city[
    (df_city['Country'] == 'United States') &
    (df_city['dt'].dt.year >= 1998) &
    (df_city['dt'].dt.year <= 2019)
].dropna(subset=['AverageTemperature'])

# Standardize column names
df_city_us = df_city_us.rename(columns={
    'dt': 'Date',
    'City': 'Location',
    'AverageTemperature': 'Temp_C'
})[['Date', 'Location', 'Temp_C']]


In [2]:
region_to_cities = {
    "AEP": ["Akron", "Columbus", "Dayton", "Charleston", "Indianapolis", "Lexington Fayette", "Huntington"],
    "COMED": ["Chicago", "Rockford", "Aurora", "Naperville", "Joliet"],
    "DAYTON": ["Dayton"],
    "DEOK": ["Cincinnati", "Lexington Fayette"],
    "DOM": ["Charlotte", "Raleigh", "Richmond", "Virginia Beach", "Washington"],
    "DUQ": ["Pittsburgh"],
    "EKPC": ["Lexington Fayette", "Louisville"],
    "FE": ["Akron", "Allentown", "Cleveland", "Newark", "Baltimore"],
    "NI": ["South Bend", "Fort Wayne"],
    "PJM_Load": ["Columbus", "Detroit", "Charlotte", "Chicago", "Baltimore",
                 "Philadelphia", "Cleveland", "Indianapolis", "Louisville",
                 "Nashville", "Jersey City", "Akron"],
    "PJME": ["Baltimore", "Newark", "Jersey City", "Washington"],
    "PJMW": ["Charleston", "Lexington Fayette", "Columbus", "Dayton"]
}

city_to_region = {
    city: region for region, cities in region_to_cities.items() for city in cities
}


In [3]:
# Map region info
df_city_us['Region'] = df_city_us['Location'].map(city_to_region)

# Keep only rows where Region is known (drop NaN)
df_city_us = df_city_us.dropna(subset=['Region'])

print("✅ Filtered to cities with region info only")
df_city_us.head()


✅ Filtered to cities with region info only


Unnamed: 0,Date,Location,Temp_C,Region
140116,1998-01-01,Akron,1.729,PJM_Load
140117,1998-02-01,Akron,3.222,PJM_Load
140118,1998-03-01,Akron,5.075,PJM_Load
140119,1998-04-01,Akron,10.048,PJM_Load
140120,1998-05-01,Akron,18.207,PJM_Load


In [4]:
output_path = "temperatureData_clean/US_Temp_City_RegionOnly_1998_2019.csv"
df_city_us.to_csv(output_path, index=False)

print(f"✅ Final dataset saved: {output_path}")

✅ Final dataset saved: temperatureData_clean/US_Temp_City_RegionOnly_1998_2019.csv
