In [None]:
from pathlib import Path

import pandas as pd

# Read the raw per-city temperature table; the 'dt' column is parsed as datetimes
city_path = "temperatureData/GlobalLandTemperaturesByCity.csv"
df_city = pd.read_csv(city_path, parse_dates=['dt'])

# Build the trimmed frame in one chain:
#   1. keep rows whose Country is 'United States' and whose year is 1998–2019 (inclusive)
#   2. discard rows with no AverageTemperature value
#   3. rename dt/City/AverageTemperature and keep only those three columns
df_city_us = (
    df_city
    .loc[lambda frame: frame['Country'].eq('United States')
         & frame['dt'].dt.year.between(1998, 2019)]
    .dropna(subset=['AverageTemperature'])
    .rename(columns={'dt': 'Date', 'City': 'Location', 'AverageTemperature': 'Temp_C'})
    [['Date', 'Location', 'Temp_C']]
)

print("✅ Cleaned city-level temperature data loaded")
df_city_us.head()


In [None]:
# PJM region to city mapping.
# Note that a city may be listed under several regions (e.g. "Dayton" appears
# under AEP, DAYTON and PJMW), so this is a many-to-many relationship.
region_to_cities = {
    "AEP": ["Akron", "Columbus", "Dayton", "Charleston", "Indianapolis", "Lexington Fayette", "Huntington"],
    "COMED": ["Chicago", "Rockford", "Aurora", "Naperville", "Joliet"],
    "DAYTON": ["Dayton"],
    "DEOK": ["Cincinnati", "Lexington Fayette"],
    "DOM": ["Charlotte", "Raleigh", "Richmond", "Virginia Beach", "Washington"],
    "DUQ": ["Pittsburgh"],
    "EKPC": ["Lexington Fayette", "Louisville"],
    "FE": ["Akron", "Allentown", "Cleveland", "Newark", "Baltimore"],
    "NI": ["South Bend", "Fort Wayne"],
    "PJM_Load": ["Columbus", "Detroit", "Charlotte", "Chicago", "Baltimore",
                 "Philadelphia", "Cleveland", "Indianapolis", "Louisville",
                 "Nashville", "Jersey City", "Akron"],
    "PJME": ["Baltimore", "Newark", "Jersey City", "Washington"],
    "PJMW": ["Charleston", "Lexington Fayette", "Columbus", "Dayton"]
}

# Lossless reverse mapping: city → every region that lists it, in the order
# the regions appear in `region_to_cities`.
city_to_regions = {}
for region, cities in region_to_cities.items():
    for city in cities:
        city_to_regions.setdefault(city, []).append(region)

# Single-valued reverse mapping: city → region.
# A dict can hold only one region per city, so for cities listed under several
# regions the LAST region in `region_to_cities` wins (e.g. "Dayton" → "PJMW");
# as a result some regions (e.g. "DAYTON") never appear as a value here.
# NOTE(review): confirm last-wins is the intended tie-break; use
# `city_to_regions` wherever every region of a city is needed.
city_to_region = {city: regions[-1] for city, regions in city_to_regions.items()}


In [None]:
# Add region column based on city name.
# `assign` returns a new frame rather than inserting the column into the
# existing object, so the cell is safe to re-run and never mutates a frame
# that an earlier cell produced. Locations that are not keys of
# `city_to_region` end up with a missing value (NaN) in 'Region'.
df_city_us = df_city_us.assign(Region=df_city_us['Location'].map(city_to_region))

print("✅ Region column added")
df_city_us.head()


In [None]:
# Save with region info
with_region_path = "temperatureData_clean/US_Temp_City_Region_1998_2019.csv"

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
Path(with_region_path).parent.mkdir(parents=True, exist_ok=True)
df_city_us.to_csv(with_region_path, index=False)

print(f"✅ Saved with region info: {with_region_path}")


In [None]:
# Drop Region column to get clean data without region info
# (drop returns a new frame; df_city_us itself keeps its Region column)
df_city_clean = df_city_us.drop(columns='Region')

# Save cleaned version
clean_path = "temperatureData_clean/US_Temp_CityOnly_Clean_1998_2019.csv"

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
Path(clean_path).parent.mkdir(parents=True, exist_ok=True)
df_city_clean.to_csv(clean_path, index=False)

print(f"✅ Saved clean dataset without region info: {clean_path}")
