In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
# !pip install wwo-hist

In [3]:
from wwo_hist import retrieve_hist_data

import os

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [5]:
#Raw content URL for adult-all.txt
url = "https://raw.githubusercontent.com/hastighsh/Ellehacks_2024/main/power_outages_data_copy.csv"

#Specify the delimiter (assuming it's a tab-separated file)
delimiter = ','

#Read the data into a DataFrame
power_outages_data = pd.read_csv(url, delimiter=delimiter)

#Let's create a backup copy of the dataset
outage_backup = power_outages_data.copy()

In [6]:
power_outages_data

Unnamed: 0.1,Unnamed: 0,Event Description,Year,Date Event Began,Date of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Time Event Began,Time of Restoration
0,0,Severe Weather - Thunderstorms,2014,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm",20:00:00,18:30:00
1,1,Severe Weather - Thunderstorms,2014,6/30/2014,7/1/2014,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm",23:20:00,17:00:00
2,2,Severe Weather - Thunderstorms,2014,6/30/2014,7/1/2014,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm",17:55:00,02:53:00
3,3,Fuel Supply Emergency - Coal,2014,6/27/2014,Unknown,We Energies,Wisconsin,MRO,Unknown,Unknown,"fuel supply emergency, coal",13:21:00,
4,4,Physical Attack - Vandalism,2014,6/24/2014,6/24/2014,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",14:54:00,14:55:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,1647,Transmission Line Loss,2000,3/18/2000,3/18/2000,El Paso Elec. Co.,Texas,MAIN,400,100000,transmission interruption,16:00:00,17:10:00
1648,1648,Vandalism,2000,3/14/2000,,Alliant Energy,Maine,MAIN,,,vandalism,21:06:00,
1649,1649,Ice Storm,2000,1/29/2000,2/3/2000,Duke Power Co.,South Carolina,SERC,300,81000,"severe weather, winter storm",22:00:00,12:00:00
1650,1650,Ice Storm,2000,1/24/2000,,Carolina Power & Light,North Carolina & Northern South Carolina,SERC,960,173000,"severe weather, winter storm",19:00:00,


In [7]:
power_outages_data = power_outages_data.drop(["Unnamed: 0", "NERC Region", "Demand Loss (MW)", "Number of Customers Affected"], axis=1)

In [8]:
# Count missing values in each column
missing_values = power_outages_data.isnull().sum()

print("Missing values in each column:")
print(missing_values)

Missing values in each column:
Event Description       0
Year                    0
Date Event Began        0
Date of Restoration    14
Respondent              0
Geographic Areas        1
Tags                    1
Time Event Began       12
Time of Restoration    44
dtype: int64


In [9]:
cleaned_power_outages = power_outages_data.dropna()

In [10]:
cleaned_power_outages

Unnamed: 0,Event Description,Year,Date Event Began,Date of Restoration,Respondent,Geographic Areas,Tags,Time Event Began,Time of Restoration
0,Severe Weather - Thunderstorms,2014,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,"severe weather, thunderstorm",20:00:00,18:30:00
1,Severe Weather - Thunderstorms,2014,6/30/2014,7/1/2014,Northern Indiana Public Service Company,North Central Indiana,"severe weather, thunderstorm",23:20:00,17:00:00
2,Severe Weather - Thunderstorms,2014,6/30/2014,7/1/2014,We Energies,Southeast Wisconsin,"severe weather, thunderstorm",17:55:00,02:53:00
4,Physical Attack - Vandalism,2014,6/24/2014,6/24/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",14:54:00,14:55:00
5,Physical Attack - Vandalism,2014,6/19/2014,6/19/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",08:47:00,08:48:00
...,...,...,...,...,...,...,...,...,...
1642,Severe Weather,2000,5/2/2000,5/2/2000,Reliant Energy HL&P,"Houston, TX",severe weather,04:00:00,12:00:00
1646,Transmission Line Loss,2000,3/18/2000,3/18/2000,Public Service of New Mexico,New Mexico,transmission interruption,19:08:00,19:08:00
1647,Transmission Line Loss,2000,3/18/2000,3/18/2000,El Paso Elec. Co.,Texas,transmission interruption,16:00:00,17:10:00
1649,Ice Storm,2000,1/29/2000,2/3/2000,Duke Power Co.,South Carolina,"severe weather, winter storm",22:00:00,12:00:00


In [11]:
city_counts = cleaned_power_outages['Geographic Areas'].value_counts()

# Print the counts for each city
print(city_counts)

Island of Puerto Rico              30
California                         24
Texas                              20
Northern California                20
Tacoma, Washington                 17
                                   ..
Northeast, Illinois                 1
Jefferson, Oregon                   1
Fredrickson, Washington             1
Entire ComEd Territory, Indiana     1
New Mexico                          1
Name: Geographic Areas, Length: 985, dtype: int64


In [12]:
top_20_cities = city_counts.head(20)
print("Top 20 cities:")
print(top_20_cities)

Top 20 cities:
Island of Puerto Rico                 30
California                            24
Texas                                 20
Northern California                   20
Tacoma, Washington                    17
Southeastern Michigan                 14
Illinois                              12
New York                              12
Southern California                   12
Northern and Central California       10
Georgia                               10
Pennsylvania                          10
Newark, Delaware                      10
San Francisco Bay Area, California    10
Northern Illinois                     10
Alberta, Canada                        9
Ohio                                   9
Virginia                               9
North Carolina                         8
Michigan                               8
Name: Geographic Areas, dtype: int64


In [13]:
# Split the "Event Description" column into two separate columns
cleaned_power_outages[['Category', 'Sub Category']] = cleaned_power_outages['Event Description'].str.split(' - ', 1, expand=True)
cleaned_power_outages.drop('Event Description', axis=1, inplace=True)

# Print the first few rows
cleaned_power_outages

  cleaned_power_outages[['Category', 'Sub Category']] = cleaned_power_outages['Event Description'].str.split(' - ', 1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_power_outages[['Category', 'Sub Category']] = cleaned_power_outages['Event Description'].str.split(' - ', 1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_power_outages[['Category', 'Sub Category']] = cleaned_power_outages['Event Description'].str.split(' - ', 1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the cav

Unnamed: 0,Year,Date Event Began,Date of Restoration,Respondent,Geographic Areas,Tags,Time Event Began,Time of Restoration,Category,Sub Category
0,2014,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,"severe weather, thunderstorm",20:00:00,18:30:00,Severe Weather,Thunderstorms
1,2014,6/30/2014,7/1/2014,Northern Indiana Public Service Company,North Central Indiana,"severe weather, thunderstorm",23:20:00,17:00:00,Severe Weather,Thunderstorms
2,2014,6/30/2014,7/1/2014,We Energies,Southeast Wisconsin,"severe weather, thunderstorm",17:55:00,02:53:00,Severe Weather,Thunderstorms
4,2014,6/24/2014,6/24/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",14:54:00,14:55:00,Physical Attack,Vandalism
5,2014,6/19/2014,6/19/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",08:47:00,08:48:00,Physical Attack,Vandalism
...,...,...,...,...,...,...,...,...,...,...
1642,2000,5/2/2000,5/2/2000,Reliant Energy HL&P,"Houston, TX",severe weather,04:00:00,12:00:00,Severe Weather,
1646,2000,3/18/2000,3/18/2000,Public Service of New Mexico,New Mexico,transmission interruption,19:08:00,19:08:00,Transmission Line Loss,
1647,2000,3/18/2000,3/18/2000,El Paso Elec. Co.,Texas,transmission interruption,16:00:00,17:10:00,Transmission Line Loss,
1649,2000,1/29/2000,2/3/2000,Duke Power Co.,South Carolina,"severe weather, winter storm",22:00:00,12:00:00,Ice Storm,


In [14]:
cleaned_power_outages.drop('Year', axis=1, inplace=True)
cleaned_power_outages

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_power_outages.drop('Year', axis=1, inplace=True)


Unnamed: 0,Date Event Began,Date of Restoration,Respondent,Geographic Areas,Tags,Time Event Began,Time of Restoration,Category,Sub Category
0,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,"severe weather, thunderstorm",20:00:00,18:30:00,Severe Weather,Thunderstorms
1,6/30/2014,7/1/2014,Northern Indiana Public Service Company,North Central Indiana,"severe weather, thunderstorm",23:20:00,17:00:00,Severe Weather,Thunderstorms
2,6/30/2014,7/1/2014,We Energies,Southeast Wisconsin,"severe weather, thunderstorm",17:55:00,02:53:00,Severe Weather,Thunderstorms
4,6/24/2014,6/24/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",14:54:00,14:55:00,Physical Attack,Vandalism
5,6/19/2014,6/19/2014,Tennessee Valley Authority,"Nashville, Tennessee","vandalism, physical",08:47:00,08:48:00,Physical Attack,Vandalism
...,...,...,...,...,...,...,...,...,...
1642,5/2/2000,5/2/2000,Reliant Energy HL&P,"Houston, TX",severe weather,04:00:00,12:00:00,Severe Weather,
1646,3/18/2000,3/18/2000,Public Service of New Mexico,New Mexico,transmission interruption,19:08:00,19:08:00,Transmission Line Loss,
1647,3/18/2000,3/18/2000,El Paso Elec. Co.,Texas,transmission interruption,16:00:00,17:10:00,Transmission Line Loss,
1649,1/29/2000,2/3/2000,Duke Power Co.,South Carolina,"severe weather, winter storm",22:00:00,12:00:00,Ice Storm,


In [15]:
# Get the top 20 cities
top_20_cities = city_counts.head(20).index

# Create a boolean mask indicating whether each row's city is in the top 20 cities
mask = cleaned_power_outages['Geographic Areas'].isin(top_20_cities)

# Filter the DataFrame to keep only the rows with cities in the top 20
df_top_20 = cleaned_power_outages[mask]

# Print the resulting DataFrame
print("DataFrame with rows for the top 20 cities only:")
df_top_20


DataFrame with rows for the top 20 cities only:


Unnamed: 0,Date Event Began,Date of Restoration,Respondent,Geographic Areas,Tags,Time Event Began,Time of Restoration,Category,Sub Category
0,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,"severe weather, thunderstorm",20:00:00,18:30:00,Severe Weather,Thunderstorms
13,6/9/2014,6/9/2014,Peak Reliability,"Alberta, Canada",islanding,11:07:00,11:30:00,Electrical System Islanding,
18,6/3/2014,6/3/2014,Peak Reliability,"Alberta, Canada",islanding,15:32:00,15:59:00,Electrical System Islanding,
19,6/3/2014,6/3/2014,Lower Colorado River Authority,Texas,"vandalism, physical",01:38:00,01:43:00,Physical Attack,Vandalism
22,5/23/2014,5/25/2014,Duke Energy Progress,North Carolina,"vandalism, physical",15:00:00,19:00:00,Physical Attack,Vandalism
...,...,...,...,...,...,...,...,...,...
1628,8/6/2000,8/7/2000,Commonwealth Edison,Illinois,severe weather,16:00:00,12:00:00,Severe Weather,
1636,5/25/2000,6/2/2000,Duke Power,North Carolina,severe weather,10:00:00,06:00:00,Severe Weather,
1637,5/24/2000,5/25/2000,Entergy,Texas,voltage reduction,10:15:00,22:14:00,Voltage Elec Usage,
1640,5/9/2000,5/9/2000,Consolidated Edison Co. of New York,New York,load shedding,11:39:00,23:00:00,Energy Conservation,


In [16]:
# Filter rows containing 'weather' in the 'Category' column
df_top_20 = df_top_20[df_top_20['Category'].str.contains('weather', case=False, na=False)]

# Reset index if needed
df_top_20.reset_index(drop=True, inplace=True)

In [17]:
df_top_20

Unnamed: 0,Date Event Began,Date of Restoration,Respondent,Geographic Areas,Tags,Time Event Began,Time of Restoration,Category,Sub Category
0,6/30/2014,7/2/2014,Exelon Corporation/ComEd,Illinois,"severe weather, thunderstorm",20:00:00,18:30:00,Severe Weather,Thunderstorms
1,4/12/2014,4/15/2014,Detroit Edison Company,Michigan,severe weather,20:00:00,19:30:00,Severe Weather,
2,3/12/2014,3/13/2014,Duke Energy Carolinas,North Carolina,"severe weather, wind",19:35:00,12:00:00,Severe Weather,High Winds
3,2/12/2014,2/15/2014,Duke Energy Progress,North Carolina,"severe weather, winter storm",12:10:00,15:20:00,Severe Weather,Snow/Ice
4,2/5/2014,2/9/2014,FirstEnergy Corp: Met-Ed,Pennsylvania,"severe weather, winter storm",01:00:00,20:40:00,Severe Weather,Snow/Ice
5,2/5/2014,2/5/2014,Exelon Corporation/PECO,Pennsylvania,"severe weather, winter storm",05:00:00,05:01:00,Severe Weather,Snow/Ice
6,1/7/2014,1/8/2014,Duke Energy Progress,North Carolina,"severe weather, cold",16:15:00,13:20:00,Public Appeal due to Severe Weather,Cold
7,1/7/2014,1/7/2014,Duke Energy Progress,North Carolina,"severe weather, cold, public appeal",07:58:00,11:00:00,Voltage Reduction; Public Appeal due to Severe...,Cold
8,1/7/2014,1/7/2014,PJM Interconnection,Pennsylvania,"severe weather, cold",06:18:00,06:19:00,Severe Weather,Cold
9,1/6/2014,1/6/2014,"UGI Utilities, Inc",Pennsylvania,"severe weather, cold",19:50:00,20:49:00,Voltage Reduction due to Severe Weather,Cold


In [None]:
df_top_20.to_csv("PowerOuta.csv", index=False)