# Figures for IPSN 

Structure of the notebook: 

1) Clustering and analysis for DB, Agglom, STDB which are ultimately stored in the dataframes: *db, agglom, stdb*

2) Plots that Josh and Noah requested


Disclaimers:
- I plotted each clustering method separately instead of all compiled on one graph 
- The logical (grid) distance for the agglomerative clusters hasn't been computed, because the agglomerative CSV I pulled doesn't include `core_id`s and I didn't want to modify it.
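- Be careful about the order in which you run cells: several cells reuse the same variable names (e.g. `outages`, `month_clust`, `out`), so running them out of order can silently mix results from different clustering methods.
    - (The code is very repetitive because I pulled it from a number of different notebooks, but I did my best to make it user-friendly.)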
    
Let me know if you run into any problems! - Emily 

In [106]:
#pip install all of these libraries before beginning 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import time 
import datetime
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics 
import re
import pylab
from scipy.stats import norm
import geopandas as gpd 
import shapely.geometry as geometry
import shapely.ops as ops 
from functools import partial 
import pyproj

In [2]:
# Load the July 2018 outage transition points produced by outage_aggregator.py.
# The path is relative, so the parquet file must be in the notebook's working directory.
# Later cells read the columns 'outage_time', 'core_id', 'location_latitude' and
# 'location_longitude' from this frame.
outages = pd.read_parquet('part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet')
outages.head()

In [95]:
# Load all of the data from July 2018 (every report, not just the outage transition points).
# Only use this frame for the SAIFI and convex hull calculations.
# 'product_id', 'millis', 'last_plug_millis' and 'last_unplug_millis' are dropped right after loading.
pw = pd.read_parquet('part-00000-602cb425-c6be-40be-8024-aeb92fcb4315-c000.gz.parquet').drop(['product_id', 'millis', 'last_plug_millis', 'last_unplug_millis'], axis=1)
# NOTE(review): `~` keeps the rows where 'is_powered' is False, even though the result is
# named pw_powered -- confirm which polarity is intended.
pw_powered = pw[~pw['is_powered']]
pw.head()

In [26]:
# Load the logical grid distance CSV.
# NOTE(review): this is an absolute path on one user's machine; the cell fails anywhere else
# until the path is pointed at a local copy of grid_distance.csv.
logical = pd.read_csv('/Users/emilypaszkiewicz17/gridwatch-data-analysis/grid_distance.csv')
# reset_index() keeps the previous index as a new 'index' column.
logical = logical.reset_index()
logical

# DBSCAN(time only): Clustering and Analysis 

In [55]:
#DBSCAN(time) on a month's worth of data
# month_out is an alias of `outages` (not a copy), so the columns added here also
# appear on `outages`.
month_out = outages
# Constant second feature: the clustering is driven by 'outage_time' alone.
month_out['z'] = 0
X = month_out[['outage_time', 'z']]
out_cluster = StandardScaler().fit_transform(X)
db = DBSCAN(min_samples=2, eps=0.0001, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_

# Label -1 marks noise points; every other distinct label is one cluster.
label_values = labels.tolist()
no_noise = label_values.count(-1)
no_clusters = len(set(label_values)) - int(-1 in label_values)
no_noise, no_clusters

In [56]:
# Attach the DBSCAN label to every transition point, then collapse to one row per cluster
# whose cells hold the member values as plain Python lists.
month_out['labels'] = labels
month_out['core_id'] = outages['core_id']

by_label = month_out.groupby(['labels'])
month_clust = pd.DataFrame(by_label['outage_time'].apply(lambda x: x.tolist()))
month_clust['latitude'] = by_label['location_latitude'].apply(lambda x: x.tolist()).values
month_clust['longitude'] = by_label['location_longitude'].apply(lambda x: x.tolist()).values
month_clust['ids'] = by_label['core_id'].apply(lambda x: x.tolist()).values

# Drop the noise bucket by its label (-1) rather than by position: `iloc[1:]` would throw
# away a genuine cluster whenever DBSCAN reports no noise points at all.
# .copy() makes month_clust an independent frame so later column assignments are safe.
month_clust = month_clust[month_clust.index != -1].copy()

def find_range(lst):
    """Return the spread of ``lst``: its largest element minus its smallest."""
    smallest, largest = min(lst), max(lst)
    return largest - smallest

# Per-cluster spread (max - min) of the outage times and of the coordinates.
# Series.apply is used instead of np.vectorize: np.vectorize fixes the output dtype from
# the first element only and raises on an empty input.
month_clust['time_range'] = month_clust['outage_time'].apply(find_range)
month_clust['lat_range'] = month_clust['latitude'].apply(find_range)
month_clust['long_range'] = month_clust['longitude'].apply(find_range)

# Number of transition points per cluster (aligned on the cluster label index).
month_clust['cluster_size'] = month_out.groupby(['labels'])['outage_time'].count()
# First and last transition time of each cluster; used later as the outage time window.
month_clust['min_time'] = month_clust['outage_time'].apply(lambda x: min(x))
month_clust['max_time'] = month_clust['outage_time'].apply(lambda x: max(x))

month_clust.head()

In [57]:
#FYI this cell will take a few mins to run on a month's worth of data

#check if powered points are within the time range of the clusters
#we will start by computing the lat and long values for powered sensors that reported within the time range of the outage
#then, later we will compare these powered coords to see if they are also within the convex hull of the outage
# NOTE(review): `~` keeps the rows where 'is_powered' is False although the comments above
# talk about *powered* sensors -- confirm the intended polarity before trusting the percentages.
# .copy() so that overwriting 'time' does not write into a slice of `pw`.
powered = pw[~pw['is_powered']].copy()
# Convert the report timestamps to epoch seconds (interpreted as UTC) so they are
# comparable with min_time / max_time.
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Pull the columns out once; the previous version re-extracted `.values` for every
# (cluster, report) pair inside a Python double loop.
report_time = powered['time'].values
report_lat = powered['location_latitude'].values
report_long = powered['location_longitude'].values

within_time_lat = []
within_time_long = []
for window_start, window_end in zip(month_clust['min_time'].values, month_clust['max_time'].values):
    # One boolean mask per cluster selects every report inside [min_time, max_time].
    in_window = (report_time >= window_start) & (report_time <= window_end)
    within_time_lat.append(report_lat[in_window].tolist())
    within_time_long.append(report_long[in_window].tolist())

month_clust['within_time_lat'] = within_time_lat
month_clust['within_time_long'] = within_time_long






#for clusters of 2, duplicate the lat and long points so that these points can also be converted into a Polygon by geopandas
# The padded lists are built fresh instead of being written through `Series.values[i]`,
# which only works while `.values` happens to be a writable view of the Series.
update_within_lat = []
update_within_long = []
for lat_pts, long_pts in zip(month_clust['within_time_lat'], month_clust['within_time_long']):
    if len(lat_pts) == 0:
        # No report fell inside the window: use three placeholder points so a Polygon can still be built.
        lat_pts, long_pts = [0, 1, 2], [0, 1, 2]
    elif len(lat_pts) < 3:
        # One or two reports: repeat them to reach at least three vertices.
        lat_pts, long_pts = lat_pts*3, long_pts*3
    update_within_lat.append(lat_pts)
    update_within_long.append(long_pts)

# Outage clusters with fewer than three members get their coordinates doubled for the same reason.
is_small = (month_clust['cluster_size'] < 3).values
update_lat = [pts*2 if small else pts for pts, small in zip(month_clust['latitude'], is_small)]
update_long = [pts*2 if small else pts for pts, small in zip(month_clust['longitude'], is_small)]

# Wrap in Series on the cluster index so each list stays a single cell value.
update_lat = pd.Series(update_lat, index=month_clust.index)
update_long = pd.Series(update_long, index=month_clust.index)
update_within_lat = pd.Series(update_within_lat, index=month_clust.index)
update_within_long = pd.Series(update_within_long, index=month_clust.index)

month_clust['latitude'] = update_lat
month_clust['longitude'] = update_long
month_clust['within_time_long'] = update_within_long
month_clust['within_time_lat'] = update_within_lat





#create geodataframes to calculate convex hull
def unique_coords(coords):
    """Return the distinct entries of ``coords`` as an array, in first-seen order."""
    return pd.Series(coords).unique()

# One list of (longitude, latitude) vertices per cluster: the reports inside the outage
# time window and the outage members themselves.
powered_poly = [list(zip(lons, lats))
                for lons, lats in zip(month_clust['within_time_long'], month_clust['within_time_lat'])]
outage_poly = [list(zip(lons, lats))
               for lons, lats in zip(month_clust['longitude'], month_clust['latitude'])]

power = month_clust.copy()
out = month_clust.copy()
for frame in (power, out, month_clust):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# The CRS has to be a mapping; the previous `{'init', 'epsg:4326'}` was a set literal.
crs = {'init': 'epsg:4326'}

# `power` carries the polygon of the reports in the window, `out` the polygon of the outage.
powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=(powered_poly))

outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=(outage_poly))

# Once the polygons exist, reduce the vertex lists to their distinct coordinate pairs.
for frame in (power, out, month_clust):
    frame['powered_poly'] = (np.vectorize(unique_coords)(frame['powered_poly']))
    frame['outage_poly'] = (np.vectorize(unique_coords)(frame['outage_poly']))

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull



#calculate the convex hull
def in_convex_hull(powered_coords, geom):
    """Return the distinct coordinate pairs that fall inside the convex hull of ``geom``.

    :param powered_coords: iterable of (longitude, latitude) pairs to test;
    :param geom: shapely geometry whose convex hull is the test area;
    :return: array of the distinct pairs for which ``hull.contains(Point(pair))`` is true
        (shapely's ``contains`` does not count points lying exactly on the boundary).
    """
    hull = geom.convex_hull  # computed once per geometry instead of once per point
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    # dtype=object keeps the tuples intact and avoids the empty-Series dtype warning
    return pd.Series(inside, dtype=object).unique()

# The result list gets its own name; it used to be bound to `in_convex_hull`, which
# replaced the function with a list after the first run.
powered_within_outage = [in_convex_hull(coords, geom)
                         for coords, geom in zip(out['powered_poly'].values, out['geometry'].values)]
out['powered_within_outage'] = powered_within_outage

def outage_size(outage_coords):
    """Return the number of distinct entries in ``outage_coords``."""
    return len(pd.Series(outage_coords, dtype=object).unique())

out['powered_size_within_outage_area'] = [outage_size(coords) for coords in out['powered_within_outage']]
# Share (in %) of reporting coordinates inside the outage hull, relative to those plus the cluster members.
out['percent_pow_within_outage'] = (out['powered_size_within_outage_area'] / (out['powered_size_within_outage_area'] + out['cluster_size']))*100

# From here on `db` is the per-cluster GeoDataFrame (it was the fitted DBSCAN model before).
db = out

# Agglomerative: Cleaning and Analysis 

In [8]:
#reading outages from the pw_finalized_with_string dataframe from outage_aggregator.py and doing some data cleaning
spark_outages = pd.read_parquet('part-00000-1a77f616-ace0-482c-9ad1-bdc53a8286bc-c000.gz.parquet')

# Keep clusters of more than one sensor. .copy() makes the filtered frame independent so the
# column assignments below do not write into a slice.
spark_outages = spark_outages[spark_outages['cluster_size'] > 1].copy()
# spark_day is deliberately the SAME object as spark_outages: the agglomerative Euclidean
# cell further down explodes the parsed 'location' lists through `spark_outages`.
spark_day = spark_outages
# Raw strings: '\d' in a normal string literal is an invalid escape sequence.
spark_day['outage_times'] = spark_day['outage_times'].apply(lambda x: re.findall(r'\d+', x))
# The dot is escaped so it only matches a decimal point. The pattern captures digits only,
# so a minus sign is lost here; the sign of the longitudes is restored below.
spark_day['location'] = spark_day['location'].apply(lambda x: re.findall(r'\d\.\d+', x))

# One row per individual outage time, converted to int, then regrouped into a list per cluster.
spark_day_exploded = spark_day.explode('outage_times')
spark_day_exploded['outage_times'] = spark_day_exploded['outage_times'].apply(lambda x: int(x))
unexploded = spark_day_exploded.groupby('outage_time')['outage_times'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')

# The assignments below are positional and rely on both sides being sorted by 'outage_time'.
by_time = spark_day.sort_values('outage_time')
unexploded['location'] = by_time['location'].values

# Split the flat coordinate list: values above 1 are treated as latitudes, values below 1
# as longitudes, which are negated.
explode_loc = unexploded.explode('location')
explode_loc['location'] = explode_loc['location'].apply(lambda x: float(x))
lat = explode_loc[explode_loc['location'] > 1].copy()
long = explode_loc[explode_loc['location'] < 1].copy()
lat['latitude'] = lat['location']
long['longitude'] = long['location']*(-1)
lat = lat.groupby('outage_time')['latitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')
lat['longitude'] = long.groupby('outage_time')['longitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')['longitude']
lat['cluster_size'] = by_time['cluster_size'].values
lat['outage_times_stddev'] = by_time['outage_times_stddev'].values
lat['range'] = by_time['outage_times_range'].values
lat['outage_times'] = unexploded.sort_values('outage_time')['outage_times']

# From here on spark_day is the cleaned per-cluster frame; spark_outages keeps the parsed raw rows.
spark_day = lat
spark_day['min_time'] = spark_day['outage_times'].apply(lambda x: min(x))
spark_day['max_time'] = spark_day['outage_times'].apply(lambda x: max(x))

spark_day.head()

In [9]:
#FYI this cell will take a few mins to run on a month's worth of data

#Now let's compute the number of sensors within the convex hull of the outage
#we will start by computing the lat and long values for powered sensors that reported within the time range of the outage
#then, later we will compare these powered coords to see if they are also within the convex hull of the outage
# NOTE(review): `~` keeps the rows where 'is_powered' is False although the comments above
# talk about *powered* sensors -- confirm the intended polarity before trusting the percentages.
# .copy() so that overwriting 'time' does not write into a slice of `pw`.
powered = pw[~pw['is_powered']].copy()
# Convert the report timestamps to epoch seconds (interpreted as UTC) so they are
# comparable with min_time / max_time.
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Pull the columns out once; the previous version re-extracted `.values` for every
# (cluster, report) pair inside a Python double loop.
report_time = powered['time'].values
report_lat = powered['location_latitude'].values
report_long = powered['location_longitude'].values

within_time_lat = []
within_time_long = []
for window_start, window_end in zip(spark_day['min_time'].values, spark_day['max_time'].values):
    # One boolean mask per cluster selects every report inside [min_time, max_time].
    in_window = (report_time >= window_start) & (report_time <= window_end)
    within_time_lat.append(report_lat[in_window].tolist())
    within_time_long.append(report_long[in_window].tolist())

spark_day['within_time_lat'] = within_time_lat
spark_day['within_time_long'] = within_time_long




#for clusters of 2, duplicate the lat and long points so that these points can also be converted into a Polygon by geopandas
# The padded lists are built fresh instead of being written through `Series.values[i]`,
# which only works while `.values` happens to be a writable view of the Series.
update_within_lat = []
update_within_long = []
for lat_pts, long_pts in zip(spark_day['within_time_lat'], spark_day['within_time_long']):
    if len(lat_pts) == 0:
        # No report fell inside the window: use three placeholder points so a Polygon can still be built.
        lat_pts, long_pts = [0, 1, 2], [0, 1, 2]
    elif len(lat_pts) < 3:
        # One or two reports: repeat them to reach at least three vertices.
        lat_pts, long_pts = lat_pts*3, long_pts*3
    update_within_lat.append(lat_pts)
    update_within_long.append(long_pts)

# Outage clusters with fewer than three members get their coordinates doubled for the same reason.
is_small = (spark_day['cluster_size'] < 3).values
update_lat = [pts*2 if small else pts for pts, small in zip(spark_day['latitude'], is_small)]
update_long = [pts*2 if small else pts for pts, small in zip(spark_day['longitude'], is_small)]

# Wrap in Series on the cluster index so each list stays a single cell value.
update_lat = pd.Series(update_lat, index=spark_day.index)
update_long = pd.Series(update_long, index=spark_day.index)
update_within_lat = pd.Series(update_within_lat, index=spark_day.index)
update_within_long = pd.Series(update_within_long, index=spark_day.index)

spark_day['latitude'] = update_lat
spark_day['longitude'] = update_long
spark_day['within_time_long'] = update_within_long
spark_day['within_time_lat'] = update_within_lat




#create geodataframes to calculate convex hull
def unique_coords(coords):
    """Return the distinct entries of ``coords`` as an array, in first-seen order."""
    return pd.Series(coords).unique()

# One list of (longitude, latitude) vertices per cluster: the reports inside the outage
# time window and the outage members themselves.
powered_poly = [list(zip(lons, lats))
                for lons, lats in zip(spark_day['within_time_long'], spark_day['within_time_lat'])]
outage_poly = [list(zip(lons, lats))
               for lons, lats in zip(spark_day['longitude'], spark_day['latitude'])]

power = spark_day.copy()
out = spark_day.copy()
for frame in (power, out, spark_day):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# The CRS has to be a mapping; the previous `{'init', 'epsg:4326'}` was a set literal.
crs = {'init': 'epsg:4326'}

# `power` carries the polygon of the reports in the window, `out` the polygon of the outage.
powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=(powered_poly))

outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=(outage_poly))

# Once the polygons exist, reduce the vertex lists to their distinct coordinate pairs.
for frame in (power, out, spark_day):
    frame['powered_poly'] = (np.vectorize(unique_coords)(frame['powered_poly']))
    frame['outage_poly'] = (np.vectorize(unique_coords)(frame['outage_poly']))

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull

#calculate the convex hull
# This cell previously contained the definition and the computation below twice in a row;
# the second copy recomputed exactly the same column, so only one is kept.
def in_convex_hull(powered_coords, geom):
    """Return the distinct coordinate pairs that fall inside the convex hull of ``geom``.

    :param powered_coords: iterable of (longitude, latitude) pairs to test;
    :param geom: shapely geometry whose convex hull is the test area;
    :return: array of the distinct pairs for which ``hull.contains(Point(pair))`` is true
        (shapely's ``contains`` does not count points lying exactly on the boundary).
    """
    hull = geom.convex_hull  # computed once per geometry instead of once per point
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    # dtype=object keeps the tuples intact and avoids the empty-Series dtype warning
    return pd.Series(inside, dtype=object).unique()

# The result list gets its own name; it used to be bound to `in_convex_hull`, which
# replaced the function with a list after the first run.
powered_within_outage = [in_convex_hull(coords, geom)
                         for coords, geom in zip(out['powered_poly'].values, out['geometry'].values)]
out['powered_within_outage'] = powered_within_outage

def outage_size(outage_coords):
    """Return the number of distinct entries in ``outage_coords``."""
    return len(pd.Series(outage_coords, dtype=object).unique())

out['powered_size_within_outage_area'] = [outage_size(coords) for coords in out['powered_within_outage']]
# Share (in %) of reporting coordinates inside the outage hull, relative to those plus the cluster members.
out['percent_pow_within_outage'] = (out['powered_size_within_outage_area'] / (out['powered_size_within_outage_area'] + out['cluster_size']))*100


agglom = out

# STDBSCAN: Clustering and Analysis 

In [16]:
#run this to initiate the STDBSCAN class 

class STDBSCAN(object):
    """ST-DBSCAN: DBSCAN-style clustering in which neighbours must be close in space AND time."""

    def __init__(self, col_lat, col_lon, col_time, spatial_threshold=500.0,
                 temporal_threshold=60.0, min_neighbors=15):
        """
        Python st-dbscan implementation.
        :param col_lat: Latitude column name;
        :param col_lon:  Longitude column name;
        :param col_time: Date time column name;
        :param spatial_threshold: Maximum spatial distance between neighbours,
             in the units of the coordinate columns (meters once ``projection``
             has been applied);
        :param temporal_threshold: Maximum time distance between neighbours (seconds);
        :param min_neighbors: Minimum number of neighbours (the point itself
             excluded) a point needs to be a core point;
        """
        self.col_lat = col_lat
        self.col_lon = col_lon
        self.col_time = col_time
        self.spatial_threshold = spatial_threshold
        self.temporal_threshold = temporal_threshold
        self.min_neighbors = min_neighbors

    def projection(self, df, p1_str='epsg:4326', p2_str='epsg:3395'):
        """
        Convert the longitude/latitude columns of ``df`` from the ``p1_str``
        coordinate system to the ``p2_str`` one. The right epsg code has to be
        selected by the caller.

        ``df`` is modified in place (both coordinate columns are overwritten)
        and also returned. The frame is no longer printed here.

        NOTE(review): ``pyproj.Proj(init=...)`` and ``pyproj.transform`` are the
        legacy pyproj API and are left as they were; this method is not called
        anywhere in the visible notebook (the call in ``test_time`` is commented
        out), so the conversion should be validated before it is relied upon.
        """
        p1 = pyproj.Proj(init=p1_str)
        p2 = pyproj.Proj(init=p2_str)
        lon = df[self.col_lon].values
        lat = df[self.col_lat].values
        x1, y1 = p1(lon, lat)
        x2, y2 = pyproj.transform(p1, p2, x1, y1, radians=True)
        df[self.col_lon] = x2
        df[self.col_lat] = y2

        return df

    def _retrieve_neighbors(self, index_center, matrix):
        """
        Return the row numbers of all points within both thresholds of the
        point at row ``index_center`` (the point itself excluded).

        ``matrix`` columns are: 0 = longitude, 1 = latitude, 2 = time,
        3 = cluster label, 4 = row number (as built by ``run``).
        """
        center_point = matrix[index_center, :]

        # filter by time
        window = timedelta(seconds=self.temporal_threshold)
        min_time = center_point[2] - window
        max_time = center_point[2] + window
        matrix = matrix[(matrix[:, 2] >= min_time) &
                        (matrix[:, 2] <= max_time), :]
        # filter by distance (squared distances, so no square root is needed)
        d_lon = matrix[:, 0] - center_point[0]
        d_lat = matrix[:, 1] - center_point[1]
        tmp = d_lon*d_lon + d_lat*d_lat
        neighborhood = matrix[tmp <= (
            self.spatial_threshold*self.spatial_threshold), 4].tolist()
        # The centre normally matches itself; it can be missing when its own time
        # or coordinates are NaT/NaN, in which case there is nothing to remove.
        if index_center in neighborhood:
            neighborhood.remove(index_center)

        return neighborhood

    def run(self, df):
        """
        INPUTS:
            df={o1,o2,...,on} Set of objects;
        OUTPUT:
            C = {c1,c2,...,ck} Set of clusters

        Returns a new frame holding the longitude, latitude and time columns
        plus a 'cluster' column (-1 = noise, 1..k = cluster label). The input
        ``df`` is not modified.
        """
        cluster_label = 0
        noise = -1
        unmarked = 777777  # sentinel label for "not visited yet"
        stack = []

        # initial setup: working copy with the label column and a row-number column
        work = df[[self.col_lon, self.col_lat, self.col_time]].assign(
            cluster=unmarked, index=range(df.shape[0]))
        matrix = work.values
        result = work.drop(columns='index')

        # for each point in database
        for index in range(matrix.shape[0]):
            if matrix[index, 3] != unmarked:
                continue
            neighborhood = self._retrieve_neighbors(index, matrix)

            if len(neighborhood) < self.min_neighbors:
                matrix[index, 3] = noise
                continue

            # found a core point: open a new cluster and label the point
            cluster_label += 1
            matrix[index, 3] = cluster_label

            # assign core's label to its neighborhood
            for neig_index in neighborhood:
                matrix[neig_index, 3] = cluster_label
                stack.append(neig_index)  # append neighbors to stack

            # find new neighbors from core point neighborhood
            while len(stack) > 0:
                current_point_index = stack.pop()
                new_neighborhood = self._retrieve_neighbors(
                    current_point_index, matrix)

                # only a new core point expands the cluster
                if len(new_neighborhood) < self.min_neighbors:
                    continue
                for neig_index in new_neighborhood:
                    # points already owned by a cluster are left alone
                    if matrix[neig_index, 3] in (noise, unmarked):
                        matrix[neig_index, 3] = cluster_label
                        stack.append(neig_index)

        result['cluster'] = matrix[:, 3]
        return result

In [17]:
#this is where you actually adjust the parameters 

def test_time(df, spatial_threshold=0.03, temporal_threshold=60, min_neighbors=1):
    '''
    Cluster ``df`` with ST-DBSCAN and return the labelled frame from ``STDBSCAN.run``.

    ``df`` needs the columns 'location_latitude', 'location_longitude' and 'time'.
    The defaults reproduce the previously hard-coded settings, so ``test_time(df)``
    behaves as before; pass keyword arguments to try other parameters.

    :param spatial_threshold: maximum spatial distance between neighbours. The
        projection step is disabled, so this is measured in the raw units of the
        coordinate columns (decimal degrees if they hold plain lat/long -- TODO
        confirm), not in meters;
    :param temporal_threshold: maximum time distance between neighbours (seconds);
    :param min_neighbors: minimum number of neighbours for a core point.
    '''
    st_dbscan = STDBSCAN(col_lat='location_latitude', col_lon='location_longitude',
                         col_time='time', spatial_threshold=spatial_threshold,
                         temporal_threshold=temporal_threshold,
                         min_neighbors=min_neighbors)
    # Projection to a metric CRS is intentionally left disabled; to enable it, pick the
    # right epsg code for the data first:
    #df = st_dbscan.projection(df, p1_str='epsg:4326', p2_str='epsg:32630')
    return st_dbscan.run(df)


In [24]:
#these are the transition points of the outages
outages = pd.read_parquet('part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet')
# Epoch seconds -> naive UTC timestamps in one vectorized step. The int64 cast keeps whole
# seconds only, matching the former strftime('%Y-%m-%d %H:%M:%S') round-trip, which relied
# on datetime.utcfromtimestamp and infer_datetime_format (both deprecated).
outages['time'] = pd.to_datetime(outages['outage_time'].astype('int64'), unit='s')

# Run ST-DBSCAN and re-attach the sensor ids (aligned on the row index).
stdb_clustered = pd.DataFrame(test_time(outages))
stdb_clustered['core_id'] = outages['core_id']

def find_range(lst):
    """Return the spread of ``lst``: its largest element minus its smallest."""
    smallest, largest = min(lst), max(lst)
    return largest - smallest

# Back to integer epoch seconds (timestamps interpreted as UTC) for the range arithmetic below.
stdb_clustered['time'] = stdb_clustered['time'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# One row per cluster; each cell holds the member values as a plain Python list.
by_cluster = stdb_clustered.groupby(['cluster'])
month_clust = pd.DataFrame(by_cluster['time'].apply(lambda x: x.tolist()))
month_clust['latitude'] = by_cluster['location_latitude'].apply(lambda x: x.tolist()).values
month_clust['longitude'] = by_cluster['location_longitude'].apply(lambda x: x.tolist()).values
month_clust['ids'] = by_cluster['core_id'].apply(lambda x: x.tolist()).values
# Drop the noise bucket by its label (-1) rather than by position: `iloc[1:]` would throw
# away cluster 1 whenever ST-DBSCAN reports no noise points.
# .copy() makes month_clust an independent frame so the assignments below are safe.
month_clust = month_clust[month_clust.index != -1].copy()

# Per-cluster spread (max - min) of time and coordinates. Series.apply replaces
# np.vectorize, which fixes the dtype from the first element and raises on empty input.
month_clust['time_range'] = month_clust['time'].apply(find_range)
month_clust['lat_range'] = month_clust['latitude'].apply(find_range)
month_clust['long_range'] = month_clust['longitude'].apply(find_range)

# Number of transition points per cluster (aligned on the cluster label index).
month_clust['cluster_size'] = by_cluster['time'].count()
month_clust['min_time'] = month_clust['time'].apply(lambda x: min(x))
month_clust['max_time'] = month_clust['time'].apply(lambda x: max(x))

month_clust.head()

In [25]:
#FYI this cell will take a few mins to run on a month's worth of data

#check if powered points are within the time range of the clusters
#we will start by computing the lat and long values for powered sensors that reported within the time range of the outage
#then, later we will compare these powered coords to see if they are also within the convex hull of the outage
# NOTE(review): `~` keeps the rows where 'is_powered' is False although the comments above
# talk about *powered* sensors -- confirm the intended polarity before trusting the percentages.
# .copy() so that overwriting 'time' does not write into a slice of `pw`.
powered = pw[~pw['is_powered']].copy()
# Convert the report timestamps to epoch seconds (interpreted as UTC) so they are
# comparable with min_time / max_time.
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Pull the columns out once; the previous version re-extracted `.values` for every
# (cluster, report) pair inside a Python double loop.
report_time = powered['time'].values
report_lat = powered['location_latitude'].values
report_long = powered['location_longitude'].values

within_time_lat = []
within_time_long = []
for window_start, window_end in zip(month_clust['min_time'].values, month_clust['max_time'].values):
    # One boolean mask per cluster selects every report inside [min_time, max_time].
    in_window = (report_time >= window_start) & (report_time <= window_end)
    within_time_lat.append(report_lat[in_window].tolist())
    within_time_long.append(report_long[in_window].tolist())

month_clust['within_time_lat'] = within_time_lat
month_clust['within_time_long'] = within_time_long






#for clusters of 2, duplicate the lat and long points so that these points can also be converted into a Polygon by geopandas
# The padded lists are built fresh instead of being written through `Series.values[i]`,
# which only works while `.values` happens to be a writable view of the Series.
update_within_lat = []
update_within_long = []
for lat_pts, long_pts in zip(month_clust['within_time_lat'], month_clust['within_time_long']):
    if len(lat_pts) == 0:
        # No report fell inside the window: use three placeholder points so a Polygon can still be built.
        lat_pts, long_pts = [0, 1, 2], [0, 1, 2]
    elif len(lat_pts) < 3:
        # One or two reports: repeat them to reach at least three vertices.
        lat_pts, long_pts = lat_pts*3, long_pts*3
    update_within_lat.append(lat_pts)
    update_within_long.append(long_pts)

# Outage clusters with fewer than three members get their coordinates doubled for the same reason.
is_small = (month_clust['cluster_size'] < 3).values
update_lat = [pts*2 if small else pts for pts, small in zip(month_clust['latitude'], is_small)]
update_long = [pts*2 if small else pts for pts, small in zip(month_clust['longitude'], is_small)]

# Wrap in Series on the cluster index so each list stays a single cell value.
update_lat = pd.Series(update_lat, index=month_clust.index)
update_long = pd.Series(update_long, index=month_clust.index)
update_within_lat = pd.Series(update_within_lat, index=month_clust.index)
update_within_long = pd.Series(update_within_long, index=month_clust.index)

month_clust['latitude'] = update_lat
month_clust['longitude'] = update_long
month_clust['within_time_long'] = update_within_long
month_clust['within_time_lat'] = update_within_lat





#create geodataframes to calculate convex hull
def unique_coords(coords):
    """Return the distinct entries of ``coords`` as an array, in first-seen order."""
    return pd.Series(coords).unique()

# One list of (longitude, latitude) vertices per cluster: the reports inside the outage
# time window and the outage members themselves.
powered_poly = [list(zip(lons, lats))
                for lons, lats in zip(month_clust['within_time_long'], month_clust['within_time_lat'])]
outage_poly = [list(zip(lons, lats))
               for lons, lats in zip(month_clust['longitude'], month_clust['latitude'])]

power = month_clust.copy()
out = month_clust.copy()
for frame in (power, out, month_clust):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# The CRS has to be a mapping; the previous `{'init', 'epsg:4326'}` was a set literal.
crs = {'init': 'epsg:4326'}

# `power` carries the polygon of the reports in the window, `out` the polygon of the outage.
powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=(powered_poly))

outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=(outage_poly))

# Once the polygons exist, reduce the vertex lists to their distinct coordinate pairs.
for frame in (power, out, month_clust):
    frame['powered_poly'] = (np.vectorize(unique_coords)(frame['powered_poly']))
    frame['outage_poly'] = (np.vectorize(unique_coords)(frame['outage_poly']))

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull



#calculate the convex hull
def in_convex_hull(powered_coords, geom):
    """Return the distinct coordinate pairs that fall inside the convex hull of ``geom``.

    :param powered_coords: iterable of (longitude, latitude) pairs to test;
    :param geom: shapely geometry whose convex hull is the test area;
    :return: array of the distinct pairs for which ``hull.contains(Point(pair))`` is true
        (shapely's ``contains`` does not count points lying exactly on the boundary).
    """
    hull = geom.convex_hull  # computed once per geometry instead of once per point
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    # dtype=object keeps the tuples intact and avoids the empty-Series dtype warning
    return pd.Series(inside, dtype=object).unique()

# The result list gets its own name; it used to be bound to `in_convex_hull`, which
# replaced the function with a list after the first run.
powered_within_outage = [in_convex_hull(coords, geom)
                         for coords, geom in zip(out['powered_poly'].values, out['geometry'].values)]
out['powered_within_outage'] = powered_within_outage

def outage_size(outage_coords):
    """Return the number of distinct entries in ``outage_coords``."""
    return len(pd.Series(outage_coords, dtype=object).unique())

out['powered_size_within_outage_area'] = [outage_size(coords) for coords in out['powered_within_outage']]
# Share (in %) of reporting coordinates inside the outage hull, relative to those plus the cluster members.
out['percent_pow_within_outage'] = (out['powered_size_within_outage_area'] / (out['powered_size_within_outage_area'] + out['cluster_size']))*100


stdb = out

# Plot 1: Trimodal Distribution 

In [73]:
#DBSCAN
# `db` has one row per cluster, so the size of each 'cluster_size' group IS the number of
# clusters of that size. The previous nunique() over the list-valued 'ids' column counted
# distinct id lists, which merges clusters made of the same sensors.
# The count column keeps its old name 'ids' for compatibility.
db_clust_sizes = db.groupby('cluster_size').size().reset_index(name='ids')
sns.barplot(x='cluster_size', y='ids', data=db_clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('DBSCAN: Cluster Size vs. Number of Clusters of this Size');

In [77]:
#AGGLOM
# `agglom` has one row per cluster, so the size of each 'cluster_size' group IS the number
# of clusters of that size. The previous nunique() over the list-valued 'latitude' column
# counted distinct coordinate lists, which merges clusters located at the same sensors.
# The count column keeps its old name 'latitude' for compatibility.
agglom_clust_sizes = agglom.groupby('cluster_size').size().reset_index(name='latitude')
sns.barplot(x='cluster_size', y='latitude', data=agglom_clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('Agglomerative: Cluster Size vs. Number of Clusters of this Size');

In [76]:
#STDBSCAN
# `stdb` has one row per cluster, so the size of each 'cluster_size' group IS the number
# of clusters of that size. The previous nunique() over the list-valued 'ids' column
# counted distinct id lists, which merges clusters made of the same sensors.
# The count column keeps its old name 'ids' for compatibility.
stdb_clust_sizes = stdb.groupby('cluster_size').size().reset_index(name='ids')
sns.barplot(x='cluster_size', y='ids', data=stdb_clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('STDBSCAN: Cluster Size vs. Number of Clusters of this Size');

# Plot 2a: Low Voltage Success (Euclidean)

In [78]:
#DBSCAN Euclidean 

#now let's make some new dataframes so that we can calculate the distances between the sensors of 2-3 that were clustered together
# Per-label, per-column counts of the transition points.
c = month_out.groupby('labels').count()
# Cluster sizes come from 'outage_time' (the column that was clustered on) rather than
# 'time', which only exists on `outages` after the STDBSCAN cell has added it.
# Label -1 is DBSCAN noise, not a cluster, so it is excluded.
cluster_sizes = c['outage_time'].drop(-1, errors='ignore')
pair_index = cluster_sizes[cluster_sizes == 2].index

def calc_dist(dist1, dist2):
    """Return the distance in meters between two longitude/latitude points.

    :param dist1: shapely Point with x = longitude and y = latitude (EPSG:4326 degrees);
    :param dist2: second point, same convention;
    :return: geodesic distance on the WGS84 ellipsoid, in meters.

    The previous version projected both points with ``proj='aea'`` and measured the planar
    distance. That projection was given no standard parallels or origin, and an equal-area
    projection does not preserve distances, so the geodesic is computed directly instead.
    """
    _, _, meters = pyproj.Geod(ellps='WGS84').inv(dist1.x, dist1.y, dist2.x, dist2.y)
    return meters

#distances for clusters of 2:
def _cluster_members(frame, key, size):
    """Split ``frame`` (clusters of exactly ``size`` rows, identified by ``key``) into
    ``size`` GeoSeries; the k-th one holds the k-th member of every cluster, ordered by ``key``.

    A stable sort plus strided slicing is used instead of groupby first()/nth(1)/last()
    because the ordering of ``nth`` results differs between pandas versions.
    """
    ordered = frame.sort_values(key, kind='mergesort')  # stable: keeps row order inside a cluster
    assert (ordered.groupby(key).size() == size).all(), 'every cluster must have exactly `size` rows'
    points = ordered.set_index(key)['geometry']
    return [points.iloc[k::size] for k in range(size)]

def _share_below(distances, cutoff=550):
    """Return the fraction of ``distances`` strictly below ``cutoff`` (NaN if there are none)."""
    distances = np.asarray(distances, dtype=float)
    return float((distances < cutoff).mean()) if distances.size else float('nan')

# otypes lets the vectorized call cope with an empty selection.
pairwise_dist = np.vectorize(calc_dist, otypes=[float])

pairs = month_out[month_out['labels'].isin(pair_index)].copy()
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.location_longitude, pairs.location_latitude), crs={'init':'epsg:4326'})
# The points stay in lon/lat (EPSG:4326); the former crs={'init':'aea'} is not a valid CRS.
dist_1, dist_2 = _cluster_members(gdf, 'labels', 2)
db_distances = pairwise_dist(dist_1, dist_2)

#distances for clusters of 3:
# Sizes are read from 'outage_time'; label -1 (noise) is never treated as a cluster.
trio_index = c.index[(c['outage_time'] == 3) & (c.index != -1)]
trios = month_out[month_out['labels'].isin(trio_index)].copy()
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude), crs={'init':'epsg:4326'})
dist_1_t, dist_2_t, dist_3_t = _cluster_members(gdf_trios, 'labels', 3)

# One row per trio, one column per side of the triangle.
db_trios_distances = pd.DataFrame({
    '1->2': pairwise_dist(dist_1_t, dist_2_t),
    '2->3': pairwise_dist(dist_2_t, dist_3_t),
    '3->1': pairwise_dist(dist_3_t, dist_1_t),
})

#percentage under 550m for pairs:
# (the values are fractions between 0 and 1, not percentages)
db_pair_percent_under_550 = _share_below(db_distances)

#percentage under 550m for trios:
db_dist_for_3 = np.concatenate([db_trios_distances['1->2'].values,
                                db_trios_distances['2->3'].values,
                                db_trios_distances['3->1'].values])
db_trio_percent_under_550 = _share_below(db_dist_for_3)

db_pair_percent_under_550, db_trio_percent_under_550

In [44]:
#AGGLOMERATIVE EUCLIDEAN

#now let's make some new dataframes so that we can calculate the distances between the sensors of 2-3 that were clustered together
# NOTE: relies on the agglomerative cleaning cell having already parsed
# spark_outages['location'] into lists of coordinate strings.
t = spark_outages[spark_outages['cluster_size'] <= 3]
explode_loc = t.explode('location')
explode_loc['location'] = explode_loc['location'].apply(lambda x: float(x))
# Values above 1 are treated as latitudes, values below 1 as longitudes (negated below).
# .copy() so the new columns are added to independent frames rather than to slices.
lat = explode_loc[explode_loc['location'] > 1].copy()
long = explode_loc[explode_loc['location'] < 1]
lat['latitude'] = lat['location']
t = lat[['outage_time', 'outage_times', 'cluster_size', 'latitude']].copy()
# Aligned on the (repeated) row index shared by the latitude and longitude rows.
t['longitude'] = long['location']*(-1)

def calc_dist(dist1, dist2):
    """Return the distance in meters between two longitude/latitude points.

    :param dist1: shapely Point with x = longitude and y = latitude (EPSG:4326 degrees);
    :param dist2: second point, same convention;
    :return: geodesic distance on the WGS84 ellipsoid, in meters.

    The previous version projected both points with ``proj='aea'`` and measured the planar
    distance. That projection was given no standard parallels or origin, and an equal-area
    projection does not preserve distances, so the geodesic is computed directly instead.
    """
    _, _, meters = pyproj.Geod(ellps='WGS84').inv(dist1.x, dist1.y, dist2.x, dist2.y)
    return meters

#distances for sensors of cluster size 2 
pairs = t[t['cluster_size'] == 2]
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.longitude, pairs.latitude), crs={'init':'epsg:4326'})
# The two sensors of a pair are the first and last rows of its 'outage_time' group.
# The grouped points are still lon/lat (calc_dist does the projecting), so they carry gdf's CRS;
# the previous crs={'init':'aea'} tag mislabelled them and is not a valid init code.
pair_points = gdf.groupby('outage_time')['geometry']
dist_1 = gpd.GeoSeries(pair_points.first(), crs=gdf.crs)
dist_2 = gpd.GeoSeries(pair_points.last(), crs=gdf.crs)
agglom_distances = (np.vectorize(calc_dist)(dist_1, dist_2))

#distances for sensors of cluster size 3 
trios = t[t['cluster_size'] == 3]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.longitude, trios.latitude), crs={'init':'epsg:4326'})
# NOTE(review): on pandas >= 2.0, nth() keeps the original row index instead of the group key,
# so verify that the three series below line up position by position.
trio_points = gdf_trios.groupby('outage_time')['geometry']
dist_1_t = gpd.GeoSeries(trio_points.first(), crs=gdf_trios.crs)
dist_2_t = gpd.GeoSeries(trio_points.nth(1), crs=gdf_trios.crs)
dist_3_t = gpd.GeoSeries(trio_points.last(), crs=gdf_trios.crs)

# One row per trio, one column per edge of the triangle (1->2, 2->3, 3->1)
agglom_trios_distances = pd.DataFrame(np.vectorize(calc_dist)(dist_1_t, dist_2_t)).rename(columns={0: '1->2'})
agglom_trios_distances['2->3'] = (np.vectorize(calc_dist)(dist_2_t, dist_3_t))
agglom_trios_distances['3->1'] = (np.vectorize(calc_dist)(dist_3_t, dist_1_t))

#I would use agglom_dist_for_3 (all three edges of every trio pooled together) for measuring clustering success 
agglom_dist_for_3 = np.array(
    list(agglom_trios_distances['1->2'].values)
    + list(agglom_trios_distances['2->3'].values)
    + list(agglom_trios_distances['3->1'].values))

#now let's calculate the fraction (0-1) of distances under 550 
#fraction under the cutoff for pairs: 
agglom_pair_percent_under_550 = len(agglom_distances[agglom_distances < 550])/len(agglom_distances)

#fraction under the cutoff for trios: 
agglom_trio_percent_under_550 = len(agglom_dist_for_3[agglom_dist_for_3 < 550])/len(agglom_dist_for_3)

agglom_pair_percent_under_550, agglom_trio_percent_under_550

In [45]:
#STDBSCAN Euclidean 

# Per-cluster counts of non-null values in every column; the 'time' count is used as the cluster size.
# Clusters whose 'time' count is exactly 2 are the sensor pairs measured below.
c = stdb_clustered.groupby('cluster').count()
pair_index = c.loc[c['time'] == 2].index

def calc_dist(dist1, dist2): 
    """Distance between two geometries after projecting both from EPSG:4326 to 'aea'.

    dist1, dist2: shapely geometries whose coordinates are in EPSG:4326.
    Returns the distance between the two projected geometries, in the units of
    the 'aea' projection (meters, per the original author's note).
    """
    # One projection callable, reused for both inputs.
    to_aea = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(proj='aea'))
    projected_first = ops.transform(to_aea, dist1)
    projected_second = ops.transform(to_aea, dist2)
    return projected_first.distance(projected_second)

#distances for clusters of 2: 
pairs = stdb_clustered[stdb_clustered['cluster'].isin(pair_index)]
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.location_longitude, pairs.location_latitude), crs={'init':'epsg:4326'})
# The two sensors of a pair are the first and last rows of its 'cluster' group.
# The grouped points are still lon/lat (calc_dist does the projecting), so they carry gdf's CRS;
# the previous crs={'init':'aea'} tag mislabelled them and is not a valid init code.
pair_points = gdf.groupby('cluster')['geometry']
dist_1 = gpd.GeoSeries(pair_points.first(), crs=gdf.crs)
dist_2 = gpd.GeoSeries(pair_points.last(), crs=gdf.crs)
stdb_distances = (np.vectorize(calc_dist)(dist_1, dist_2))

#distances for clusters of 3: 
trio_index = c[c['time'] == 3].index
trios = stdb_clustered[stdb_clustered['cluster'].isin(trio_index)]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude), crs={'init':'epsg:4326'})
# NOTE(review): on pandas >= 2.0, nth() keeps the original row index instead of the group key,
# so verify that the three series below line up position by position.
trio_points = gdf_trios.groupby('cluster')['geometry']
dist_1_t = gpd.GeoSeries(trio_points.first(), crs=gdf_trios.crs)
dist_2_t = gpd.GeoSeries(trio_points.nth(1), crs=gdf_trios.crs)
dist_3_t = gpd.GeoSeries(trio_points.last(), crs=gdf_trios.crs)

# One row per trio, one column per edge of the triangle (1->2, 2->3, 3->1)
stdb_trios_distances = pd.DataFrame(np.vectorize(calc_dist)(dist_1_t, dist_2_t)).rename(columns={0: '1->2'})
stdb_trios_distances['2->3'] = (np.vectorize(calc_dist)(dist_2_t, dist_3_t))
stdb_trios_distances['3->1'] = (np.vectorize(calc_dist)(dist_3_t, dist_1_t))

#fraction (0-1) of pair distances under 550: 
stdb_pair_percent_under_550= len(stdb_distances[stdb_distances < 550])/len(stdb_distances)

#fraction of trio edge distances under 550 (all three edges of every trio pooled together): 
stdb_dist_for_3 = np.array(
    list(stdb_trios_distances['1->2'].values)
    + list(stdb_trios_distances['2->3'].values)
    + list(stdb_trios_distances['3->1'].values))
stdb_trio_percent_under_550 = len(stdb_dist_for_3[stdb_dist_for_3 < 550])/len(stdb_dist_for_3)

stdb_pair_percent_under_550, stdb_trio_percent_under_550

# Plot 2b: Low Voltage Success (Logical)

In [60]:
#DBSCAN Logical Grid 

# Explicit copies: the distance columns added below were previously assigned onto filtered
# slices of `db`, which is chained assignment and triggers pandas' SettingWithCopyWarning.
two_ids = db[db['cluster_size'] == 2].copy()
three_ids = db[db['cluster_size'] == 3].copy()
pair_logical_dist=[]
trio_logical_dist_1=[]
trio_logical_dist_2=[]
trio_logical_dist_3=[]

# Pairs: look up the logical grid distance between the two ids of every cluster.
# The lookup is ordered (level_0 == first id, level_1 == second id); .values[0] raises
# IndexError when `logical` has no row for that ordered pair.
for ids in two_ids['ids']:
    match = logical[(logical['level_0'] == ids[0]) & (logical['level_1'] == ids[1])]
    pair_logical_dist.append(match['logical_grid_distance'].values[0])

# Trios: the same lookup for each edge of the triangle (1->2, 2->3, 3->1).
trio_edges = (
    (0, 1, trio_logical_dist_1),
    (1, 2, trio_logical_dist_2),
    (2, 0, trio_logical_dist_3),
)
for ids in three_ids['ids']:
    for start, end, edge_distances in trio_edges:
        match = logical[(logical['level_0'] == ids[start]) & (logical['level_1'] == ids[end])]
        edge_distances.append(match['logical_grid_distance'].values[0])

two_ids['logical_distance']= pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3
db_logical_pairs = two_ids 
db_logical_trios = three_ids


#calculate the fraction (0-1) of outage pairs that are under the same transformer (logical distance == 1) 
db_pair_percent_under_same_transformer = len(db_logical_pairs[db_logical_pairs['logical_distance'] ==1])/len(db_logical_pairs)

#calculate the fraction of outage trio edges that are under the same transformer (all three edges pooled) 
log_dist_for_3 = list(db_logical_trios['log_dist_1']) + list(db_logical_trios['log_dist_2']) + list(db_logical_trios['log_dist_3'])
log_dist_for_3 = pd.Series(log_dist_for_3)
db_trio_percent_under_same_transformer = len(log_dist_for_3[log_dist_for_3 == 1])/len(log_dist_for_3)

db_pair_percent_under_same_transformer, db_trio_percent_under_same_transformer

In [79]:
#Agglom Logical Grid 

#we have to wait for agglom to have core_id's in order to calculate logical grid dist 
#this needs to happen either by merging dataframes or by adding them in spark
#once you have the core_id's you should be able to easily copy and paste the code for the DBSCANs

In [61]:
#STDBSCAN Logical Grid 

# Explicit copies: the distance columns added below were previously assigned onto filtered
# slices of `stdb`, which is chained assignment and triggers pandas' SettingWithCopyWarning.
two_ids = stdb[stdb['cluster_size'] == 2].copy()
three_ids = stdb[stdb['cluster_size'] == 3].copy()
pair_logical_dist=[]
trio_logical_dist_1=[]
trio_logical_dist_2=[]
trio_logical_dist_3=[]

# Pairs: look up the logical grid distance between the two ids of every cluster.
# The lookup is ordered (level_0 == first id, level_1 == second id); .values[0] raises
# IndexError when `logical` has no row for that ordered pair.
for ids in two_ids['ids']:
    match = logical[(logical['level_0'] == ids[0]) & (logical['level_1'] == ids[1])]
    pair_logical_dist.append(match['logical_grid_distance'].values[0])

# Trios: the same lookup for each edge of the triangle (1->2, 2->3, 3->1).
trio_edges = (
    (0, 1, trio_logical_dist_1),
    (1, 2, trio_logical_dist_2),
    (2, 0, trio_logical_dist_3),
)
for ids in three_ids['ids']:
    for start, end, edge_distances in trio_edges:
        match = logical[(logical['level_0'] == ids[start]) & (logical['level_1'] == ids[end])]
        edge_distances.append(match['logical_grid_distance'].values[0])

two_ids['logical_distance']= pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3
stdb_logical_pairs = two_ids 
stdb_logical_trios = three_ids


#calculate the fraction (0-1) of outage pairs that are under the same transformer (logical distance == 1) 
stdb_pair_percent_under_same_transformer = len(stdb_logical_pairs[stdb_logical_pairs['logical_distance'] ==1])/len(stdb_logical_pairs)

#calculate the fraction of outage trio edges that are under the same transformer (all three edges pooled) 
log_dist_for_3 = list(stdb_logical_trios['log_dist_1']) + list(stdb_logical_trios['log_dist_2']) + list(stdb_logical_trios['log_dist_3'])
log_dist_for_3 = pd.Series(log_dist_for_3)
stdb_trio_percent_under_same_transformer = len(log_dist_for_3[log_dist_for_3 == 1])/len(log_dist_for_3)

stdb_pair_percent_under_same_transformer, stdb_trio_percent_under_same_transformer

# Plot 3: Outage Size v. time variance 

In [82]:
# Mean 'time_range' of the DBSCAN outages for every cluster size, plotted as a line
db_size_time = (
    db.groupby('cluster_size')['time_range']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=db_size_time, x='cluster_size', y='time_range')
plt.title('DBSCAN: Outage Size v. Average Time Range of Outage')

In [80]:
# Mean 'range' of the agglomerative outages for every cluster size, plotted as a line
agglom_size_time = (
    agglom.groupby('cluster_size')['range']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=agglom_size_time, x='cluster_size', y='range')
plt.title('Agglomerative: Outage Size v. Average Time Range of Outage')

In [83]:
# Mean 'time_range' of the ST-DBSCAN outages for every cluster size, plotted as a line
stdb_size_time = (
    stdb.groupby('cluster_size')['time_range']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=stdb_size_time, x='cluster_size', y='time_range')
plt.title('STDBSCAN: Outage Size v. Average Time Range of Outage')

# Plot 4: Percent in Convex Hull 

In [91]:
# Mean 'percent_pow_within_outage' of the DBSCAN outages for every cluster size, plotted as a line
db_convex_hull = (
    db.groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=db_convex_hull, x='cluster_size', y='percent_pow_within_outage')
plt.title('DBSCAN: Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

In [92]:
# Mean 'percent_pow_within_outage' of the agglomerative outages for every cluster size, plotted as a line
agglom_convex_hull = (
    agglom.groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=agglom_convex_hull, x='cluster_size', y='percent_pow_within_outage')
plt.title('Agglomerative: Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

In [93]:
# Mean 'percent_pow_within_outage' of the ST-DBSCAN outages for every cluster size, plotted as a line
stdb_convex_hull = (
    stdb.groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .to_frame()
    .reset_index()
)
sns.lineplot(data=stdb_convex_hull, x='cluster_size', y='percent_pow_within_outage')
plt.title('STDBSCAN: Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

# Plot 5: SAIFI Calculations 

In [None]:
#SAIFI is currently being calculated over the entire time period. Make sure to split the data into July, Aug, Sept 

In [103]:
#DBSCAN 
# Numerator: total of all DBSCAN cluster sizes.
# Denominator: (number of outages in db) x (number of distinct core_id values in pw).
# NOTE(review): SAIFI is usually total customer interruptions / total customers served; the extra
# factor len(db) makes this the mean fraction of sensors affected per outage -- confirm that is intended.
db_SAIFI_num = sum(db['cluster_size'].values)
db_SAIFI_denom = len(db) * len(pw['core_id'].unique())
db_SAIFI = db_SAIFI_num / db_SAIFI_denom
db_SAIFI

In [104]:
#Agglomerative 
# Numerator: total of all agglomerative cluster sizes.
# Denominator: (number of outages in agglom) x (number of distinct core_id values in pw).
# NOTE(review): SAIFI is usually total customer interruptions / total customers served; the extra
# factor len(agglom) makes this the mean fraction of sensors affected per outage -- confirm that is intended.
agglom_SAIFI_num = sum(agglom['cluster_size'].values)
agglom_SAIFI_denom = len(agglom) * len(pw['core_id'].unique())
agglom_SAIFI = agglom_SAIFI_num / agglom_SAIFI_denom
agglom_SAIFI

In [105]:
#STDBSCAN
# Numerator: total of all ST-DBSCAN cluster sizes.
# Denominator: (number of outages in stdb) x (number of distinct core_id values in pw).
# NOTE(review): SAIFI is usually total customer interruptions / total customers served; the extra
# factor len(stdb) makes this the mean fraction of sensors affected per outage -- confirm that is intended.
stdb_SAIFI_num = sum(stdb['cluster_size'].values)
stdb_SAIFI_denom = len(stdb) * len(pw['core_id'].unique())
stdb_SAIFI = stdb_SAIFI_num / stdb_SAIFI_denom
stdb_SAIFI