# Figures for IPSN 

Structure of the notebook: 

1) Clustering and analysis for DB, Agglom, STDB which are ultimately stored in the dataframes: *db, agglom, stdb*

2) Plots that Josh and Noah requested


Disclaimers:
- I plotted each clustering method separately instead of all compiled on one graph 
- The agglomerative logical distance hasn't been computed because the agglomerative CSV I pulled doesn't contain core_ids, and I didn't want to modify it 
- Be careful about the order in which you run the cells 
    - (the code is super repetitive since I pulled from a bunch of different notebooks, but I did my best to make it user friendly)
- Plots 3 and 4 are not very smooth... hopefully it will improve with more data, and if not maybe Matt can help?
    
Let me know if you run into any problems! - Emily 

In [1]:
#pip install all of these libraries before beginning 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import time 
import datetime
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics 
import re
import pylab
from scipy.stats import norm
import geopandas as gpd 
import shapely.geometry as geometry
import shapely.ops as ops 
from functools import partial 
import pyproj
import pyarrow
from statistics import mode, mean
from pyspark.sql import SparkSession



# Helper Functions for all Clustering Algorithms

In [2]:
#spark = SparkSession.builder.getOrCreate()

In [3]:
#these are the transition points of the outages from July 2018 pulled from outage_aggregator.py
def read_transition_data():
    outages = pd.read_parquet('../analysis-figures/clustering-analysis/all_transitions.gz.parquet', engine='pyarrow')
    out2 = outages.groupby(['core_id','outage_time'])['powered_sensors'].apply(lambda x: x.tolist()).values
    outages = outages.groupby(['core_id','outage_time']).first().reset_index()
    outages['powered_sensors'] = out2
    return outages

In [4]:
#logical grid distance csv 
def read_logical_grid():
    logical = pd.read_csv('../grid_distance.csv')
    logical = logical.reset_index()
    return logical

In [5]:
# Exctract data from the dbscan result and do some bookkeeping on other columns
def prep_cluster_data(month_out, time_label):
    month_clust = pd.DataFrame(month_out.groupby(['labels'])[time_label].apply(lambda x: x.tolist()))
    month_clust['latitude'] = month_out.groupby(['labels'])['location_latitude'].apply(lambda x: x.tolist()).values
    month_clust['longitude'] = month_out.groupby(['labels'])['location_longitude'].apply(lambda x: x.tolist()).values
    month_clust['powered_list'] = month_out.groupby(['labels'])['powered_sensors'].apply(lambda x: x.tolist()).values
    month_clust['powered_list'] = month_clust['powered_list'].apply(lambda x: [item for sublist in x for item in sublist]).values
    month_clust['powered_list'] = month_clust['powered_list'].apply(lambda x: list({v['core_id']:v for v in x}.values())).values
    month_clust['num_powered'] = month_clust['powered_list'].apply(lambda x: len(x)).values
    month_clust['sensors_reporting'] = month_out.groupby(['labels'])['sensors_reporting'].apply(lambda x: x.tolist()).values
    month_clust['sensors_reporting'] = month_clust['sensors_reporting'].apply(lambda x: int(mean(x))).values 
    month_clust['ids'] = month_out.groupby(['labels'])['core_id'].apply(lambda x: x.tolist()).values
    if time_label == 'outage_time':
        other_time = 'restore_time'
    else:
        other_time = 'outage_time'
    month_clust[other_time] = month_out.groupby(['labels'])[other_time].apply(lambda x: x.tolist()).values
    month_clust = month_clust.iloc[1:]

    def find_range(lst):
        return max(lst) - min(lst)

    month_clust['time_range'] = (np.vectorize(find_range)(month_clust[time_label]))
    month_clust['lat_range'] = (np.vectorize(find_range)(month_clust['latitude']))
    month_clust['long_range'] = (np.vectorize(find_range)(month_clust['longitude']))

    month_clust['cluster_size'] = month_out.groupby(['labels'])[time_label].count()
    month_clust['min_time'] = month_clust[time_label].apply(lambda x: min(x))
    month_clust['max_time'] = month_clust[time_label].apply(lambda x: max(x))

    def remove_overlapping_ids(series):
        indexes_to_remove = []
        for index, sensor in enumerate(series[3]):
            if sensor['core_id'] in series[6]:
                indexes_to_remove.append(index)

        for i in sorted(indexes_to_remove, reverse=True):
            del series[3][i]

        return series[3]

    month_clust['powered_list'] = month_clust.apply(remove_overlapping_ids,axis=1).values
    month_clust['num_powered'] = month_clust['powered_list'].apply(lambda x: len(x)).values
    month_clust['within_time_lat'] = month_clust['powered_list'].apply(lambda x: list(v['location_latitude'] for v in x)).values
    month_clust['within_time_long'] = month_clust['powered_list'].apply(lambda x: list(v['location_longitude'] for v in x)).values
        
    return month_clust

In [6]:
def run_convex_hull(month_clust, time_label):

    # Prep the dataframe for CONVEX HULL Calculations
    #for clusters of 2, duplicate the lat and long points so that these points can also be converted into a Polygon by geopandas 
    update_lat = month_clust['latitude'].copy()
    update_long = month_clust['longitude'].copy()
    update_within_lat = month_clust['within_time_lat'].copy()
    update_within_long = month_clust['within_time_long'].copy()

    for i in range(len(month_clust)): 
        if len(month_clust.iloc[i]['within_time_lat']) == 0: 
            update_within_lat.values[i] = [0, 1, 2]
            update_within_long.values[i] = [0, 1, 2] 
        if len(update_within_lat.values[i]) < 3: 
            update_within_lat.values[i] = month_clust.iloc[i]['within_time_lat']*3
            update_within_long.values[i] = month_clust.iloc[i]['within_time_long']*3        
        if month_clust.iloc[i]['cluster_size'] < 3: 
            update_lat.values[i] = month_clust.iloc[i]['latitude']*2
            update_long.values[i] = month_clust.iloc[i]['longitude']*2

    month_clust['latitude'] = update_lat
    month_clust['longitude'] = update_long
    month_clust['within_time_long'] = update_within_long
    month_clust['within_time_lat'] = update_within_lat

    #create geodataframes to calculate convex hull 
    power = month_clust.copy()
    out = month_clust.copy()
    powered_poly = []
    outage_poly = []
    for i in range(len(month_clust)):
        a = month_clust.iloc[i, :]['within_time_long']
        b = month_clust.iloc[i, :]['within_time_lat']
        c = month_clust.iloc[i, :]['longitude']
        d = month_clust.iloc[i, :]['latitude']
        powered_poly.append(list(zip(a, b)))
        outage_poly.append(list(zip(c, d)))

    def unique_coords(coords):
        return pd.Series(coords).unique()

    power['powered_poly'] = powered_poly
    out['powered_poly'] = powered_poly
    month_clust['powered_poly'] = powered_poly
    out['outage_poly'] = outage_poly
    power['outage_poly'] = outage_poly
    month_clust['outage_poly'] = outage_poly
    crs = {'init', 'epsg:4326'}

    powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
    power = gpd.GeoDataFrame(power, crs=crs, geometry=(powered_poly))

    outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
    out= gpd.GeoDataFrame(out, crs=crs, geometry=(outage_poly))


    power['powered_poly'] = (np.vectorize(unique_coords)(power['powered_poly']))
    out['powered_poly'] = (np.vectorize(unique_coords)(out['powered_poly']))
    month_clust['powered_poly'] = (np.vectorize(unique_coords)(month_clust['powered_poly']))
    out['outage_poly'] = (np.vectorize(unique_coords)(out['outage_poly']))
    power['outage_poly'] = (np.vectorize(unique_coords)(power['outage_poly']))
    month_clust['outage_poly'] = (np.vectorize(unique_coords)(month_clust['outage_poly']))

    power['convex_area_powered'] = power.convex_hull
    out['convex_area_outage'] = out.convex_hull

    #calculate the convex hull 
    def in_convex_hull(powered_coords, geom):
    #takes in lat/long pairs in powered_coords, and a Polygon to chekc if the point is within the convex hull of the Polygon 
        in_convex_hull = []
        for i in powered_coords: 
            if geom.convex_hull.contains(geometry.Point(i)):
                in_convex_hull.append(i)
        in_convex_hull = pd.Series(in_convex_hull).unique() 
        return in_convex_hull

    in_convex_hull = [in_convex_hull(out['powered_poly'].values[i], out['geometry'].values[i]) for i in range(len(out))]
    out['powered_within_outage'] = in_convex_hull

    def outage_size(outage_coords): 
        return len(pd.Series(outage_coords).unique())

    out['powered_size_within_outage_area'] = (np.vectorize(outage_size)(out['powered_within_outage']))
    out['percent_pow_within_outage'] = (out['powered_size_within_outage_area'] / (out['powered_size_within_outage_area'] + out['cluster_size']))*100

    if time_label == 'outage_time':
        other_time = 'restore_time'
    else:
        other_time = 'outage_time'
    
    db = out[[time_label,'ids','sensors_reporting','cluster_size','powered_size_within_outage_area','percent_pow_within_outage', other_time]]
    return db

In [7]:
#SAIDI calculation functions 

#first read the lookup table for identifying restore labels 
lookup = pd.read_parquet('outage_lookup.gz.parquet')
lookup['test_time'] = lookup['time'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

uplook = pd.DataFrame()
for i in lookup['core_id'].unique(): 
    relevant = lookup[lookup['core_id'] == i]
    relevant = relevant.sort_values('outage_time')
    relevant['trans_#'] = range(len(relevant))
    uplook = uplook.append(relevant)
lookup = uplook
lookup.head()


#ignoring restore clusters 
def ignore_restore(df):
    df['avg_out'] = df['outage_time'].apply(np.mean)
    df['avg_restore'] = df['restore_time'].apply(np.mean)
    df['std_out'] = df['outage_time'].apply(np.std)
    df['std_restore'] = df['restore_time'].apply(np.std)
    df['clust_saidi'] = (df['avg_restore'] - df['avg_out'])*df['cluster_size']
    no_rest_SAIDI = np.sum(db_out['clust_saidi'])/mean(db_out['sensors_reporting'])
    print(f'SAIDI when we ignore restore clustering: {no_rest_SAIDI}')
    return df


#no noise 
def restore_clusters_exploded(db_out, db_restore, lookup, out_or_restore, other):   
    r_times = pd.DataFrame(db_restore.explode('restore_time')['restore_time'])
    r_ids = pd.DataFrame(db_restore.explode('ids')['ids'])
    o_times = pd.DataFrame(db_out.explode('outage_time')['outage_time'])
    o_ids = pd.DataFrame(db_out.explode('ids')['ids'])
    restore = r_times.combine_first(r_ids).reset_index()
    out = o_times.combine_first(o_ids).reset_index()
    restore_labels = []
    lat = []
    long = []
    multi = 0 
    for i in range(len(out)):
        time, eyed = out.iloc[i][out_or_restore], out.iloc[i]['ids']
        lookedup = lookup[(lookup[out_or_restore] == time) & (lookup['core_id'] == eyed) & (lookup['is_powered'] == False)]
        lat.append(lookedup['location_latitude'].values[0])
        long.append(lookedup['location_longitude'].values[0])
        if len(lookedup) == 1: 
            rest_time = lookedup[other].values[0]
            row = restore[(restore[other] == rest_time) & (restore['ids'] == eyed)]
            if len(row) == 1:
                restore_labels.append(row['labels'].values[0])
            else: 
                restore_labels.append(-1)
        elif len(lookedup) > 1: 
            multi += 1 
            rest_time = lookedup[other].values[0]
            row = restore[(restore[other] == rest_time) & (restore['ids'] == eyed)]
            if len(row) == 1:
                restore_labels.append(row['labels'].values[0])
            else: 
                restore_labels.append(-1)
        else: 
            print('problem: outage time not in the lookup table')
    out['restore_time'] = db_out.explode('restore_time')['restore_time'].values
    out['restore_labels'] = restore_labels
    out['latitude'] = lat 
    out['longitude'] = long
    out['sensors_reporting'] = db_out.explode('outage_time')['sensors_reporting'].values
    return db_w_labels(out, 'restore_labels') 


def db_w_labels(df, o_r_labels):
    percentage = []
    noise = []
    out_size = []
    for i in df['labels'].unique(): 
        a_df = df[df['labels'] == i]
        per = [len(a_df[o_r_labels].unique())]*len(a_df)
        n = [(len(a_df[a_df[o_r_labels] == -1]))/(len(a_df))]*len(a_df)
        o = [len(a_df)]*len(a_df)
        percentage.append(per)
        noise.append(n)
        out_size.append(o)
    percentage = [item for sublist in percentage for item in sublist]
    noise = [item for sublist in noise for item in sublist]
    out_size = [item for sublist in out_size for item in sublist]
    df['restore_groups'] = percentage 
    df['%_noise'] = noise
    df['out_size'] = out_size
    rest_count = pd.DataFrame(df.groupby(o_r_labels).count()['labels'])
    df = df.join(rest_count, on=o_r_labels, rsuffix='_').rename(columns={'labels_' : 'rest_size'})
    avg_restore_time = pd.DataFrame(df.groupby('restore_labels')['restore_time'].apply(np.mean))
    df = df.join(avg_restore_time, on='restore_labels', rsuffix='_').rename(columns={'restore_time_' : 'avg_rest_time'})
    return n_noise_saidi(df)


def avg_time_std_dur(df):
    mean_outage = db_out_clusters.groupby('labels')['outage_time'].apply(mean)
    mean_restore = db_out_clusters.groupby('labels')['restore_time'].apply(mean)
    std_outage = db_out_clusters.groupby('labels')['outage_time'].apply(np.std)
    std_restore = db_out_clusters.groupby('labels')['restore_time'].apply(np.std)
    num_sens = df.groupby('labels').count()['ids']
    
    df = df.join(mean_outage, on='labels', rsuffix='_').rename(columns={'outage_time_': 'mean_outage_time'})
    df = df.join(mean_restore, on='labels', rsuffix='_').rename(columns={'restore_time_': 'mean_restore_time'})
    df = df.join(std_outage, on='labels', rsuffix='__').rename(columns={'outage_time__': 'outage_time_stdev'})
    df = df.join(std_restore, on='labels', rsuffix='__').rename(columns={'restore_time__': 'restore_time_stdev'})
    df = df.join(num_sens, on='labels', rsuffix='__').rename(columns={'ids__': 'num_sens_out'})
    df['clust_saidi'] = (df['mean_restore_time'] - df['mean_outage_time'])*df['num_sens_out']
    return df 


def n_noise_saidi(df):
    no_noise_duration = avg_time_std_dur(df[df['restore_labels'] != -1])
    noise_duration = avg_time_std_dur(df[df['restore_labels'] == -1])
    no_noise_SAIDI = np.sum(no_noise_duration.groupby('labels').mean()['clust_saidi'])/ np.mean(no_noise_duration.groupby('labels').mean()['sensors_reporting'])
    noise_SAIDI = np.sum(noise_duration.groupby('labels').mean()['clust_saidi'])/ np.mean(noise_duration.groupby('labels').mean()['sensors_reporting'])
    print(f'SAIDI when we eliminate all noise: {noise_SAIDI}')
    print(f'SAIDI when we only include noise points: {no_noise_SAIDI}')
    return df


#noise clustered with nearest time 
def closest_time(df):
    noise = df[df['restore_labels'] == -1]
    nearest = []
    for i in noise['restore_time'].values:
        nearest_label = pd.DataFrame(abs(avg_restore_time['restore_time'] - i)).sort_values('restore_time').index[0]
        nearest.append(nearest_label)
    noise['nearest_time_labels'] = nearest
    noise = pd.DataFrame(noise['nearest_time_labels'])
    updated_df = df.join(noise)
    updated_df['restore_labels'] = updated_df['nearest_time_labels'].combine_first(df['restore_labels'])
    avg_restore_times = pd.DataFrame(updated_df.groupby('restore_labels')['restore_time'].apply(np.mean))
    closest_time_noise = updated_df.join(avg_restore_times, on='restore_labels', rsuffix='_').rename(columns={'restore_time_' : 'avg_rest_times'})
    return avg_time_std_dur_noise(closest_time_noise)


def avg_time_std_dur_noise(df):
    mean_outage = df.groupby('labels')['outage_time'].apply(mean)
    mean_restore = df.groupby('labels')['avg_rest_times'].apply(mean)
    std_outage = df.groupby('labels')['outage_time'].apply(np.std)
    std_restore = df.groupby('labels')['avg_rest_times'].apply(np.std)
    
    df = df.join(mean_outage, on='labels', rsuffix='_').rename(columns={'outage_time_': 'mean_outage_time'})
    df = df.join(mean_restore, on='labels', rsuffix='_').rename(columns={'avg_rest_times_': 'mean_restore_time'})
    df = df.join(std_outage, on='labels', rsuffix='__').rename(columns={'outage_time__': 'outage_time_stdev'})
    df = df.join(std_restore, on='labels', rsuffix='__').rename(columns={'avg_rest_times__': 'restore_time_stdev'})
    df['clust_saidi'] = (df['mean_restore_time'] - df['mean_outage_time'])*df['out_size']
    time_noise_SAIDI = np.sum(time_noise_duration.groupby('labels').mean()['clust_saidi'])/ np.mean(time_noise_duration.groupby('labels').mean()['sensors_reporting'])
    print(f'SAIDI when noise restores are clustered with the nearest restore in time: {time_noise_SAIDI}')
    return df 


#noise clustered with nearest distance in space 

def closest_dist(df):
    noise = df[df['restore_labels'] == -1]
    nearest = []
    for n in range(len(noise)):
        lat = noise.iloc[n]['latitude']
        long = noise.iloc[n]['longitude']
        a_time = noise.iloc[n]['restore_time']
        calc_dist = pd.DataFrame(np.sqrt((df['latitude'] - lat)**2 + (df['longitude'] - long)**2)) 
        calc_dist['labels'] = db_out_clusters['labels']
        nearest_labels = calc_dist[calc_dist[0] == calc_dist.sort_values(0)[0].values]['labels']        
        nearest_dist_times = avg_restore_time[avg_restore_time.index.isin(nearest_labels)]
        nearest_label = pd.DataFrame(abs(nearest_dist_times['restore_time'] - a_time)).sort_values('restore_time').index[0]
        nearest.append(nearest_label)
    noise['nearest_dist_labels'] = nearest
    noise = pd.DataFrame(noise['nearest_dist_labels'])
    updated_df = df.join(noise)
    updated_df['restore_labels'] = updated_df['nearest_dist_labels'].combine_first(df['restore_labels'])
    avg_restore_times = pd.DataFrame(updated_df.groupby('restore_labels')['restore_time'].apply(np.mean))
    updated_df = updated_df.join(avg_restore_times, on='restore_labels', rsuffix='_').rename(columns={'restore_time_' : 'avg_rest_times'})
    dist_noise_duration = avg_time_std_dur_noise(updated_df)
    dist_noise_SAIDI = np.sum(dist_noise_duration.groupby('labels').mean()['clust_saidi'])/ np.mean(time_noise_duration.groupby('labels').mean()['sensors_reporting'])
    #note: this also prints out the nearest time SAIDI too 
    print(f'SAIDI when we place noise in the restore cluster with the closest distance and time:{dist_noise_SAIDI}')
    return updated_df


# DBSCAN(time only): Clustering and Analysis 

In [8]:
#DBSCAN(time) on a month's worth of data 
def run_dbscan(outages, time_label):
    month_out = outages
    month_out['z'] = 0
    
    # Prep the DBSCAN Columns - no need to normalization, eps in seconds
    X=month_out[[time_label, 'z']]
    out_cluster = X.values
    
    # Actually run dbscan
    db = DBSCAN(eps=90, algorithm='ball_tree', min_samples=2).fit(out_cluster)
    labels = db.labels_
    no_noise = list(labels).count(-1)
    no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    
    month_out['labels'] = labels
    month_out['core_id'] = outages['core_id']
    
    return month_out


In [9]:
def cluster_db(time_label):
    data = read_transition_data()
    clustered_outages = run_dbscan(data, time_label)
    dbscan_formatted_clusters = prep_cluster_data(clustered_outages, time_label)
    dbscan_outages = run_convex_hull(dbscan_formatted_clusters, time_label)
    return dbscan_outages
    
db_out = cluster_db('outage_time')
db_restore = cluster_db('restore_time')
ignore_restore_clust = ignore_restore(db_out)
db_clusters = restore_clusters_exploded(db_out, db_restore, lookup, 'outage_time', 'restore_time')
avg_restore_time = pd.DataFrame(db_out_clusters.groupby('restore_labels')['restore_time'].apply(np.mean))
db_nearest_time = closest_time(db_out_clusters)
closest_dist_noise = closest_dist(db_out_clusters)
closest_dist_noise
db_nearest_time
db_clusters

In [11]:
restore_clusters_exploded(db_out, db_restore, lookup, 'outage_time', 'restore_time')

In [None]:
#percentage of noise (outage clusters that are not clustered in restore clusters)
len(db_out_clusters[db_out_clusters['restore_labels'] == -1])/len(db_out_clusters)

In [None]:
plt.figure(figsize=(10, 10))
db_out_clust_grouped = db_out_clusters.groupby('labels').first()
sns.scatterplot(x="out_size", y="%_noise", data=db_out_clust_grouped, label='Outage Clusters')
plt.title('Outage Cluster Size vs. Percent Noise in Restore Cluster')
plt.grid()
plt.legend()

In [None]:
plt.hist(db_out_clust_grouped['restore_groups'])
plt.title('The number of sub-groups restored in an outage')

In [None]:
#now let's find out the percentage of noise that is contained in restore clusters
rest_group_clust = db_out_clusters[db_out_clusters['restore_labels'] != -1]
rest_group_clust = rest_group_clust.groupby('restore_labels').first()
db_restores = db_restore.join(rest_group_clust['rest_size'], on='labels', rsuffix='_').fillna(0)
#rest_size here is the number of restores that are classified also as outages 
db_restores['%_noise'] = (1- db_restores['rest_size']/db_restores['cluster_size'])*100
np.mean(db_restores['%_noise'])
# this is the percentage of sensors that are in a restore cluster, but are not contained in an outage cluster ^ 

In [None]:
plt.figure(figsize=(10, 10))
sns.scatterplot(x="rest_size", y="%_noise", data=db_restores, label='Restore Clusters')
plt.title('Restore Cluster Size vs. Percent Noise in Restore Cluster')
plt.legend()

In [None]:
plt.hist(db_out_clust_grouped['%_noise']*100, label='outage_clusters', histtype='step')
plt.hist(db_restores['%_noise'], label='restore_clusters', histtype='step')
plt.title('Histogram of %_noise in a cluster')
plt.legend()

# Agglomerative: Cleaning and Analysis 

In [None]:
def read_agglomerative_cluster_data():
    #clusters = pd.read_parquet('../analysis-figures/clustering-analysis/agglomerative_clusters.gz.parquet',engine='pyarrow')
    clusters = spark.read.parquet('../analysis-figures/clustering-analysis/agglomerative_clusters.gz.parquet')
    return clusters.toPandas()

In [None]:
def agglomerative_cluster_prep(month_out):
    month_clust = month_out
    month_clust['latitude'] = month_clust['location'].apply(lambda x: list(l['location_latitude'] for l in x)).values
    month_clust['longitude'] = month_clust['location'].apply(lambda x: list(l['location_longitude'] for l in x)).values
    month_clust['ids'] = month_clust['core_id']
    month_clust['outage_time'] = month_clust['outage_times']
    
    def tuple_to_dict(power_list):
        l = []
        
        if power_list is None:
            return l
        
        for x in power_list:
            l.append({'core_id':x[0],'location_latitude':x[1],'location_longitude':x[2]})
        
        return l
    
    
    month_clust['powered_list'] = month_clust['powered_sensors'].apply(tuple_to_dict).values
    month_clust['num_powered'] = month_clust['powered_list'].apply(lambda x: len(x)).values
    month_clust = month_clust.iloc[1:]

    def find_range(lst):
        return max(lst) - min(lst)

    month_clust['time_range'] = (np.vectorize(find_range)(month_clust['outage_time']))
    month_clust['lat_range'] = (np.vectorize(find_range)(month_clust['latitude']))
    month_clust['long_range'] = (np.vectorize(find_range)(month_clust['longitude']))

    month_clust['min_time'] = month_clust['outage_time'].apply(lambda x: min(x))
    month_clust['max_time'] = month_clust['outage_time'].apply(lambda x: max(x))

    def remove_overlapping_ids(series):
        indexes_to_remove = []
        for index, sensor in enumerate(series[3]):
            if sensor['core_id'] in series[6]:
                indexes_to_remove.append(index)

        for i in sorted(indexes_to_remove, reverse=True):
            del series[3][i]

        return series[3]
    
    month_clust = month_clust[['outage_time','latitude','longitude','powered_list','num_powered','sensors_reporting','ids','time_range','lat_range','long_range','cluster_size','min_time','max_time']]

    month_clust['powered_list'] = month_clust.apply(remove_overlapping_ids,axis=1).values
    month_clust['num_powered'] = month_clust['powered_list'].apply(lambda x: len(x)).values
    month_clust['within_time_lat'] = month_clust['powered_list'].apply(lambda x: list(v['location_latitude'] for v in x)).values
    month_clust['within_time_long'] = month_clust['powered_list'].apply(lambda x: list(v['location_longitude'] for v in x)).values
    return month_clust

In [None]:
data = read_agglomerative_cluster_data()
formatted_clusters = agglomerative_cluster_prep(data)
agglomerative_outages = run_convex_hull(formatted_clusters)
agglomerative_outages.head()

# STDBSCAN: Clustering and Analysis 

In [None]:
#run this to initiate the STDBSCAN class 

class STDBSCAN(object):

    def __init__(self, col_lat, col_lon, col_time, spatial_threshold=500.0, 
                 temporal_threshold=60.0, min_neighbors=15):
        """
        Python st-dbscan implementation.
        :param col_lat: Latitude column name;
        :param col_lon:  Longitude column name;
        :param col_time: Date time column name;
        :param spatial_threshold: Maximum geographical coordinate (spatial)
             distance value (meters);
        :param temporal_threshold: Maximum non-spatial distance value (seconds);
        :param min_neighbors: Minimum number of points within Eps1 and Eps2
             distance;
        """
        self.col_lat = col_lat
        self.col_lon = col_lon
        self.col_time = col_time
        self.spatial_threshold = spatial_threshold
        self.temporal_threshold = temporal_threshold
        self.min_neighbors = min_neighbors

    def projection(self, df, p1_str='epsg:4326', p2_str='epsg:3395'):
        """
        Cython wrapper to converts from geographic (longitude,latitude)
        to native map projection (x,y) coordinates. It needs to select the
        right epsg. Values of x and y are given in meters
        """
        p1 = pyproj.Proj(init=p1_str)
        p2 = pyproj.Proj(init=p2_str)
        lon = df[self.col_lon].values
        lat = df[self.col_lat].values
        x1, y1 = p1(lon, lat)
        x2, y2 = pyproj.transform(p1, p2, x1, y1, radians=True)
        df[self.col_lon] = x2
        df[self.col_lat] = y2

        print(df)
        return df

    def _retrieve_neighbors(self, index_center, matrix):

        center_point = matrix[index_center, :]

        # filter by time
        min_time = center_point[2] - timedelta(seconds=self.temporal_threshold)
        max_time = center_point[2] + timedelta(seconds=self.temporal_threshold)
        matrix = matrix[(matrix[:, 2] >= min_time) &
                        (matrix[:, 2] <= max_time), :]
        # filter by distance
        tmp = (matrix[:, 0]-center_point[0])*(matrix[:, 0]-center_point[0]) + \
            (matrix[:, 1]-center_point[1])*(matrix[:, 1]-center_point[1])
        neigborhood = matrix[tmp <= (
            self.spatial_threshold*self.spatial_threshold), 4].tolist()
        neigborhood.remove(index_center)

        return neigborhood

    def run(self, df):
        """
        INPUTS:
            df={o1,o2,...,on} Set of objects;
        OUTPUT:
            C = {c1,c2,...,ck} Set of clusters
        """
        cluster_label = 0
        noise = -1
        unmarked = 777777
        stack = []

        # initial setup
        df = df[[self.col_lon, self.col_lat, self.col_time]]
        df = df.assign(cluster=unmarked)
        df['index'] = range(df.shape[0])
        matrix = df.values
        df.drop(['index'], inplace=True, axis=1)

        # for each point in database
        for index in range(matrix.shape[0]):
            if matrix[index, 3] == unmarked:
                neighborhood = self._retrieve_neighbors(index, matrix)

                if len(neighborhood) < self.min_neighbors:
                    matrix[index, 3] = noise
                else:  # found a core point
                    cluster_label += 1
                    # assign a label to core point
                    matrix[index, 3] = cluster_label

                    # assign core's label to its neighborhood
                    for neig_index in neighborhood:
                        matrix[neig_index, 3] = cluster_label
                        stack.append(neig_index)  # append neighbors to stack

                    # find new neighbors from core point neighborhood
                    while len(stack) > 0:
                        current_point_index = stack.pop()
                        new_neighborhood = \
                            self._retrieve_neighbors(current_point_index,
                                                     matrix)

                        # current_point is a new core
                        if len(new_neighborhood) >= self.min_neighbors:
                            for neig_index in new_neighborhood:
                                neig_cluster = matrix[neig_index, 3]
                                if any([neig_cluster == noise,
                                        neig_cluster == unmarked]):
                                    matrix[neig_index, 3] = cluster_label
                                    stack.append(neig_index)

        df['labels'] = matrix[:, 3]
        return df

In [None]:
#this is where you actually adjust the parameters 

def test_time(df):
    '''
    transfrom the lon and lat to x and y
    need to select the right epsg
    I don't the true epsg of sample, but get the same result by using 
    epsg:4326 and epsg:32635
    '''
    st_dbscan = STDBSCAN(col_lat='location_latitude', col_lon='location_longitude',
                         col_time='time', spatial_threshold=0.03,
                         temporal_threshold=90, min_neighbors=1)
    #df = st_dbscan.projection(df, p1_str='epsg:4326', p2_str='epsg:32630')
    return st_dbscan.run(df)


In [None]:
def run_stdbscan(outages):
    month_out = pd.DataFrame(test_time(outages))
    not_noise = month_out['labels'] != -1
    month_out = month_out[not_noise]
    month_out['core_id'] = outages['core_id']
    month_out['sensors_reporting'] = outages['sensors_reporting']
    month_out['powered_sensors'] = outages['powered_sensors']
    month_out['outage_time'] = month_out['time'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))
    return month_out
    

In [None]:
data = read_transition_data()
clustered_outages = run_stdbscan(data)
formatted_clusters = prep_cluster_data(clustered_outages)
stdbscan_outages = run_convex_hull(formatted_clusters)
stdbscan_outages.head()

# Plot 1: Trimodal Distribution 

In [141]:
#DBSCAN
db_clust_sizes = pd.DataFrame(dbscan_outages.groupby('cluster_size').count()).reset_index()
db_clust_sizes = db_clust_sizes[['cluster_size','ids']]
db_clust_sizes.to_csv('../analysis-figures/clustering-analysis/db_cluster_counts.csv')
sns.barplot(x='cluster_size', y='ids', data=db_clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('DBSCAN: Cluster Size vs. Number of Clusters of this Size')
db_clust_sizes.head()

In [53]:
#AGGLOM
agglom_clust_sizes = pd.DataFrame(agglomerative_outages.groupby('cluster_size').count()).reset_index()
agglom_clust_sizes = agglom_clust_sizes[['cluster_size','ids']]
agglom_clust_sizes.to_csv('../analysis-figures/clustering-analysis/agglom_cluster_counts.csv')
sns.barplot(x='cluster_size', y='ids', data=agglom_clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('Agglomerative: Cluster Size vs. Number of Clusters of this Size')

In [143]:
# STDBSCAN: count the non-null 'ids' entries at each cluster size, save, and plot.
stdb_clust_sizes = (
    stdbscan_outages
    .groupby('cluster_size')['ids']
    .count()
    .reset_index()[['cluster_size', 'ids']]
)
stdb_clust_sizes.to_csv('../analysis-figures/clustering-analysis/stdb_cluster_counts.csv')
sns.barplot(data=stdb_clust_sizes, x='cluster_size', y='ids')
plt.title('STDBSCAN: Cluster Size vs. Number of Clusters of this Size')
plt.xlabel('Cluster Size')
plt.ylabel('Number of Clusters')

# Plot 2a: Low Voltage Success (Euclidean)

In [39]:
def calc_dist(dist1, dist2):
    """Return the distance in meters between two geometric points.

    Both points are reprojected from 'epsg:4326' to the 'aea' projection
    before the distance between them is measured.
    """
    # Same projection for both inputs, so build the callable once.
    reproject = partial(
        pyproj.transform,
        pyproj.Proj(init='epsg:4326'),
        pyproj.Proj(proj='aea'))
    first_projected = ops.transform(reproject, dist1)
    second_projected = ops.transform(reproject, dist2)
    return first_projected.distance(second_projected)

In [50]:
def pair_distances(outages):
    """Compute the distance between the two sensors of every two-sensor cluster.

    Parameters
    ----------
    outages : pandas.DataFrame
        One row per cluster, with a ``cluster_size`` column and an
        ``outage_time`` column holding a list of times. The per-sensor
        latitudes and longitudes are read positionally from columns 1 and 2
        (NOTE(review): the column names are not visible here -- confirm the
        column order against the function that builds this frame).

    Returns
    -------
    numpy.ndarray
        One ``calc_dist`` result per pair, ordered by the pair's mean outage
        time. Empty when there are no clusters of size 2.

    Notes
    -----
    Differences from the earlier version of this function:
    * points are built as (x=longitude, y=latitude), matching the other cells
      and the 'epsg:4326' input that ``calc_dist`` expects;
    * the CRS is 'epsg:4326' throughout (previously 'esri:4326' and 'aea');
    * the debugging ``pairs[0:5]`` truncation and ``print`` were removed, so
      every pair is now measured.
    """
    # Get only clusters of 2; copy so the columns added below do not write
    # onto a slice of the caller's frame.
    pairs = outages[outages['cluster_size'] == 2].copy()
    if pairs.empty:
        return np.array([])
    pairs['time'] = pairs['outage_time'].apply(lambda x: mean(x)).values

    def zip_locations(series):
        # Explicit positional access (.iloc) instead of the deprecated
        # integer-key fallback on a label-indexed row.
        lats = series.iloc[1]
        lons = series.iloc[2]
        return [{'latitude': lats[i], 'longitude': lons[i]} for i in range(len(lats))]

    # Combine locations into a single array, then explode to one row per sensor.
    pairs['location'] = pairs.apply(zip_locations, axis=1).values
    pairs = pairs.explode('location')
    pairs['latitude'] = pairs['location'].apply(lambda x: x['latitude']).values
    pairs['longitude'] = pairs['location'].apply(lambda x: x['longitude']).values

    wgs84 = {'init': 'epsg:4326'}
    gdf = gpd.GeoDataFrame(
        pairs, geometry=gpd.points_from_xy(pairs.longitude, pairs.latitude), crs=wgs84)
    # First and last sensor of each pair, keyed by the pair's mean time.
    points_by_time = gdf.groupby('time')['geometry']
    dist_1 = gpd.GeoSeries(points_by_time.first(), crs=wgs84)
    dist_2 = gpd.GeoSeries(points_by_time.last(), crs=wgs84)
    return np.vectorize(calc_dist)(dist_1, dist_2)
    

In [51]:
# pair_distances returns a NumPy array, which has no .head(); slice it to
# preview the first few distances instead.
pairs = pair_distances(dbscan_formatted_clusters)
pairs[:5]

In [67]:


# DBSCAN Euclidean: distances for clusters of 3.
# NOTE(review): `c`, `month_out` and `db_distances` are not assigned in this
# cell; among the cells visible here `c` is only set in the STDBSCAN cell and
# `db_distances` only exists as a local inside pair_distances -- confirm the
# intended run order / source of these names.
trio_index = c[c['time'] == 3].index
trios = month_out[month_out['labels'].isin(trio_index)]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude), crs={'init':'epsg:4326'})
# First, second (nth(1)) and last point of each cluster label.
dist_1_t = gpd.GeoSeries(gdf_trios.groupby('labels')['geometry'].first(), crs={'init':'aea'})
dist_2_t = gpd.GeoSeries(gdf_trios.groupby('labels')['geometry'].nth(1), crs={'init':'aea'})
dist_3_t = gpd.GeoSeries(gdf_trios.groupby('labels')['geometry'].last(), crs={'init':'aea'})

# One column per edge of the triangle formed by the three points.
db_trios_distances = pd.DataFrame(np.vectorize(calc_dist)(dist_1_t, dist_2_t)).rename(columns={0: '1->2'})
db_trios_distances['2->3'] = (np.vectorize(calc_dist)(dist_2_t, dist_3_t))
db_trios_distances['3->1'] = (np.vectorize(calc_dist)(dist_3_t, dist_1_t))




# Share of pair distances under 550 m (a fraction between 0 and 1, not x100).
db_pair_percent_under_550= len(db_distances[db_distances < 550])/len(db_distances)

# Share of trio distances under 550 m: all three edges of every trio are pooled.
db_dist_for_3 = list(db_trios_distances['1->2'].values) + list(db_trios_distances['2->3'].values) + list(db_trios_distances['3->1'].values)
db_dist_for_3 = np.array(db_dist_for_3)
db_trio_percent_under_550 = len(db_dist_for_3[db_dist_for_3 < 550])/len(db_dist_for_3)

db_pair_percent_under_550, db_trio_percent_under_550

In [None]:
# AGGLOMERATIVE EUCLIDEAN

# Build a frame with one latitude/longitude per sensor for clusters of up to 3
# sensors, so the distances between co-clustered sensors can be computed.
# NOTE(review): `spark_outages` is not defined in the cells visible here.
t = spark_outages[spark_outages['cluster_size'] <= 3]
explode_loc = t.explode('location')
explode_loc['location'] = explode_loc['location'].apply(lambda x: float(x))
# Coordinates are told apart by value: > 1 is taken as a latitude, < 1 as a
# longitude (a value of exactly 1 lands in neither frame).
lat = explode_loc[explode_loc['location'] > 1]
long = explode_loc[explode_loc['location'] < 1]
lat['latitude'] = lat['location']
t = lat[['outage_time', 'outage_times', 'cluster_size', 'latitude']]
# The longitude sign is flipped here, and the assignment aligns on the
# (repeated) exploded index. NOTE(review): confirm the sign flip is intended
# and that `lat` and `long` always have matching row order.
t['longitude'] = long['location']*(-1)

def calc_dist(dist1, dist2):
    """Return the distance in meters between two geometric points.

    Both points are reprojected from 'EPSG:4326' to the 'aea' projection
    before the distance between them is measured.
    """
    # Same projection for both inputs, so build the callable once.
    reproject = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(proj='aea'))
    first_projected = ops.transform(reproject, dist1)
    second_projected = ops.transform(reproject, dist2)
    return first_projected.distance(second_projected)

# Distances for sensors of cluster size 2: first vs. last point of each
# 'outage_time' group.
pairs = t[t['cluster_size'] == 2]
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.longitude, pairs.latitude), crs={'init':'epsg:4326'})
dist_1 = gpd.GeoSeries(gdf.groupby('outage_time')['geometry'].first(), crs={'init':'aea'})
dist_2 = gpd.GeoSeries(gdf.groupby('outage_time')['geometry'].last(), crs={'init':'aea'})
agglom_distances = (np.vectorize(calc_dist)(dist_1, dist_2))

# Distances for sensors of cluster size 3: first, second (nth(1)) and last
# point of each 'outage_time' group.
trios = t[t['cluster_size'] == 3]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.longitude, trios.latitude), crs={'init':'epsg:4326'})
dist_1_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].first(), crs={'init':'aea'})
dist_2_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].nth(1), crs={'init':'aea'})
dist_3_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].last(), crs={'init':'aea'})

# One column per edge of the triangle formed by the three points.
agglom_trios_distances = pd.DataFrame(np.vectorize(calc_dist)(dist_1_t, dist_2_t)).rename(columns={0: '1->2'})
agglom_trios_distances['2->3'] = (np.vectorize(calc_dist)(dist_2_t, dist_3_t))
agglom_trios_distances['3->1'] = (np.vectorize(calc_dist)(dist_3_t, dist_1_t))
# Bare expression in the middle of a cell: evaluated but not displayed.
agglom_trios_distances

# agglom_dist_for_3 pools all three edges of every trio; use it for measuring
# clustering success.
agglom_dist_for_3 = list(agglom_trios_distances['1->2'].values) + list(agglom_trios_distances['2->3'].values) + list(agglom_trios_distances['3->1'].values)

# Now calculate the share of distances within 550 m (fractions between 0 and
# 1, not x100).
# Share under the cutoff for pairs:
agglom_pair_percent_under_550 = len(agglom_distances[agglom_distances < 550])/len(agglom_distances)

# Share under the cutoff for trios:
agglom_dist_for_3 = np.array(agglom_dist_for_3)
agglom_trio_percent_under_550 = len(agglom_dist_for_3[agglom_dist_for_3 < 550])/len(agglom_dist_for_3)

agglom_pair_percent_under_550, agglom_trio_percent_under_550

In [None]:
# STDBSCAN Euclidean

# Count rows per cluster so clusters of exactly 2 or 3 sensors can be selected
# for the distance calculations.
# NOTE(review): `stdb_clustered` is not defined in the cells visible here.
# `c` holds the per-column non-null counts of each cluster; c['time'] is used
# as the cluster size.
c = stdb_clustered.groupby('cluster').count()
pair_index = c[c['time'] == 2].index

def calc_dist(dist1, dist2):
    """Return the distance in meters between two geometric points.

    Both points are reprojected from 'EPSG:4326' to the 'aea' projection
    before the distance between them is measured.
    """
    # Same projection for both inputs, so build the callable once.
    reproject = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(proj='aea'))
    first_projected = ops.transform(reproject, dist1)
    second_projected = ops.transform(reproject, dist2)
    return first_projected.distance(second_projected)

# Distances for clusters of 2: first vs. last point of each cluster.
pairs = stdb_clustered[stdb_clustered['cluster'].isin(pair_index)]
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.location_longitude, pairs.location_latitude), crs={'init':'epsg:4326'})
dist_1 = gpd.GeoSeries(gdf.groupby('cluster')['geometry'].first(), crs={'init':'aea'})
dist_2 = gpd.GeoSeries(gdf.groupby('cluster')['geometry'].last(), crs={'init':'aea'})
stdb_distances = (np.vectorize(calc_dist)(dist_1, dist_2))

# Distances for clusters of 3: first, second (nth(1)) and last point of each
# cluster.
trio_index = c[c['time'] == 3].index
trios = stdb_clustered[stdb_clustered['cluster'].isin(trio_index)]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude), crs={'init':'epsg:4326'})
dist_1_t = gpd.GeoSeries(gdf_trios.groupby('cluster')['geometry'].first(), crs={'init':'aea'})
dist_2_t = gpd.GeoSeries(gdf_trios.groupby('cluster')['geometry'].nth(1), crs={'init':'aea'})
dist_3_t = gpd.GeoSeries(gdf_trios.groupby('cluster')['geometry'].last(), crs={'init':'aea'})

# One column per edge of the triangle formed by the three points.
stdb_trios_distances = pd.DataFrame(np.vectorize(calc_dist)(dist_1_t, dist_2_t)).rename(columns={0: '1->2'})
stdb_trios_distances['2->3'] = (np.vectorize(calc_dist)(dist_2_t, dist_3_t))
stdb_trios_distances['3->1'] = (np.vectorize(calc_dist)(dist_3_t, dist_1_t))




# Share of pair distances under 550 m (a fraction between 0 and 1, not x100).
stdb_pair_percent_under_550= len(stdb_distances[stdb_distances < 550])/len(stdb_distances)

# Share of trio distances under 550 m: all three edges of every trio are pooled.
stdb_dist_for_3 = list(stdb_trios_distances['1->2'].values) + list(stdb_trios_distances['2->3'].values) + list(stdb_trios_distances['3->1'].values)
stdb_dist_for_3 = np.array(stdb_dist_for_3)
stdb_trio_percent_under_550 = len(stdb_dist_for_3[stdb_dist_for_3 < 550])/len(stdb_dist_for_3)

stdb_pair_percent_under_550, stdb_trio_percent_under_550

# Plot 2b: Low Voltage Success (Logical)

In [None]:
# DBSCAN Logical Grid
#
# For every DBSCAN cluster of 2 or 3 sensors, look up the logical grid distance
# between each pair of sensor ids in `logical` (one row per
# level_0 / level_1 id pair), then report the share of distances equal to 1.

# .copy() so the distance columns added below land on independent frames
# rather than on slices of `db` (avoids SettingWithCopyWarning).
two_ids = db[db['cluster_size'] == 2].copy()
three_ids = db[db['cluster_size'] == 3].copy()
pair_logical_dist = []
trio_logical_dist_1 = []
trio_logical_dist_2 = []
trio_logical_dist_3 = []

for ids in two_ids['ids'].values:
    id_1, id_2 = ids[0], ids[1]
    # .values[0] takes the first matching row; it raises IndexError when the
    # (id_1, id_2) pair is missing from `logical`.
    pair_logical_dist.append(logical[(logical['level_0'] == id_1) & (logical['level_1'] == id_2)]['logical_grid_distance'].values[0])

for ids in three_ids['ids'].values:
    id_1, id_2, id_3 = ids[0], ids[1], ids[2]
    # The three edges of the trio: 1->2, 2->3, 3->1.
    trio_logical_dist_1.append(logical[(logical['level_0'] == id_1) & (logical['level_1'] == id_2)]['logical_grid_distance'].values[0])
    trio_logical_dist_2.append(logical[(logical['level_0'] == id_2) & (logical['level_1'] == id_3)]['logical_grid_distance'].values[0])
    trio_logical_dist_3.append(logical[(logical['level_0'] == id_3) & (logical['level_1'] == id_1)]['logical_grid_distance'].values[0])

two_ids['logical_distance'] = pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3
db_logical_pairs = two_ids
db_logical_trios = three_ids

# Share of outage pairs that are under the same transformer (logical distance 1).
db_pair_percent_under_same_transformer = len(db_logical_pairs[db_logical_pairs['logical_distance'] ==1])/len(db_logical_pairs)

# Share of trio edges that are under the same transformer; all three edges of
# every trio are pooled.
log_dist_for_3 = list(db_logical_trios['log_dist_1']) + list(db_logical_trios['log_dist_2']) + list(db_logical_trios['log_dist_3'])
log_dist_for_3 = pd.Series(log_dist_for_3)
db_trio_percent_under_same_transformer = len(log_dist_for_3[log_dist_for_3 == 1])/len(log_dist_for_3)

db_pair_percent_under_same_transformer, db_trio_percent_under_same_transformer

In [None]:
#Agglom Logical Grid 

#we have to wait for agglom to have core_id's in order to calculate logical grid dist 
#this needs to happen either by merging dataframes or by adding them in spark
#once you have the core_id's you should be able to easily copy and paste the code for the DBSCANs

In [None]:
# STDBSCAN Logical Grid
#
# For every STDBSCAN cluster of 2 or 3 sensors, look up the logical grid
# distance between each pair of sensor ids in `logical` (one row per
# level_0 / level_1 id pair), then report the share of distances equal to 1.

# .copy() so the distance columns added below land on independent frames
# rather than on slices of `stdb` (avoids SettingWithCopyWarning).
two_ids = stdb[stdb['cluster_size'] == 2].copy()
three_ids = stdb[stdb['cluster_size'] == 3].copy()
pair_logical_dist = []
trio_logical_dist_1 = []
trio_logical_dist_2 = []
trio_logical_dist_3 = []

for ids in two_ids['ids'].values:
    id_1, id_2 = ids[0], ids[1]
    # .values[0] takes the first matching row; it raises IndexError when the
    # (id_1, id_2) pair is missing from `logical`.
    pair_logical_dist.append(logical[(logical['level_0'] == id_1) & (logical['level_1'] == id_2)]['logical_grid_distance'].values[0])

for ids in three_ids['ids'].values:
    id_1, id_2, id_3 = ids[0], ids[1], ids[2]
    # The three edges of the trio: 1->2, 2->3, 3->1.
    trio_logical_dist_1.append(logical[(logical['level_0'] == id_1) & (logical['level_1'] == id_2)]['logical_grid_distance'].values[0])
    trio_logical_dist_2.append(logical[(logical['level_0'] == id_2) & (logical['level_1'] == id_3)]['logical_grid_distance'].values[0])
    trio_logical_dist_3.append(logical[(logical['level_0'] == id_3) & (logical['level_1'] == id_1)]['logical_grid_distance'].values[0])

two_ids['logical_distance'] = pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3
stdb_logical_pairs = two_ids
stdb_logical_trios = three_ids

# Share of outage pairs that are under the same transformer (logical distance 1).
stdb_pair_percent_under_same_transformer = len(stdb_logical_pairs[stdb_logical_pairs['logical_distance'] ==1])/len(stdb_logical_pairs)

# Share of trio edges that are under the same transformer; all three edges of
# every trio are pooled.
log_dist_for_3 = list(stdb_logical_trios['log_dist_1']) + list(stdb_logical_trios['log_dist_2']) + list(stdb_logical_trios['log_dist_3'])
log_dist_for_3 = pd.Series(log_dist_for_3)
stdb_trio_percent_under_same_transformer = len(log_dist_for_3[log_dist_for_3 == 1])/len(log_dist_for_3)

stdb_pair_percent_under_same_transformer, stdb_trio_percent_under_same_transformer

# Plot 3: Outage Size v. time variance 

### DBSCAN

In [152]:
# DBSCAN: standard deviation of the outage times inside each cluster, averaged
# per cluster size. The per-cluster values come from dbscan_outages and are
# written onto db positionally (.values drops the index).
plt.figure(figsize=(10, 10))
db['outage_time_stddev'] = dbscan_outages['outage_time'].apply(lambda times: np.std(times)).values
db_clust_stddev = (
    db
    .groupby('cluster_size')['outage_time_stddev']
    .apply(np.mean)
    .reset_index()[['cluster_size', 'outage_time_stddev']]
)
db_clust_stddev.to_csv('../analysis-figures/clustering-analysis/db_time_stddev.csv')
sns.lineplot(data=db_clust_stddev, x='cluster_size', y='outage_time_stddev', label='DBSCAN')

### AGGLOMERATIVE

In [54]:
# AGGLOMERATIVE: standard deviation of the outage times inside each cluster,
# averaged per cluster size.
agglomerative_outages['outage_time_stddev'] = agglomerative_outages['outage_time'].apply(lambda times: np.std(times)).values
agglom_clust_stddev = (
    agglomerative_outages
    .groupby('cluster_size')['outage_time_stddev']
    .apply(np.mean)
    .reset_index()[['cluster_size', 'outage_time_stddev']]
)
agglom_clust_stddev.to_csv('../analysis-figures/clustering-analysis/agglom_time_stddev.csv')
sns.lineplot(data=agglom_clust_stddev, x='cluster_size', y='outage_time_stddev', label='AGGLOMERATIVE')
plt.title('Outage Size v. Average Time Range of Outage')

### STDBSCAN

In [145]:
# STDBSCAN: standard deviation of the outage times inside each cluster,
# averaged per cluster size.
stdbscan_outages['outage_time_stddev'] = stdbscan_outages['outage_time'].apply(lambda times: np.std(times)).values
stdb_clust_stddev = (
    stdbscan_outages
    .groupby('cluster_size')['outage_time_stddev']
    .apply(np.mean)
    .reset_index()[['cluster_size', 'outage_time_stddev']]
)
stdb_clust_stddev.to_csv('../analysis-figures/clustering-analysis/stdb_time_stddev.csv')
sns.lineplot(data=stdb_clust_stddev, x='cluster_size', y='outage_time_stddev', label='STDBSCAN')
plt.title('Outage Size v. Average Time Range of Outage')

# Plot 4: Percent in Convex Hull 

In [150]:
# DBSCAN: mean of percent_pow_within_outage at each cluster size.
plt.figure(figsize=(10, 10))
db_convex_hull = (
    dbscan_outages
    .groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .reset_index()
)
db_convex_hull.to_csv('../analysis-figures/clustering-analysis/db_convex_hull.csv')
sns.lineplot(data=db_convex_hull, x='cluster_size', y='percent_pow_within_outage', label='DBSCAN')
plt.title('Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

In [55]:
# AGGLOMERATIVE: mean of percent_pow_within_outage at each cluster size.
agglom_convex_hull = (
    agglomerative_outages
    .groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .reset_index()
)
agglom_convex_hull.to_csv('../analysis-figures/clustering-analysis/agglom_convex_hull.csv')
sns.lineplot(data=agglom_convex_hull, x='cluster_size', y='percent_pow_within_outage', label='AGGLOMERATIVE')
plt.title('Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

In [151]:
# STDBSCAN: mean of percent_pow_within_outage at each cluster size.
stdb_convex_hull = (
    stdbscan_outages
    .groupby('cluster_size')['percent_pow_within_outage']
    .apply(np.mean)
    .reset_index()
)
stdb_convex_hull.to_csv('../analysis-figures/clustering-analysis/stdb_convex_hull.csv')
sns.lineplot(data=stdb_convex_hull, x='cluster_size', y='percent_pow_within_outage', label='STDBSCAN')
plt.title('Outage Size v. Average Percent Sensors within the Convex Hull of the Outage')

# Plot 5: SAIFI Calculations 

In [None]:
#SAIFI Calculations are currently being calculated for the entire time period. Make sure to split the data into July, Aug, Sept 

In [None]:
# DBSCAN
# Numerator: the sum of all cluster sizes in db.
db_SAIFI_num = sum(db['cluster_size'].values)
# Denominator: number of distinct core_id values in pw multiplied by the
# number of rows in db.
# NOTE(review): `pw` is not defined in the cells visible here; also confirm
# that scaling by len(db) is the intended SAIFI denominator.
db_SAIFI_denom = len(pw['core_id'].unique())*(len(db))
db_SAIFI = db_SAIFI_num/db_SAIFI_denom
db_SAIFI

In [None]:
# Agglomerative
# Numerator: the sum of all cluster sizes in agglom.
agglom_SAIFI_num = sum(agglom['cluster_size'].values)
# Denominator: number of distinct core_id values in pw multiplied by the
# number of rows in agglom.
# NOTE(review): `pw` is not defined in the cells visible here; also confirm
# that scaling by len(agglom) is the intended SAIFI denominator.
agglom_SAIFI_denom = len(pw['core_id'].unique())*(len(agglom))
agglom_SAIFI = agglom_SAIFI_num/agglom_SAIFI_denom
agglom_SAIFI

In [None]:
# STDBSCAN
# Numerator: the sum of all cluster sizes in stdb.
stdb_SAIFI_num = sum(stdb['cluster_size'].values)
# Denominator: number of distinct core_id values in pw multiplied by the
# number of rows in stdb.
# NOTE(review): `pw` is not defined in the cells visible here; also confirm
# that scaling by len(stdb) is the intended SAIFI denominator.
stdb_SAIFI_denom = len(pw['core_id'].unique())*(len(stdb))
stdb_SAIFI = stdb_SAIFI_num/stdb_SAIFI_denom
stdb_SAIFI