In [1]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import time 
import datetime
from datetime import datetime
from datetime import timezone
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics 
import re
import pylab
from scipy.stats import norm
import geopandas as gpd 
import shapely.geometry as geometry
import shapely.ops as ops 
from functools import partial 
import pyproj

In [2]:
# Outage transition points from July 2018.
OUTAGES_PARQUET = 'part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet'
outages = pd.read_parquet(OUTAGES_PARQUET)
outages.head()

In [3]:
# Every reading from July 2018 (not just the outage transition points).
# Only use this frame for the SAIFI and convex hull calculations.
PW_PARQUET = 'part-00000-602cb425-c6be-40be-8024-aeb92fcb4315-c000.gz.parquet'
PW_DROPPED_COLUMNS = ['product_id', 'millis', 'last_plug_millis', 'last_unplug_millis']
pw = pd.read_parquet(PW_PARQUET).drop(columns=PW_DROPPED_COLUMNS)
pw.head()

In [4]:
# Logical grid distance CSV. Later cells look rows up through its
# 'level_0' / 'level_1' / 'logical_grid_distance' columns.
#
# The path is anchored at the current user's home directory (pandas expands a
# leading '~') so the notebook is not tied to one machine's absolute path.
GRID_DISTANCE_CSV = '~/gridwatch-data-analysis/grid_distance.csv'
logical = pd.read_csv(GRID_DISTANCE_CSV)
logical = logical.reset_index()
# Preview only; displaying the whole table bloats the notebook output.
logical.head()

In [39]:
# DBSCAN over outage_time for a month's worth of data.
# NOTE: month_out is the same object as outages (no copy is taken), so the
# 'z' column added here shows up on outages as well.
month_out = outages
month_out['z'] = 0  # constant second feature next to outage_time
X = month_out[['outage_time', 'z']]
out_cluster = StandardScaler().fit_transform(X)

db = DBSCAN(eps=0.0001, algorithm='ball_tree', min_samples=2)
db = db.fit(out_cluster)
labels = db.labels_

# DBSCAN marks noise points with the label -1.
no_noise = int(np.count_nonzero(labels == -1))
no_clusters = len(np.unique(labels)) - (1 if -1 in labels else 0)
no_noise, no_clusters

In [40]:
# Attach each row's DBSCAN label, then collapse every label into a single row
# whose cells hold the member outage times, coordinates and sensor ids as lists.
month_out['labels'] = labels
month_out['core_id'] = outages['core_id']

rows_by_label = month_out.groupby(['labels'])
month_clust = pd.DataFrame({
    'outage_time': rows_by_label['outage_time'].apply(lambda x: x.tolist()),
    'latitude': rows_by_label['location_latitude'].apply(lambda x: x.tolist()),
    'longitude': rows_by_label['location_longitude'].apply(lambda x: x.tolist()),
    'ids': rows_by_label['core_id'].apply(lambda x: x.tolist()),
})

# Remove the DBSCAN noise group (label -1) by label rather than by position:
# dropping "the first row" would discard a real cluster whenever the run
# produced no noise points. .copy() gives an independent frame so that adding
# columns to it later does not write into a slice.
month_clust = month_clust[month_clust.index != -1].copy()

def find_range(lst):
    """Return the spread of ``lst``: its largest value minus its smallest."""
    smallest = min(lst)
    largest = max(lst)
    return largest - smallest

# Spread of every cluster in time, latitude and longitude.
# Series.apply is used instead of np.vectorize: it calls find_range once per
# row just the same, but also copes with a frame that has no clusters
# (np.vectorize raises on zero-length input).
month_clust['time_range'] = month_clust['outage_time'].apply(find_range)
month_clust['lat_range'] = month_clust['latitude'].apply(find_range)
month_clust['long_range'] = month_clust['longitude'].apply(find_range)

# Number of rows per label; assigned by index alignment on 'labels'.
month_clust['cluster_size'] = month_out.groupby(['labels'])['outage_time'].count()

# First and last outage time of every cluster.
month_clust['min_time'] = month_clust['outage_time'].apply(lambda times: min(times))
month_clust['max_time'] = month_clust['outage_time'].apply(lambda times: max(times))

month_clust.head()

In [41]:
# For every outage cluster, collect the coordinates of the *powered* sensor
# readings that were reported inside the cluster's [min_time, max_time] window.
# Later these coordinates are tested against the convex hull of the outage.

# Keep only readings where the sensor reported power (is_powered is True).
# .copy() gives an independent frame so the 'time' column can be rewritten
# without writing into a slice of pw.
powered = pw[pw['is_powered']].copy()
# Convert each reading time to a POSIX timestamp, labelling it as UTC first.
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

powered_time = powered['time'].values
powered_lat = powered['location_latitude'].values
powered_long = powered['location_longitude'].values

within_time_lat = []
within_time_long = []
# One boolean mask per cluster instead of a Python loop over every
# (cluster, reading) pair. .tolist() keeps plain Python lists in the cells,
# which the padding step further down relies on (list * n repeats the list).
for window_start, window_end in zip(month_clust['min_time'].values, month_clust['max_time'].values):
    in_window = (powered_time >= window_start) & (powered_time <= window_end)
    within_time_lat.append(powered_lat[in_window].tolist())
    within_time_long.append(powered_long[in_window].tolist())

month_clust['within_time_lat'] = within_time_lat
month_clust['within_time_long'] = within_time_long






# Pad short coordinate lists so that every row can later be converted into a
# Polygon by geopandas:
#   * no powered readings in the window  -> placeholder coordinates [0, 1, 2]
#   * one or two powered readings        -> the lists repeated three times
#   * outage clusters of fewer than 3    -> the lists repeated twice
padded_within_lat = []
padded_within_long = []
for lat_list, long_list in zip(month_clust['within_time_lat'], month_clust['within_time_long']):
    # The latitude list's length decides the padding for both lists.
    if len(lat_list) == 0:
        lat_list, long_list = [0, 1, 2], [0, 1, 2]
    elif len(lat_list) < 3:
        lat_list, long_list = lat_list * 3, long_list * 3
    padded_within_lat.append(lat_list)
    padded_within_long.append(long_list)

is_small_cluster = list(month_clust['cluster_size'] < 3)
padded_lat = [
    coords * 2 if small else coords
    for coords, small in zip(month_clust['latitude'], is_small_cluster)
]
padded_long = [
    coords * 2 if small else coords
    for coords, small in zip(month_clust['longitude'], is_small_cluster)
]

update_lat = pd.Series(padded_lat, index=month_clust.index)
update_long = pd.Series(padded_long, index=month_clust.index)
update_within_lat = pd.Series(padded_within_lat, index=month_clust.index)
update_within_long = pd.Series(padded_within_long, index=month_clust.index)

month_clust['latitude'] = update_lat
month_clust['longitude'] = update_long
month_clust['within_time_long'] = update_within_long
month_clust['within_time_lat'] = update_within_lat





# Prepare the inputs for the convex hull step: two working copies of the
# cluster frame, plus one list of (longitude, latitude) pairs per cluster for
# the powered readings and one for the outage readings.
power = month_clust.copy()
out = month_clust.copy()

powered_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(month_clust['within_time_long'], month_clust['within_time_lat'])
]
outage_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(month_clust['longitude'], month_clust['latitude'])
]
    
def unique_coords(coords):
    """Return the distinct entries of ``coords`` in order of first appearance."""
    as_series = pd.Series(coords)
    return as_series.unique()

# Store the raw (longitude, latitude) rings on all three frames.
for frame in (power, out, month_clust):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# The CRS has to be a mapping. A set literal such as {'init', 'epsg:4326'}
# (comma instead of colon) is not a CRS definition.
crs = {'init': 'epsg:4326'}

# powered_poly / outage_poly are rebound here from coordinate lists to
# shapely Polygons, which become the geometry of the two GeoDataFrames.
powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=powered_poly)

outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=outage_poly)

# Now that the polygons exist, reduce the stored coordinate lists to their
# unique points (the padding step may have repeated them).
dedupe = np.vectorize(unique_coords)
for frame in (power, out, month_clust):
    frame['powered_poly'] = dedupe(frame['powered_poly'])
    frame['outage_poly'] = dedupe(frame['outage_poly'])

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull



# Test which powered coordinates fall inside an outage's convex hull.
def in_convex_hull(powered_coords, geom):
    """Return the unique coordinates that lie within the convex hull of ``geom``.

    Parameters
    ----------
    powered_coords : iterable
        Coordinate pairs; each one is passed to ``geometry.Point``.
    geom : geometry object exposing ``convex_hull``
        The shape whose convex hull is tested against.

    Returns
    -------
    The result of ``pd.Series(...).unique()`` over the coordinates for which
    ``hull.contains(point)`` is true.
    """
    # The hull does not depend on the point being tested, so build it once.
    hull = geom.convex_hull
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    return pd.Series(inside).unique()
        
# Run the hull test for every cluster row. Note that this rebinds the name
# in_convex_hull from the function to the list of per-cluster results.
in_convex_hull = [
    in_convex_hull(coords, outage_geom)
    for coords, outage_geom in zip(out['powered_poly'].values, out['geometry'].values)
]
out['powered_within_outage'] = in_convex_hull

# From here on `db` refers to the cluster GeoDataFrame.
db = out

In [42]:
# DBSCAN: number of clusters found for each cluster size.
# size() counts one per cluster row. Counting unique values of 'ids' would
# require hashing list-valued cells and would merge clusters that happen to
# contain the same sensors. The count column keeps the name 'ids' so code that
# reads db_clust_sizes['ids'] keeps working.
db_clust_sizes = db.groupby('cluster_size').size().reset_index(name='ids')

fig, ax = plt.subplots()
sns.barplot(x='cluster_size', y='ids', data=db_clust_sizes, ax=ax)
ax.set_ylabel('Number of Clusters')
ax.set_xlabel('Cluster Size')
ax.set_title('Cluster Size vs. Number of Clusters of this Size');

In [50]:
# DBSCAN Euclidean

# Count the rows behind every cluster label so that clusters of exactly 2 (and
# later 3) sensors can be picked out for pairwise distance calculations.
# Rows labelled -1 are DBSCAN noise, not a cluster; they are excluded so that
# a run with exactly 2 or 3 noise points is not mistaken for a pair or trio.
c = month_out[month_out['labels'] != -1].groupby('labels').count()
pair_index = c[c['time'] == 2].index

# WGS84 ellipsoid used for geodesic distances between longitude/latitude points.
_WGS84_GEOD = pyproj.Geod(ellps='WGS84')


def calc_dist(dist1, dist2):
    """Return the distance in meters between two longitude/latitude points.

    Parameters
    ----------
    dist1, dist2 : point geometries with ``x`` (longitude) and ``y`` (latitude)
        Coordinates are taken to be EPSG:4326 degrees.

    Returns
    -------
    float
        Geodesic distance on the WGS84 ellipsoid, in meters.

    The distance is computed directly on the ellipsoid rather than by
    projecting to an Albers equal-area plane: ``proj='aea'`` without standard
    parallels has no well-defined projection, and ``pyproj.transform`` /
    ``Proj(init=...)`` are deprecated.
    """
    # Geod.inv takes (lon1, lat1, lon2, lat2) and returns
    # (forward azimuth, back azimuth, distance in meters).
    _, _, meters = _WGS84_GEOD.inv(dist1.x, dist1.y, dist2.x, dist2.y)
    return meters

# Distance threshold, in meters, used for the "share under" figures below.
DIST_THRESHOLD_M = 550
# The sensor coordinates are longitude/latitude, so every geometry container
# built here is labelled EPSG:4326 ({'init': 'aea'} is not a CRS definition,
# and calc_dist expects longitude/latitude input).
LONLAT_CRS = {'init': 'epsg:4326'}
pairwise_dist = np.vectorize(calc_dist)

# distances for clusters of 2:
# .copy() so the GeoDataFrame is not built on a slice of month_out.
pairs = month_out[month_out['labels'].isin(pair_index)].copy()
gdf = gpd.GeoDataFrame(
    pairs,
    geometry=gpd.points_from_xy(pairs.location_longitude, pairs.location_latitude),
    crs=LONLAT_CRS)
dist_1 = gpd.GeoSeries(gdf.groupby('labels')['geometry'].first(), crs=LONLAT_CRS)
dist_2 = gpd.GeoSeries(gdf.groupby('labels')['geometry'].last(), crs=LONLAT_CRS)
db_distances = pairwise_dist(dist_1, dist_2)

# distances for clusters of 3:
trio_index = c[c['time'] == 3].index
trios = month_out[month_out['labels'].isin(trio_index)].copy()
gdf_trios = gpd.GeoDataFrame(
    trios,
    geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude),
    crs=LONLAT_CRS)
trio_points = gdf_trios.groupby('labels')['geometry']
dist_1_t = gpd.GeoSeries(trio_points.first(), crs=LONLAT_CRS)
# NOTE(review): the three series are paired by position, so nth(1) must come
# back in the same label order as first()/last() - confirm for the installed
# pandas version.
dist_2_t = gpd.GeoSeries(trio_points.nth(1), crs=LONLAT_CRS)
dist_3_t = gpd.GeoSeries(trio_points.last(), crs=LONLAT_CRS)

# One row per trio, one column per side of the triangle.
db_trios_distances = pd.DataFrame({
    '1->2': pairwise_dist(dist_1_t, dist_2_t),
    '2->3': pairwise_dist(dist_2_t, dist_3_t),
    '3->1': pairwise_dist(dist_3_t, dist_1_t),
})

# share (0-1) of pair distances under the threshold:
db_pair_percent_under_550 = float(np.mean(db_distances < DIST_THRESHOLD_M))

# share (0-1) of trio side lengths under the threshold:
db_dist_for_3 = np.concatenate([
    db_trios_distances['1->2'].values,
    db_trios_distances['2->3'].values,
    db_trios_distances['3->1'].values,
])
db_trio_percent_under_550 = float(np.mean(db_dist_for_3 < DIST_THRESHOLD_M))

db_pair_percent_under_550, db_trio_percent_under_550

In [52]:
# DBSCAN Logical Grid

# Index the logical-distance table once, keyed by the ordered id pair
# (level_0, level_1), instead of scanning the whole table for every lookup.
# When a pair occurs more than once the first row wins.
_logical_unique = logical.drop_duplicates(subset=['level_0', 'level_1'])
logical_lookup = dict(zip(
    zip(_logical_unique['level_0'], _logical_unique['level_1']),
    _logical_unique['logical_grid_distance'],
))


def logical_distance(id_a, id_b):
    """Return the logical grid distance stored for the ordered pair (id_a, id_b).

    Raises KeyError when the table has no row for that pair.
    """
    return logical_lookup[(id_a, id_b)]


# .copy() so the new columns are added to independent frames rather than to
# slices of db.
two_ids = db[db['cluster_size'] == 2].copy()
three_ids = db[db['cluster_size'] == 3].copy()

# Each 'ids' cell is the list of sensor ids in that cluster.
pair_logical_dist = [logical_distance(ids[0], ids[1]) for ids in two_ids['ids']]
trio_logical_dist_1 = [logical_distance(ids[0], ids[1]) for ids in three_ids['ids']]
trio_logical_dist_2 = [logical_distance(ids[1], ids[2]) for ids in three_ids['ids']]
trio_logical_dist_3 = [logical_distance(ids[2], ids[0]) for ids in three_ids['ids']]

two_ids['logical_distance'] = pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3
db_logical_pairs = two_ids
db_logical_trios = three_ids

# share (0-1) of outage pairs that are under the same transformer
# (logical distance equal to 1)
db_pair_percent_under_same_transformer = float((db_logical_pairs['logical_distance'] == 1).mean())

# share (0-1) of outage trio links that are under the same transformer
log_dist_for_3 = list(db_logical_trios['log_dist_1']) + list(db_logical_trios['log_dist_2']) + list(db_logical_trios['log_dist_3'])
log_dist_for_3 = pd.Series(log_dist_for_3)
db_trio_percent_under_same_transformer = float((log_dist_for_3 == 1).mean())

db_pair_percent_under_same_transformer, db_trio_percent_under_same_transformer