In [2]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import time 
import datetime
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics 
import re
import pylab
from scipy.stats import norm
import geopandas as gpd 
import shapely.geometry as geometry
import shapely.ops as ops 
from functools import partial 
import pyproj

In [3]:
# Transition points of the outages.
OUTAGES_PARQUET = 'part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet'
outages = pd.read_parquet(OUTAGES_PARQUET)
outages.head()

In [4]:
# All of the data from July 2018 (not just outage transition points!).
# Only use this for SAIFI.
PW_PARQUET = 'part-00000-602cb425-c6be-40be-8024-aeb92fcb4315-c000.gz.parquet'
UNUSED_PW_COLUMNS = ['product_id', 'millis', 'last_plug_millis', 'last_unplug_millis']
pw = pd.read_parquet(PW_PARQUET).drop(columns=UNUSED_PW_COLUMNS)
pw.head()

In [5]:
#DBSCAN(time) on a month's worth of data
# Work on a copy. The original `month_out = outages` was only an alias, so the
# helper column added here (and the 'labels' column added by the next cell)
# silently ended up in the raw `outages` frame that later cells slice again.
month_out = outages.copy()
# Constant second feature: DBSCAN is fed a 2-D array, but only time varies.
month_out['z'] = 0
X = month_out[['outage_time', 'z']]
out_cluster = StandardScaler().fit_transform(X)
db = DBSCAN(eps=0.0001, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_
# Label -1 is treated as noise; it is excluded from the cluster count.
mo_noise = list(labels).count(-1)
mo_clusters = len(set(labels)) - (1 if -1 in labels else 0)
mo_noise, mo_clusters

In [6]:
# Attach the DBSCAN label to every outage and collect, per label, the lists of
# outage times / latitudes / longitudes that fell into that cluster.
month_out['labels'] = labels
by_label = month_out.groupby('labels')
month_clust = pd.DataFrame({
    'outage_time': by_label['outage_time'].apply(list),
    'latitude': by_label['location_latitude'].apply(list),
    'longitude': by_label['location_longitude'].apply(list),
})
# Drop the noise group by its label. The original `iloc[1:]` dropped whatever
# group sorted first, i.e. cluster 0 whenever the run produced no noise points.
month_clust = month_clust.drop(index=-1, errors='ignore')
month_clust.head()

In [7]:
def find_range(lst):
    """Return the spread of ``lst``: its largest element minus its smallest."""
    smallest, largest = min(lst), max(lst)
    return largest - smallest

# For every cluster, store the spread (max - min) of its times, latitudes and longitudes.
RANGE_COLUMNS = (
    ('outage_time', 'time_range'),
    ('latitude', 'lat_range'),
    ('longitude', 'long_range'),
)
for source_col, range_col in RANGE_COLUMNS:
    month_clust[range_col] = np.vectorize(find_range)(month_clust[source_col])
month_clust.head()

In [8]:
#adjust bin size as well as the x and y bounds of the graph
#this is a histogram of the range of times for each cluster
time_ranges = month_clust['time_range']
# plt.hist plots raw counts by default. Weighting every cluster by 100/N makes
# the bar heights the percentages that the y-label promises.
percent_weights = np.full(len(time_ranges), 100.0 / max(len(time_ranges), 1))
plt.hist(time_ranges, weights=percent_weights)
plt.title('DBSCAN(time): A Distribution for the Range of Times in a Cluster')
plt.xlabel('Time Range in a Cluster')
plt.ylabel('Percentage of Clusters')

In [9]:
# Histogram of the latitude spread of each cluster.
lat_ranges = month_clust['lat_range']
# plt.hist plots raw counts by default. Weighting every cluster by 100/N makes
# the bar heights the percentages that the y-label promises.
percent_weights = np.full(len(lat_ranges), 100.0 / max(len(lat_ranges), 1))
plt.hist(lat_ranges, weights=percent_weights)
plt.title('DBSCAN(time): A Distribution for the Range of Latitude in a Cluster')
plt.xlabel('Latitude Range in a Cluster')
plt.ylabel('Percentage of Clusters')

In [10]:
# Histogram of the longitude spread of each cluster.
long_ranges = month_clust['long_range']
# plt.hist plots raw counts by default. Weighting every cluster by 100/N makes
# the bar heights the percentages that the y-label promises.
percent_weights = np.full(len(long_ranges), 100.0 / max(len(long_ranges), 1))
plt.hist(long_ranges, weights=percent_weights)
plt.title('DBSCAN(time): A Distribution for the Range of Longitude in a Cluster')
plt.xlabel('Longitude Range in a Cluster')
plt.ylabel('Percentage of Clusters')

In [11]:
#DBSCAN(time + location) on a month's worth of data
#was going to try to scale it, but first understand why this doesn't work !
FEATURE_COLUMNS = ['outage_time', 'location_latitude', 'location_longitude']
Y = month_out[FEATURE_COLUMNS]
out_cluster = StandardScaler().fit_transform(Y)
db = DBSCAN(eps=0.1, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_
# Label -1 is the noise bucket: count it separately and keep it out of the cluster total.
mo_noise = int(np.sum(labels == -1))
mo_clusters = len(set(labels) - {-1})
mo_noise, mo_clusters

In [12]:
#used for SAIDI and SAIFI calculations
# Keep only the readings taken before 3 July 2018.
WINDOW_END = datetime.datetime(2018, 7, 3)
pw_df = pw[pw['time'] < WINDOW_END]
unpowered_readings = pw_df[pw_df['is_powered'] == False]
len(pw_df), len(outages), len(unpowered_readings)

In [13]:
#OPTICS clustering only time
# 172800 = 2 * 86400: a two-day window if outage_time is in seconds
# (presumably epoch seconds -- the plot titles speak of 7/1/18 - 7/2/18).
window_end = outages['outage_time'].min() + 172800
# .copy() so the columns added below are written to an independent frame
# instead of a slice of `outages` (the original assigned to the slice directly).
days = outages[outages['outage_time'] <= window_end].copy()
# Constant first feature: only the time dimension actually varies.
days['zeros'] = 0
out_cluster = days[['zeros', 'outage_time']]
out_cluster = StandardScaler().fit_transform(out_cluster)
optics = OPTICS(max_eps=0.7, algorithm='ball_tree').fit(out_cluster)
labels = optics.labels_
# Label -1 is treated as noise; it is excluded from the cluster count.
n_noise = list(labels).count(-1)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise, n_clusters

In [14]:
# Attach the OPTICS labels and split the two-day frame into one frame per cluster.
days['labels'] = labels
day_a, day_b, day_c, day_d, day_e = (days[days['labels'] == k] for k in range(5))
unlabeled = days[days['labels'] == -1]

plt.figure(figsize=(10, 10))
plt.scatter(unlabeled['location_longitude'], unlabeled['location_latitude'], c='y', label='noise')
for cluster_id, members in enumerate((day_a, day_b, day_c, day_d, day_e)):
    plt.scatter(members['location_longitude'], members['location_latitude'],
                label='cluster {}'.format(cluster_id))
plt.title('Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
# Remember this map's axis limits; later cells pass them back to plt.xlim / plt.ylim.
# Note: plt.ylim() returns (bottom, top), so `top` holds the lower limit. The pair
# is handed back in the same order, so the reuse is unaffected.
left, right = plt.xlim()
top, bottom = plt.ylim()

In [15]:
# Print how far every later row of OPTICS cluster 0 is from the cluster's first outage time.
times_a = day_a['outage_time'].values
time_a = times_a[0]
for other_time in times_a[1:]:
    print(time_a - other_time)

In [16]:
# Print how far every later row of OPTICS cluster 1 is from the cluster's first outage time.
times_b = day_b['outage_time'].values
time_b = times_b[0]
for other_time in times_b[1:]:
    print(time_b - other_time)

In [17]:
# Print how far every later row of OPTICS cluster 2 is from the cluster's first outage time.
times_c = day_c['outage_time'].values
time_c = times_c[0]
for other_time in times_c[1:]:
    print(time_c - other_time)

In [18]:
# SAIFI-style ratio for the OPTICS run:
#   clustered outage rows / (distinct sensors * number of clusters)
clustered_days = days[days['labels'] != -1]
optics_SAIFI_num = len(clustered_days)
# Count the clusters explicitly. The original `nunique() - 1` assumed a noise
# label (-1) was always present and under-counted by one when it was not.
optics_n_clusters = clustered_days['labels'].nunique()
optics_SAIFI_denom = len(pw_df['core_id'].unique()) * optics_n_clusters
# No clusters (or no sensors) means the ratio is undefined, not a ZeroDivisionError.
optics_SAIFI = optics_SAIFI_num / optics_SAIFI_denom if optics_SAIFI_denom else float('nan')
optics_SAIFI

In [19]:
#DBSCAN clustering time + location
out_df = outages[['location_latitude', 'location_longitude', 'outage_time']]
# 172800 = 2 * 86400: a two-day window if outage_time is in seconds (presumably epoch seconds).
# .copy() because the next cell adds a 'labels' column to `day`.
day = out_df[out_df['outage_time'] <= out_df['outage_time'].min() + 172800].copy()
scaled = StandardScaler().fit_transform(day)
# fit_transform returns the columns in the order of `day`, so reuse day.columns.
# The original relabelled them as ['outage_time', 'location_latitude',
# 'location_longitude'], which made the *100 weighting below hit the scaled
# latitude instead of the time column.
out_cluster = pd.DataFrame(scaled, columns=day.columns)
# Up-weight time relative to location.
out_cluster['outage_time'] = out_cluster['outage_time']*100
recluster = out_cluster
# NOTE(review): eps=6 was chosen while the wrong column was being weighted; re-check it.
db = DBSCAN(eps=6, algorithm='ball_tree').fit(recluster)
labels = db.labels_
# Label -1 is treated as noise; it is excluded from the cluster count.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_, n_clusters_

In [20]:
# Attach the DBSCAN labels and split the two-day frame into one frame per cluster (-1 = noise).
day['labels'] = labels
day_0, day_1, day_2, day_3, day_4 = (day[day['labels'] == k] for k in range(5))
day_unlabeled = day[day['labels'] == -1]
labels

In [21]:
# Map of the DBSCAN(time + location) clusters, noise in yellow.
plt.figure(figsize=(10, 10))
plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y', label='noise')
for cluster_id, members in enumerate((day_0, day_1, day_2, day_3, day_4)):
    plt.scatter(members['location_longitude'], members['location_latitude'],
                label='cluster {}'.format(cluster_id))
plt.title('Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
# Remember this map's axis limits; later cells pass them back to plt.xlim / plt.ylim.
# Note: plt.ylim() returns (bottom, top), so `top` holds the lower limit. The pair
# is handed back in the same order, so the reuse is unaffected.
left, right = plt.xlim()
top, bottom = plt.ylim()

In [22]:
#now let's compare that with all the points in cluster 0
# Offsets are measured from the first row of the cluster.
cluster0_times = day.loc[day['labels'] == 0, 'outage_time'].values
time0 = cluster0_times[0]
for other_time in cluster0_times[1:]:
    print(time0 - other_time)

In [23]:
#now let's compare that with all the points in cluster 1
# Offsets are measured from the first row of the cluster.
cluster1_times = day.loc[day['labels'] == 1, 'outage_time'].values
time1 = cluster1_times[0]
for other_time in cluster1_times[1:]:
    print(time1 - other_time)

In [24]:
#now let's compare that with all the points in cluster 2
# Offsets are measured from the first row of the cluster.
cluster2_times = day.loc[day['labels'] == 2, 'outage_time'].values
time2 = cluster2_times[0]
for other_time in cluster2_times[1:]:
    print(time2 - other_time)

In [25]:
#now let's compare that with all the points in cluster 3
# Offsets are measured from the first row of the cluster.
cluster3_times = day.loc[day['labels'] == 3, 'outage_time'].values
time3 = cluster3_times[0]
for other_time in cluster3_times[1:]:
    print(time3 - other_time)

In [26]:
# Noise points (label -1) and, for each, its time offset from the first noise point.
# .copy() so the new column is written to an independent frame, not a slice of `day`.
noise = day[day['labels'] == -1].copy()
noise_times = noise['outage_time'].values
# Number of noise points after the first one.
count = max(len(noise_times) - 1, 0)
# One vectorised subtraction instead of re-filtering `day` on every loop pass.
# The first entry is 0 by construction; an empty noise set gives an empty list
# (the original produced [0] and then failed on the length mismatch).
noise_dist = list(noise_times[0] - noise_times) if len(noise_times) else []
noise['noise_dist'] = noise_dist
noise_dist

In [27]:
# Display the noise rows together with their 'noise_dist' column.
noise

In [31]:
#now let's plot noise points that have very little distance in time
# Hand-picked noise_dist value (presumably read off the noise table); only rows
# with exactly this offset are plotted, on the same axis limits as the cluster map.
EXAMPLE_NOISE_DIST = 105081
ex = noise.loc[noise['noise_dist'] == EXAMPLE_NOISE_DIST]
plt.scatter(ex['location_longitude'], ex['location_latitude'])
plt.xlim((left, right))
plt.ylim((top, bottom))

In [32]:
# Per-label summary of the DBSCAN run: column means, plus the member coordinates
# and outage times as lists and the standard deviation of those times.
labelled_groups = day.groupby('labels')
db_clusters = labelled_groups.mean()
db_clusters['location_latitude'] = labelled_groups['location_latitude'].apply(list)
db_clusters['location_longitude'] = labelled_groups['location_longitude'].apply(list)
db_clusters['outage_times'] = labelled_groups['outage_time'].apply(list)
db_clusters['stddev'] = [np.std(times) for times in db_clusters['outage_times']]
db_clusters
#now eliminate the first row and try to plot norm.pdf

In [33]:
# Overlay the outage-time distributions of the DBSCAN clusters. Row 0 of
# db_clusters is skipped (the labels used here start at row 1), and the row-3
# plot ('cluster_2') was left disabled by the author.
for row, name in ((2, 'cluster_1'), (1, 'cluster_0')):
    sns.distplot(db_clusters['outage_times'].values[row], label=name)
plt.legend()
plt.title('DBSCAN cluster distributions')

In [34]:
# Clustered outages read from parquet (see the cleaning cell below for their origin).
SPARK_OUTAGES_PARQUET = 'part-00000-1a77f616-ace0-482c-9ad1-bdc53a8286bc-c000.gz.parquet'
spark_outages = pd.read_parquet(SPARK_OUTAGES_PARQUET)
spark_outages.head()
 

In [36]:
#reading outages from the pw_finalized_with_string dataframe from outage_aggregator.py and doing some data cleaning
# Keep only real clusters (more than one member). .copy() so the column
# assignments below are written to an independent frame rather than a slice.
spark_outages = spark_outages[spark_outages['cluster_size'] > 1].copy()
# Deliberately an alias, not a copy: the parsed 'location' lists written below
# are read back through `spark_outages` by the pair/trio distance cell.
spark_day = spark_outages
# NOTE: this cell is not re-runnable -- after one run the columns hold lists,
# and re.findall would be handed a list instead of a string.
# Raw strings avoid invalid-escape warnings, and the decimal point is escaped so
# it matches a literal '.' instead of any character.
spark_day['outage_times'] = spark_day['outage_times'].apply(lambda x: re.findall(r'\d+', x))
# Captures one digit before the point, without the sign; the next cell restores the sign.
spark_day['location'] = spark_day['location'].apply(lambda x: re.findall(r'\d\.\d+', x))
# One row per individual outage time, converted from digit strings to integers.
spark_day_exploded = spark_day.explode('outage_times')
spark_day_exploded['outage_times'] = spark_day_exploded['outage_times'].apply(int)
# Re-collect the integer times per cluster, keyed and sorted by 'outage_time'.
unexploded = spark_day_exploded.groupby('outage_time')['outage_times'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')
# Positional assignment: assumes one spark_day row per distinct outage_time -- TODO confirm.
unexploded['location'] = spark_day.sort_values('outage_time')['location'].values

In [48]:
#more data cleaning 
# Rebuild spark_day with one row per cluster (keyed by 'outage_time') holding
# lists of latitudes, longitudes and outage times plus the cluster statistics.
# One row per individual coordinate string, converted to float.
explode_loc = unexploded.explode('location')
explode_loc['location'] = explode_loc['location'].apply(lambda x: float(x))
# Values > 1 are taken to be latitudes and values < 1 longitudes.
# NOTE(review): this split only works for the study area -- presumably
# latitudes above 1 and longitudes between -1 and 0; confirm against the data.
# A value of exactly 1 would land in neither frame.
lat = explode_loc[explode_loc['location'] > 1]
long = explode_loc[explode_loc['location'] < 1]
lat['latitude'] = lat['location']
# The coordinate regex dropped the sign, so longitudes are negated here.
long['longitude'] = long['location']*(-1)
# Collapse back to one row per cluster with a list of latitudes ...
lat = lat.groupby('outage_time')['latitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')
# ... and attach the matching list of longitudes (aligned on the fresh 0..n-1 index).
lat['longitude'] = long.groupby('outage_time')['longitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')['longitude']
# The .values assignments below are positional: they assume spark_day sorted by
# outage_time has exactly one row per group built above -- TODO confirm
# outage_time is unique per cluster.
lat['cluster_size'] = spark_day.sort_values('outage_time')['cluster_size'].values
lat['outage_times_stddev'] = spark_day.sort_values('outage_time')['outage_times_stddev'].values
lat['range'] = spark_day.sort_values('outage_time')['outage_times_range'].values
# Index-aligned assignment (both frames carry a reset 0..n-1 index).
lat['outage_times'] = unexploded.sort_values('outage_time')['outage_times']
# From here on spark_day is the rebuilt per-cluster frame.
spark_day = lat
# Earliest and latest outage time of each cluster.
spark_day['min_time'] = spark_day['outage_times'].apply(lambda x: min(x))
spark_day['max_time'] = spark_day['outage_times'].apply(lambda x: max(x))
spark_day

In [45]:
# SAIFI-style ratio for the agglomerative clusters:
#   total cluster members / (distinct sensors * number of clusters)
agglom_SAIFI_num = spark_day['cluster_size'].values.sum()
distinct_sensors = pw_df['core_id'].unique()
agglom_SAIFI_denom = len(distinct_sensors) * len(spark_day)
agglom_SAIFI = agglom_SAIFI_num / agglom_SAIFI_denom
agglom_SAIFI

In [34]:
# One point per individual outage row: the cluster's outage_time against its size.
sns.scatterplot(data=spark_day_exploded, x='outage_time', y='cluster_size')
plt.title('Outage time vs. Cluster Size for Agglomerative Clusters')

In [35]:
# Overlay the outage-time distributions of rows 6 down to 1 of spark_day.
for row in range(6, 0, -1):
    sns.distplot(spark_day['outage_times'].values[row], label='cluster_{}'.format(row))
# Cluster 0 is drawn from hard-coded outage times rather than from spark_day.
sns.distplot([1530449581, 1530449581, 1530449582], label='cluster_0')
plt.legend()
plt.title('Agglomerative Time Clustering Distributions')
plt.xlabel('outage time distribution (sec)')
plt.ylabel('relative cluster size')

In [36]:
# Same DBSCAN overlay as before, now with axis labels. Row 0 of db_clusters is
# skipped and the row-3 plot ('cluster_2') was left disabled by the author.
for row, name in ((2, 'cluster_1'), (1, 'cluster_0')):
    sns.distplot(db_clusters['outage_times'].values[row], label=name)
plt.legend()
plt.title('DBSCAN cluster distributions')
plt.xlabel('outage time distribution (sec)')
plt.ylabel('relative cluster size')

In [37]:
# Map of rows 1-6 of spark_day, drawn on the axis limits saved from an earlier map.
plt.figure(figsize=(10, 10))
for row in range(1, 7):
    plt.scatter(spark_day['longitude'].values[row], spark_day['latitude'].values[row],
                label='cluster {}'.format(row))
plt.title('Agglomorative Time Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.xlim(left, right)
plt.ylim(top, bottom)

In [19]:
# Map of DBSCAN clusters 0-2 only (the author left noise and clusters 3/4 out),
# drawn on the axis limits saved from an earlier map.
plt.figure(figsize=(10, 10))
for cluster_id, members in enumerate((day_0, day_1, day_2)):
    plt.scatter(members['location_longitude'], members['location_latitude'],
                label='cluster {}'.format(cluster_id))
plt.title('DBSCAN Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.xlim((left, right))
plt.ylim((top, bottom))

In [38]:
# Histogram of the outage-time range of each agglomerative cluster.
agglom_ranges = spark_outages['outage_times_range']
# plt.hist plots raw counts by default. Weighting every cluster by 100/N makes
# the bar heights the percentages that the y-label promises.
percent_weights = np.full(len(agglom_ranges), 100.0 / max(len(agglom_ranges), 1))
plt.hist(agglom_ranges, bins=30, weights=percent_weights)
plt.title('Agglomerative: A Distribution for the Range of Times in a Cluster')
plt.xlabel('Time Range in a Cluster')
plt.ylabel('Percentage of Clusters')

In [51]:
#Now let's compute the number of sensors within the convex hull of the outage
#we will start by computing the lat and long values for powered sensors that reported within the time range of the outage
#then, later we will compare these powered coords to see if they are also within the convex hull of the outage
# NOTE(review): the original mask was `~pw['is_powered']`, which selects the
# UNPOWERED readings -- the opposite of what the comments above, the variable
# name and the later "percent powered" plot describe. Confirm before relying on it.
# .copy() so the 'time' conversion is written to an independent frame.
powered = pw[pw['is_powered'] == True].copy()
# Timestamps are interpreted as UTC and converted to epoch seconds so they can
# be compared with the integer min_time / max_time of each cluster.
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Pull the arrays out once instead of calling .values inside a nested loop.
powered_time = powered['time'].values
powered_lat = powered['location_latitude'].values
powered_long = powered['location_longitude'].values

# One independent list per outage. The original `[[]]*len(spark_day)` repeated
# a single shared list, so every outage ended up with every matching reading.
within_time_lat = []
within_time_long = []
for min_time, max_time in zip(spark_day['min_time'].values, spark_day['max_time'].values):
    # Readings whose time falls inside this outage's [min_time, max_time] window.
    in_window = (powered_time >= min_time) & (powered_time <= max_time)
    within_time_lat.append(powered_lat[in_window].tolist())
    within_time_long.append(powered_long[in_window].tolist())

spark_day['within_time_lat'] = within_time_lat
spark_day['within_time_long'] = within_time_long
spark_day.head()

In [61]:
#for clusters of 2, duplicate the lat and long points so that these points can also be converted into a Polygon by geopandas
# `list * 2` repeats the coordinate list, so re-running this cell repeats it again.
update_lat = spark_day['latitude'].copy()
update_long = spark_day['longitude'].copy()
small_cluster_rows = [
    row for row in range(len(spark_day))
    if spark_day.iloc[row]['cluster_size'] < 3
]
for row in small_cluster_rows:
    update_lat.values[row] = spark_day.iloc[row]['latitude'] * 2
    update_long.values[row] = spark_day.iloc[row]['longitude'] * 2

spark_day['latitude'] = update_lat
spark_day['longitude'] = update_long
spark_day.head()

In [63]:
#create geodataframes to calculate convex hull
power = spark_day.copy()
out = spark_day.copy()
# For every outage row, pair up (longitude, latitude) tuples: once for the
# readings found inside its time window and once for the outage's own sensors.
powered_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(spark_day['within_time_long'], spark_day['within_time_lat'])
]
outage_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(spark_day['longitude'], spark_day['latitude'])
]
    
def unique_coords(coords):
    """Return the distinct entries of ``coords`` in order of first appearance."""
    as_series = pd.Series(coords)
    return pd.unique(as_series)

# Store both coordinate lists on all three frames.
power['powered_poly'] = powered_poly
out['powered_poly'] = powered_poly
spark_day['powered_poly'] = powered_poly
out['outage_poly'] = outage_poly
power['outage_poly'] = outage_poly
spark_day['outage_poly'] = outage_poly
# Must be a dict. The original `{'init', 'epsg:4326'}` (comma instead of colon)
# built a set, which is not a CRS definition.
crs = {'init': 'epsg:4326'}


def _coords_to_geometry(coords):
    """Build the shapely geometry used for convex-hull tests.

    Three or more coordinates (or none at all) become a Polygon, as before.
    One or two coordinates cannot form a Polygon ring, so they become a
    MultiPoint instead of raising; its convex hull is still well defined.
    """
    coords = list(coords)
    if len(coords) >= 3 or not coords:
        return geometry.Polygon(coords)
    return geometry.MultiPoint(coords)


# `power` gets the readings-in-window geometry, `out` the outage geometry.
powered_poly = [_coords_to_geometry(x) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=powered_poly)

outage_poly = [_coords_to_geometry(x) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=outage_poly)

# De-duplicate the stored coordinate lists. `.apply` is used rather than
# np.vectorize, which infers its output dtype from the first row and fails on
# an empty frame or when the first row holds an empty list.
for frame in (power, out, spark_day):
    for coord_col in ('powered_poly', 'outage_poly'):
        frame[coord_col] = frame[coord_col].apply(unique_coords)

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull

out.head()

In [64]:
#calculate the convex hull
def in_convex_hull(powered_coords, geom):
    """Return the unique coordinates that lie inside the convex hull of ``geom``.

    powered_coords: iterable of coordinate pairs accepted by ``geometry.Point``.
    geom: shapely geometry; its ``convex_hull`` is tested with ``contains``.
    Returns an array of the distinct coordinates for which the test is true.
    """
    hull = geom.convex_hull  # loop-invariant, so build it once
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    # dtype=object keeps an empty result well-defined.
    return pd.Series(inside, dtype=object).unique()


# The result gets its own name. The original assigned it to `in_convex_hull`,
# replacing the function with a list so the cell could not be run twice.
powered_within_outage = [
    in_convex_hull(coords, geom)
    for coords, geom in zip(out['powered_poly'].values, out['geometry'].values)
]
out['powered_within_outage'] = powered_within_outage
out

In [92]:
#plot size of outage vs. % out at within the convex hull 

def outage_size(outage_coords):
    """Return how many distinct entries ``outage_coords`` contains."""
    return pd.Series(outage_coords).nunique(dropna=False)

# Number of distinct readings found inside each outage's convex hull, and their
# share of (those readings + the sensors in the outage), in percent.
# `.apply` instead of np.vectorize, which fails on an empty frame.
out['powered_size_within_outage_area'] = out['powered_within_outage'].apply(outage_size)
out['percent_pow_within_outage'] = (out['powered_size_within_outage_area'] / (out['powered_size_within_outage_area'] + out['cluster_size']))*100

plt.figure(figsize=(10,10))
sns.scatterplot(x='cluster_size', y='percent_pow_within_outage', data=out)
plt.title('Agglomerative: Number of Sensors in Outage vs. Percent of Sensors Experiencing Outage Within the Convex Hull of the Outage')
plt.xlabel('Number of Sensors in an Outage')
plt.ylabel('Percentage of Sensors Powered within Convex Hull')

# Keep this plot's limits under their own names. The original stored them in
# left/right/top/bottom, overwriting the longitude/latitude limits that the map
# cells reuse via plt.xlim((left, right)) / plt.ylim((top, bottom)).
pct_left, pct_right = plt.xlim()
pct_top, pct_bottom = plt.ylim()
#pick these points out and plot them

In [94]:
# Display the y-axis limits currently stored in `top` and `bottom`.
top, bottom

In [75]:
#now let's make some new dataframes so that we can calculate the distances between the sensors of 2-3 that were clustered together 
# Relies on spark_outages['location'] already holding lists of coordinate
# strings, i.e. on the data-cleaning cell having parsed them in place.
t = spark_outages[spark_outages['cluster_size'] <= 3]
# One row per individual coordinate, converted to float.
explode_loc = t.explode('location')
explode_loc['location'] = explode_loc['location'].apply(lambda x: float(x))
# Values > 1 are taken to be latitudes and values < 1 longitudes.
# NOTE(review): this split presumably only works for the study area -- confirm.
lat = explode_loc[explode_loc['location'] > 1]
long = explode_loc[explode_loc['location'] < 1]
lat['latitude'] = lat['location']
# From here on `t` has one row per sensor latitude.
t = lat[['outage_time', 'outage_times', 'cluster_size', 'latitude']]
# Longitudes were parsed without their sign, hence the negation.
# NOTE(review): explode repeats the index label of each source row, so this
# assignment depends on `lat` and `long` carrying identical index sequences
# (same number of latitudes and longitudes per cluster, in the same order).
t['longitude'] = long['location']*(-1)

def calc_dist(dist1, dist2):
    """Project two geometries from EPSG:4326 to 'aea' and return their distance.

    Both inputs are re-projected with the same pyproj transform
    (``Proj(init='EPSG:4326')`` -> ``Proj(proj='aea')``); the result is
    ``distance`` between the projected geometries (meters, per the author).
    """
    to_aea = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(proj='aea'))
    projected_1 = ops.transform(to_aea, dist1)
    projected_2 = ops.transform(to_aea, dist2)
    return projected_1.distance(projected_2)

#distances for sensors of cluster size 2
# The points are built from longitude/latitude, and calc_dist projects FROM
# EPSG:4326, so every GeoSeries is tagged with that CRS. The original tagged
# them {'init':'aea'}, which is not a valid 'init' code and misdescribed the data.
LONLAT_CRS = {'init': 'epsg:4326'}

pairs = t[t['cluster_size'] == 2]
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.longitude, pairs.latitude), crs=LONLAT_CRS)
# First and last sensor of every pair, keyed by the cluster's outage_time.
dist_1 = gpd.GeoSeries(gdf.groupby('outage_time')['geometry'].first(), crs=LONLAT_CRS)
dist_2 = gpd.GeoSeries(gdf.groupby('outage_time')['geometry'].last(), crs=LONLAT_CRS)
# Plain comprehension instead of np.vectorize over geometry objects; the result
# is still a float ndarray (and is simply empty when there are no pairs).
distances = np.array([calc_dist(p, q) for p, q in zip(dist_1, dist_2)], dtype=float)

#distances for sensors of cluster size 3
trios = t[t['cluster_size'] == 3]
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.longitude, trios.latitude), crs=LONLAT_CRS)
# First, second and last sensor of every trio.
dist_1_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].first(), crs=LONLAT_CRS)
dist_2_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].nth(1), crs=LONLAT_CRS)
dist_3_t = gpd.GeoSeries(gdf_trios.groupby('outage_time')['geometry'].last(), crs=LONLAT_CRS)

# The three pairwise edges of every trio.
trios_distances = pd.DataFrame({
    '1->2': [calc_dist(p, q) for p, q in zip(dist_1_t, dist_2_t)],
    '2->3': [calc_dist(p, q) for p, q in zip(dist_2_t, dist_3_t)],
    '3->1': [calc_dist(p, q) for p, q in zip(dist_3_t, dist_1_t)],
}, dtype=float)
#only one of the columns had all trios within 550 m so I took the average to see that 6/10 are below an average distance of 550
trios_distances['avg'] = (trios_distances['2->3'] + trios_distances['1->2'] + trios_distances['3->1'])/3
trios_distances

In [84]:
# Pool all three edges of every trio into one list and compare with the 550 cutoff.
TRIO_EDGES = ('1->2', '2->3', '3->1')
dist_for_3 = [dist for edge in TRIO_EDGES for dist in trios_distances[edge].values]
plt.hist(dist_for_3, bins=20)
plt.axvline(x=550, label='550m cutoff', c='r')
plt.title('Agglomerative: Distances between sensors that are clustered as trios')
plt.legend()

In [85]:
# Average edge length per trio, with the 550 cutoff marked in red.
trio_averages = trios_distances['avg']
plt.hist(trio_averages, bins=15)
plt.axvline(x=550, label='550m cutoff', c='r')
plt.title('Agglomerative: Average distance between sensors that are clustered as trios')
plt.legend()

In [88]:
# Distance between the two sensors of every pair, with the 550 cutoff marked in red.
pair_distances = distances
plt.hist(pair_distances)
plt.axvline(x=550, label='550m cutoff', c='r')
plt.title('Agglomerative: Distances between sensors that are clustered as pairs')
plt.legend()

In [89]:
#now let's calculate the percentage to the left of the red line for each of the graphs:
#percentage under the cutoff for pairs:
# (the value is a fraction between 0 and 1, not multiplied by 100)
pairs_under_cutoff = np.count_nonzero(distances < 550)
pairs_under_cutoff / len(distances)

In [90]:
#percentage under the cutoff for trios:
# (the value is a fraction between 0 and 1, not multiplied by 100)
dist_for_3 = np.array(dist_for_3)
np.count_nonzero(dist_for_3 < 550) / len(dist_for_3)

In [91]:
#percentage under the cutoff for average trio distance:
# (the value is a fraction between 0 and 1, not multiplied by 100)
trios_under_cutoff = int((trios_distances['avg'] < 550).sum())
trios_under_cutoff / len(trios_distances)

In [99]:
#using DBSCAN only with time
out_df = outages[['outage_time', 'location_latitude', 'location_longitude']]
# 172800 = 2 * 86400: a two-day window if outage_time is in seconds (presumably epoch seconds).
# .copy() so the columns added below (and 'labels' in the next cell) are written
# to an independent frame instead of a slice of `out_df`.
day = out_df[out_df['outage_time'] <= out_df['outage_time'].min() + 172800].copy()
# Constant second feature: only the time dimension actually varies.
day['zeros'] = 0
out_cluster = StandardScaler().fit_transform(day[['outage_time', 'zeros']])
db = DBSCAN(eps=.07, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_
# Label -1 is treated as noise; it is excluded from the cluster count.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_, n_clusters_

In [100]:
# Attach the time-only DBSCAN labels and split the frame into one frame per cluster (-1 = noise).
day['labels'] = labels
day_0, day_1, day_2, day_3, day_4 = (day[day['labels'] == k] for k in range(5))
day_unlabeled = day[day['labels'] == -1]
labels

In [101]:
# Map of the time-only DBSCAN clusters, noise in yellow, drawn on the axis
# limits currently stored in left/right/top/bottom.
plt.figure(figsize=(10, 10))
plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y', label='noise')
for cluster_id, members in enumerate((day_0, day_1, day_2, day_3, day_4)):
    plt.scatter(members['location_longitude'], members['location_latitude'],
                label='cluster {}'.format(cluster_id))
plt.title('Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.xlim((left, right))
plt.ylim((top, bottom))

In [102]:
#now let's compare that with all the points in cluster 0
# Offsets are measured from the first row of the cluster, which is shown last.
cluster0_times = day.loc[day['labels'] == 0, 'outage_time'].values
time0 = cluster0_times[0]
for other_time in cluster0_times[1:]:
    print(time0 - other_time)
time0

In [103]:
#now let's compare that with all the points in cluster 1
# Offsets are measured from the first row of the cluster, which is shown last.
cluster1_times = day.loc[day['labels'] == 1, 'outage_time'].values
time1 = cluster1_times[0]
for other_time in cluster1_times[1:]:
    print(time1 - other_time)
time1

In [104]:
#now let's compare that with all the points in cluster 2
# Offsets are measured from the first row of the cluster, which is shown last.
cluster2_times = day.loc[day['labels'] == 2, 'outage_time'].values
time2 = cluster2_times[0]
for other_time in cluster2_times[1:]:
    print(time2 - other_time)
time2

In [105]:
# Same comparison for cluster 3: offsets from the cluster's first row, which is shown last.
cluster3_times = day.loc[day['labels'] == 3, 'outage_time'].values
time3 = cluster3_times[0]
for other_time in cluster3_times[1:]:
    print(time3 - other_time)
time3

In [106]:
# Same comparison for cluster 4: offsets from the cluster's first row, which is shown last.
cluster4_times = day.loc[day['labels'] == 4, 'outage_time'].values
time4 = cluster4_times[0]
for other_time in cluster4_times[1:]:
    print(time4 - other_time)
time4

In [107]:
# SAIFI-style ratio for the time-only DBSCAN run:
#   clustered outage rows / (distinct sensors * number of clusters)
clustered_day = day[day['labels'] != -1]
db_SAIFI_num = len(clustered_day)
# Derive the cluster count from the labels. The original hard-coded 3, which
# goes stale whenever eps or the data change, and differs from how the OPTICS
# and agglomerative ratios obtain their cluster count.
db_n_clusters = clustered_day['labels'].nunique()
db_SAIFI_denom = len(pw_df['core_id'].unique()) * db_n_clusters
# No clusters (or no sensors) means the ratio is undefined, not a ZeroDivisionError.
db_SAIFI = db_SAIFI_num / db_SAIFI_denom if db_SAIFI_denom else float('nan')
db_SAIFI

In [108]:
# Preview the two-day frame with its DBSCAN 'labels' column.
day.head()

In [111]:
def find_range(lst):
    """Return the spread of ``lst``: its largest element minus its smallest.

    Same definition as the ``find_range`` declared earlier in this notebook.
    """
    smallest, largest = min(lst), max(lst)
    return largest - smallest

# Per-cluster lists of outage times / latitudes / longitudes for the two-day
# time-only DBSCAN run, plus the spread (max - min) of each.
labelled_groups = day.groupby(['labels'])
days_clust = pd.DataFrame({
    'outage_times': labelled_groups['outage_time'].apply(list),
    'latitude': labelled_groups['location_latitude'].apply(list),
    'longitude': labelled_groups['location_longitude'].apply(list),
})
# Drop the noise group by its label. The original `iloc[1:]` dropped whatever
# group sorted first, i.e. cluster 0 whenever the run produced no noise points.
days_clust = days_clust.drop(index=-1, errors='ignore')

# `.apply` instead of np.vectorize, which raises on an empty frame and takes
# its output dtype from the first row only.
days_clust['time_range'] = days_clust['outage_times'].apply(find_range)
days_clust['lat_range'] = days_clust['latitude'].apply(find_range)
days_clust['long_range'] = days_clust['longitude'].apply(find_range)

days_clust.head()