In [1]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import time 
import datetime
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics 
import re
import pylab
from scipy.stats import norm

In [2]:
# Load the outage table from its parquet export and preview the first rows.
outages_path = 'part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet'
outages = pd.read_parquet(outages_path)
outages.head()

In [3]:
# Cluster the first 48 hours of outages with DBSCAN using outage time only.
out_df = outages[['location_latitude', 'location_longitude', 'outage_time']]
# 172800 s = 48 h after the earliest outage. Series.min() skips NaN, unlike the
# builtin min(). .copy() makes `day` an independent frame so the column
# assignments below do not write into a slice of out_df.
window_end = out_df['outage_time'].min() + 172800
day = out_df[out_df['outage_time'] <= window_end].copy()
# Constant second feature: the clustering input is 2-D but only varies in time.
day['zeros'] = 0
out_cluster = day[['outage_time', 'zeros']]
out_cluster = StandardScaler().fit_transform(out_cluster)
#out_cluster = pd.DataFrame(out_cluster, columns=['outage_time', 'location_latitude', 'location_longitude'])
#out_cluster['outage_time'] = out_cluster['outage_time']*7
#out_cluster['location_longitude'] = out_cluster['location_longitude']*2
db = DBSCAN(eps=.04, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_
# Label -1 marks noise; it is counted separately and excluded from the cluster count.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_, n_clusters_

In [4]:
# Attach each row's DBSCAN label, then build one frame per cluster id 0-4
# plus one for the noise label (-1). A cluster id that does not occur
# simply yields an empty frame.
day['labels'] = labels
day_0, day_1, day_2, day_3, day_4 = (
    day[day['labels'] == cluster_id] for cluster_id in range(5)
)
day_unlabeled = day[day['labels'] == -1]
labels

In [5]:
# Map view of the time-based clusters: noise in yellow, clusters 0-4 in the
# default colour cycle.
plt.figure(figsize=(10,10))
plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y',label='noise')
for cluster_id, cluster_df in enumerate([day_0, day_1, day_2, day_3, day_4]):
    plt.scatter(
        cluster_df['location_longitude'],
        cluster_df['location_latitude'],
        label='cluster %d' % cluster_id,
    )
plt.title('Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
# Save the axis limits so later plots can reuse the same extent.
# plt.ylim() returns (bottom, top), so `top` holds the lower limit here; the
# pair is passed back in the same order later, so the limits round-trip.
left, right = plt.xlim()
top, bottom = plt.ylim()

In [6]:
# Cluster 0: print the first member's outage_time minus each other member's.
cluster_0_times = day.loc[day['labels'] == 0, 'outage_time'].values
time0 = cluster_0_times[0]
for member_time in cluster_0_times[1:]:
    print(time0 - member_time)

In [7]:
# Cluster 1: print the first member's outage_time minus each other member's.
cluster_1_times = day.loc[day['labels'] == 1, 'outage_time'].values
time1 = cluster_1_times[0]
for member_time in cluster_1_times[1:]:
    print(time1 - member_time)

In [8]:
# Cluster 2: print the first member's outage_time minus each other member's.
cluster_2_times = day.loc[day['labels'] == 2, 'outage_time'].values
time2 = cluster_2_times[0]
for member_time in cluster_2_times[1:]:
    print(time2 - member_time)

In [9]:
# Cluster 3: print the first member's outage_time minus each other member's.
cluster_3_times = day.loc[day['labels'] == 3, 'outage_time'].values
time3 = cluster_3_times[0]
for member_time in cluster_3_times[1:]:
    print(time3 - member_time)

In [10]:
# Offset of every noise point (label -1) from the first noise point:
# first outage_time minus the point's outage_time.
# .copy() keeps the new column from being written into a slice of `day`.
noise = day[day['labels'] == -1].copy()
# Filter once and reuse the array instead of re-filtering `day` on every iteration.
noise_times = noise['outage_time'].values
count = len(noise_times[1:])
if len(noise_times):
    first_noise_time = noise_times[0]
    # The reference point itself gets offset 0.
    noise_dist = [0] + [first_noise_time - t for t in noise_times[1:]]
else:
    # No noise points: keep the list the same length as the (empty) frame.
    noise_dist = []
noise['noise_dist'] = noise_dist
noise_dist

In [11]:
# Plot the noise points that share one specific offset from the first noise
# point, on the same axis extent as the cluster map.
# NOTE(review): 105081 looks like a value read off the noise_dist output of the
# previous cell — confirm, since it is specific to this dataset and eps.
selected_offset = 105081
ex = noise.loc[noise['noise_dist'] == selected_offset]
plt.scatter(ex['location_longitude'], ex['location_latitude'])
plt.xlim((left, right))
plt.ylim((top, bottom))

In [42]:
# One row per DBSCAN label (index = label, so the noise label -1 sorts first).
# Start from the per-label means, then replace the coordinate columns with the
# full per-label lists and add the list of outage times and their std dev.
by_label = day.groupby('labels')
db_clusters = by_label.mean().drop(columns='zeros')
for target_col, source_col in (
    ('location_latitude', 'location_latitude'),
    ('location_longitude', 'location_longitude'),
    ('outage_times', 'outage_time'),
):
    db_clusters[target_col] = by_label[source_col].apply(lambda group: group.to_list())
db_clusters['stddev'] = db_clusters['outage_times'].apply(lambda times: np.std(times))
db_clusters
#now eliminate the first row and try to plot norm.pdf 

In [55]:
# Distribution of outage times within DBSCAN clusters 2, 1 and 0.
# db_clusters is indexed by cluster label, so each cluster is looked up by that
# label. The previous positional .values[k + 1] only matched the legend text
# when a noise row (-1) happened to occupy position 0.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so
# the figure stays the same.
for cluster_label in (2, 1, 0):
    sns.distplot(db_clusters.loc[cluster_label, 'outage_times'], label='cluster_%d' % cluster_label)
plt.legend()
plt.title('DBSCAN cluster distributions')

In [136]:
# Load the second parquet export (read into spark_outages) and preview it.
spark_outages_path = 'part-00000-1a77f616-ace0-482c-9ad1-bdc53a8286bc-c000.gz.parquet'
spark_outages = pd.read_parquet(spark_outages_path)
spark_outages.head()
 

In [137]:
#reading outages from the pw_finalized_with_string dataframe from outage_aggregator.py and doing some data cleaning 
# Keep only clusters with more than one outage.
spark_outages = spark_outages[spark_outages['cluster_size'] > 1]
# Same 48 h (172800 s) window as the DBSCAN cells. The window start is taken
# from out_df, so a cell that defines out_df must have run first.
# .copy() keeps the column rewrites below from writing into a slice.
window_end = out_df['outage_time'].min() + 172800
spark_day = spark_outages[spark_outages['outage_time'] <= window_end].copy()
# Both columns hold strings; pull the numbers out of them. Raw strings avoid the
# invalid '\d' escape, and the decimal point is escaped so that '.' no longer
# matches an arbitrary character.
spark_day['outage_times'] = spark_day['outage_times'].apply(lambda x: re.findall(r'\d+', x))
spark_day['location'] = spark_day['location'].apply(lambda x: re.findall(r'\d+\.\d+', x))
# One row per individual outage time, converted to int.
spark_day_exploded = spark_day.explode('outage_times')
spark_day_exploded['outage_times'] = spark_day_exploded['outage_times'].apply(int)
# Re-collect the integer times into one list per cluster, ordered by outage_time.
unexploded = (
    spark_day_exploded
    .groupby('outage_time')['outage_times']
    .apply(lambda x: x.to_list())
    .reset_index()
    .sort_values('outage_time')
)
# Positional assignment: assumes every spark_day row has a distinct outage_time,
# so both sides have the same length and order — TODO confirm.
unexploded['location'] = spark_day.sort_values('outage_time')['location'].values

In [138]:

# Display the per-cluster frame: outage_time, list of outage_times, list of location values.
unexploded

In [139]:
#more data cleaning 
# Rebuild per-cluster latitude and longitude lists from the flat `location` values.
explode_loc = unexploded.explode('location')
explode_loc['location'] = explode_loc['location'].apply(float)
# Values above 1 are treated as latitudes, values below 1 as longitudes.
# .copy() keeps the new columns from being written into slices of explode_loc.
lat = explode_loc[explode_loc['location'] > 1].copy()
long = explode_loc[explode_loc['location'] < 1].copy()
lat['latitude'] = lat['location']
# Longitudes are negated — presumably restoring the minus sign that the
# digit-only regex dropped earlier; confirm against the raw location strings.
long['longitude'] = long['location']*(-1)
lat = lat.groupby('outage_time')['latitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')
# Series assignments below align on the 0..n-1 index produced by reset_index();
# the .values assignments are positional. Both assume one row per outage_time
# in the same sorted order on each side.
lat['longitude'] = long.groupby('outage_time')['longitude'].apply(lambda x: x.to_list()).reset_index().sort_values('outage_time')['longitude']
lat['cluster_size'] = spark_day.sort_values('outage_time')['cluster_size'].values
lat['outage_times_stddev'] = spark_day.sort_values('outage_time')['outage_times_stddev'].values
lat['outage_times'] = unexploded.sort_values('outage_time')['outage_times']
# From here on spark_outages is the cleaned per-cluster frame, not the raw parquet frame.
spark_outages = lat
spark_outages

In [140]:
# Cluster size against cluster outage_time, one point per exploded outage row.
sns.scatterplot(x='outage_time', y='cluster_size', data=spark_day_exploded)

In [150]:
# Distribution of outage times for rows 6 down to 1 of spark_outages, drawn
# in descending order so the colour assignment matches the original figure.
for row_pos in range(6, 0, -1):
    sns.distplot(spark_outages['outage_times'].values[row_pos], label='cluster_%d' % row_pos)
# sns.distplot(unexploded['outage_times'].values[0], label='cluster_0')
plt.legend()
plt.title('Agglomerative Time Clustering Distributions')
plt.xlabel('outage time distribution (sec)')
plt.ylabel('relative cluster size')

In [151]:
# Distribution of outage times within DBSCAN clusters 2, 1 and 0.
# db_clusters is indexed by cluster label, so each cluster is looked up by that
# label. The previous positional .values[k + 1] only matched the legend text
# when a noise row (-1) happened to occupy position 0.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so
# the figure stays the same.
for cluster_label in (2, 1, 0):
    sns.distplot(db_clusters.loc[cluster_label, 'outage_times'], label='cluster_%d' % cluster_label)
plt.legend()
plt.title('DBSCAN cluster distributions')
plt.xlabel('outage time distribution (sec)')
plt.ylabel('relative cluster size')

In [157]:
# Map view of the first seven agglomerative time clusters. Each row of
# spark_outages holds the lists of longitudes and latitudes for one cluster.
plt.figure(figsize=(10,10))
# plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y',label='noise')
# The legend label is derived from the row position, so rows 5 and 6 no longer
# repeat the "cluster 3" / "cluster 4" labels.
for cluster_pos in range(7):
    plt.scatter(
        spark_outages['longitude'].values[cluster_pos],
        spark_outages['latitude'].values[cluster_pos],
        label='cluster %d' % cluster_pos,
    )
plt.title('Agglomerative Time Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
# Save the axis limits so the DBSCAN map can be drawn on the same extent.
# plt.ylim() returns (bottom, top), so `top` holds the lower limit here; the
# pair is passed back in the same order later, so the limits round-trip.
left, right = plt.xlim()
top, bottom = plt.ylim()

In [158]:
# DBSCAN clusters 0-2 on the axis extent saved in left/right/top/bottom,
# so this map can be compared with another plot drawn on the same limits.
plt.figure(figsize=(10,10))
# plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y',label='noise')
for cluster_id, cluster_df in enumerate([day_0, day_1, day_2]):
    plt.scatter(
        cluster_df['location_longitude'],
        cluster_df['location_latitude'],
        label='cluster %d' % cluster_id,
    )
#plt.scatter(day_3['location_longitude'], day_3['location_latitude'], label='cluster 3')
#plt.scatter(day_4['location_longitude'], day_4['location_latitude'], label='cluster 4')
plt.title('DBSCAN Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.xlim((left, right))
plt.ylim((top, bottom))

In [221]:
#using DBSCAN only with time 
# Same 48 h window as before, clustered with a wider eps (0.1).
out_df = outages[['outage_time', 'location_latitude', 'location_longitude']]
# 172800 s = 48 h after the earliest outage. Series.min() skips NaN, unlike the
# builtin min(). .copy() makes `day` an independent frame so the column
# assignments below do not write into a slice of out_df.
window_end = out_df['outage_time'].min() + 172800
day = out_df[out_df['outage_time'] <= window_end].copy()
# Constant second feature: the clustering input is 2-D but only varies in time.
day['zeros'] = 0
out_cluster = StandardScaler().fit_transform(day[['outage_time', 'zeros']])
db = DBSCAN(eps=.1, algorithm='ball_tree').fit(out_cluster)
labels = db.labels_
# Label -1 marks noise; it is counted separately and excluded from the cluster count.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_, n_clusters_

In [223]:
# Attach the labels from the eps=0.1 run and rebuild the per-cluster frames
# (ids 0-4) and the noise frame (label -1).
day['labels'] = labels
day_0, day_1, day_2, day_3, day_4 = (
    day[day['labels'] == cluster_id] for cluster_id in range(5)
)
day_unlabeled = day[day['labels'] == -1]
labels

In [224]:
# Map view of the current labelling: noise in yellow, clusters 0-4 in the
# default colour cycle.
plt.figure(figsize=(10,10))
plt.scatter(day_unlabeled['location_longitude'], day_unlabeled['location_latitude'], c='y',label='noise')
for cluster_id, cluster_df in enumerate([day_0, day_1, day_2, day_3, day_4]):
    plt.scatter(
        cluster_df['location_longitude'],
        cluster_df['location_latitude'],
        label='cluster %d' % cluster_id,
    )
plt.title('Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')

In [225]:
# Cluster 0: print the first member's outage_time minus each other member's,
# then display the reference time itself.
cluster_0_times = day.loc[day['labels'] == 0, 'outage_time'].values
time0 = cluster_0_times[0]
for member_time in cluster_0_times[1:]:
    print(time0 - member_time)
time0

In [226]:
# Cluster 1: print the first member's outage_time minus each other member's,
# then display the reference time itself.
cluster_1_times = day.loc[day['labels'] == 1, 'outage_time'].values
time1 = cluster_1_times[0]
for member_time in cluster_1_times[1:]:
    print(time1 - member_time)
time1

In [227]:
# Cluster 2: print the first member's outage_time minus each other member's,
# then display the reference time itself.
cluster_2_times = day.loc[day['labels'] == 2, 'outage_time'].values
time2 = cluster_2_times[0]
for member_time in cluster_2_times[1:]:
    print(time2 - member_time)
time2

In [2]:
# Load the sensor readings and keep time plus coordinates for the selected rows.
pw_path = 'part-00000-49a36603-9035-47f4-b73e-eae8d28aa10a-c000.gz.parquet'
pw = pd.read_parquet(pw_path)
pw.head()
# NOTE(review): the frame is named `outage` but keeps rows where is_powered is
# True — confirm this is the intended selection.
outage = pw.loc[pw['is_powered'] == True, ['time', 'location_latitude', 'location_longitude']]
outage.head()

In [3]:
# Rows before 2018-07-01 00:02. .copy() makes jul_1 an independent frame so
# rewriting its `time` column does not write into a slice of `outage`.
jul_1 = outage[outage['time'] < datetime.datetime(2018, 7, 1, 0, 2)].copy()
# Tag each timestamp as UTC and convert it to epoch seconds (float).
jul_1['time'] = jul_1['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())
len(jul_1)

In [4]:
# expanded = pw.join(pd.DataFrame(pw_time['time'].values.tolist(), columns=['year','month', 'day', 'hour', 'min', 'sec', 'sec1', 'sec2', 'sec3'], index=pw_time.index))
# expanded = expanded[['powered_longitude', 'powered_latitude', 'outage_longitude', 'outage_latitude', 'year','month', 'day', 'hour', 'min', 'sec']]
# expanded

In [16]:
# DBSCAN on standardized (time, latitude, longitude).
# The feature columns are selected explicitly: a later cell adds a `labels`
# column to jul_1, and passing the whole frame would feed those labels back in
# as a fourth feature whenever this cell is re-run.
feature_cols = ['time', 'location_latitude', 'location_longitude']
pw_cluster = StandardScaler().fit_transform(jul_1[feature_cols])
db = DBSCAN(eps=.65, algorithm='ball_tree').fit(pw_cluster)
labels = db.labels_
# Label -1 marks noise; it is counted separately and excluded from the cluster count.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_, n_clusters_


In [17]:
# Display the raw DBSCAN label array (one label per clustered row; -1 is counted as noise).
labels

In [18]:
# Summary of the DBSCAN run on pw_cluster.
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# The supervised metrics below need ground-truth labels (labels_true), which
# are not defined in the visible cells, so they stay disabled.
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(labels_true, labels,
#                                            average_method='arithmetic'))
# silhouette_score raises unless there are between 2 and n_samples - 1 distinct
# labels. The noise label -1 is passed through and counts as one of them.
n_distinct_labels = len(set(labels))
if 2 <= n_distinct_labels <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(pw_cluster, labels))
else:
    print("Silhouette Coefficient: undefined (needs 2 to n_samples - 1 distinct labels, got %d)"
          % n_distinct_labels)

In [19]:
#check with raw data of outages and times that they went out and then cluster based on outages. then see which ones seem to be the most reliably clustering 
# Attach the current `labels` array to jul_1 as a new column and preview the result.
# Note: this adds a column to jul_1 in place, so jul_1 no longer holds only feature columns afterwards.
jul_1['labels'] = labels
jul_1.head()

In [20]:
# One frame per cluster id 0-9 plus one for the noise label (-1).
# A cluster id that does not occur simply yields an empty frame.
(
    jul_1_0, jul_1_1, jul_1_2, jul_1_3, jul_1_4,
    jul_1_5, jul_1_6, jul_1_7, jul_1_8, jul_1_9,
) = (jul_1[jul_1['labels'] == cluster_id] for cluster_id in range(10))
jul_1_unlabeled = jul_1[jul_1['labels'] == -1]

In [21]:
# plt.scatter(pw_time['powered_longitude'], pw_time['powered_latitude'], c='b', label='powered')
# Map view of the jul_1 clusters: noise in yellow, clusters 0-9 in the default
# colour cycle.
plt.figure(figsize=(10,10))
plt.scatter(jul_1_unlabeled['location_longitude'], jul_1_unlabeled['location_latitude'], c='y',label='noise')
jul_1_frames = [
    jul_1_0, jul_1_1, jul_1_2, jul_1_3, jul_1_4,
    jul_1_5, jul_1_6, jul_1_7, jul_1_8, jul_1_9,
]
for cluster_id, cluster_df in enumerate(jul_1_frames):
    plt.scatter(
        cluster_df['location_longitude'],
        cluster_df['location_latitude'],
        label='cluster %d' % cluster_id,
    )
plt.title('Clustered Outages on 7/1/18 00:00:00 - 00:02:00')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')

In [22]:
# Display the `time` values of the rows whose latitude is above 5.66.
jul_1[jul_1['location_latitude'] > 5.66]['time'].values

In [146]:
# To examine a difference in classification, zoom in on the rows with latitude
# above 5.66 and print the absolute time gap between the first of them and
# each of the others.
north_times = jul_1.loc[jul_1['location_latitude'] > 5.66, 'time'].values
a_time = north_times[0]
for other_time in north_times[1:]:
    print(abs(a_time - other_time))

In [174]:
# Label 2: print the first member's time minus each other member's (signed).
# The reference is stored in time1, a name that is also used for cluster 1 of the `day` frame.
label_2_times = jul_1.loc[jul_1['labels'] == 2, 'time'].values
time1 = label_2_times[0]
for member_time in label_2_times[1:]:
    print(time1 - member_time)

In [148]:
# Label 3: print the absolute time gap between the first member and each other member.
# NOTE(review): the original comment and the variable name (time4) refer to
# cluster 4, but the filter selects label 3 — confirm which one was intended.
label_3_times = jul_1.loc[jul_1['labels'] == 3, 'time'].values
time4 = label_3_times[0]
for member_time in label_3_times[1:]:
    print(abs(time4 - member_time))

In [149]:
# Label 0: print the absolute time gap between the first member and each other member.
# NOTE(review): the original comment refers to cluster 1, but the filter
# selects label 0 — confirm which one was intended.
label_0_times = jul_1.loc[jul_1['labels'] == 0, 'time'].values
time0 = label_0_times[0]
for member_time in label_0_times[1:]:
    print(abs(time0 - member_time))

In [89]:
# Rows with latitude below 5.60: print the first row's time minus each other
# row's time (signed).
south_times = jul_1.loc[jul_1['location_latitude'] < 5.60, 'time'].values
b_time = south_times[0]
for other_time in south_times[1:]:
    print(b_time - other_time)

In [13]:
# Overlay powered sensor locations (blue) and outage locations (red).
# NOTE(review): pw_time is not defined in any visible cell (only referenced in
# commented-out code), so this cell raises NameError on a fresh kernel unless
# pw_time is created elsewhere. It is expected to have the columns
# powered_longitude/powered_latitude and outage_longitude/outage_latitude.
plt.scatter(pw_time['powered_longitude'], pw_time['powered_latitude'], c='b', label='powered')
plt.scatter(pw_time['outage_longitude'], pw_time['outage_latitude'], c='r', label='outage')
plt.title('Powered Sensors vs Outages')
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')