In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timezone
from datetime import timedelta
import pyproj
import seaborn as sns
import geopandas as gpd 
import shapely.geometry as geometry
import shapely.ops as ops 
from functools import partial 

In [2]:
#run this to initiate the STDBSCAN class 

class STDBSCAN(object):
    """Spatio-temporal DBSCAN over the rows of a pandas DataFrame.

    Two rows are neighbors when their time stamps differ by at most
    ``temporal_threshold`` seconds AND their planar (Euclidean) distance,
    computed directly from the two coordinate columns, is at most
    ``spatial_threshold``.
    """

    def __init__(self, col_lat, col_lon, col_time, spatial_threshold=500.0,
                 temporal_threshold=60.0, min_neighbors=15):
        """
        Python st-dbscan implementation.
        :param col_lat: Latitude column name;
        :param col_lon:  Longitude column name;
        :param col_time: Date time column name;
        :param spatial_threshold: Maximum spatial distance, expressed in the
             units of the coordinate columns (meters once the frame has been
             passed through ``projection``);
        :param temporal_threshold: Maximum non-spatial distance value (seconds);
        :param min_neighbors: Minimum number of points within Eps1 and Eps2
             distance;
        """
        self.col_lat = col_lat
        self.col_lon = col_lon
        self.col_time = col_time
        self.spatial_threshold = spatial_threshold
        self.temporal_threshold = temporal_threshold
        self.min_neighbors = min_neighbors

    def projection(self, df, p1_str='epsg:4326', p2_str='epsg:3395'):
        """
        Convert geographic (longitude, latitude) coordinates to native map
        projection (x, y) coordinates. The right epsg codes must be selected
        by the caller.

        The coordinate columns of ``df`` are overwritten in place and the
        same frame is returned.

        :param df: frame holding the configured latitude/longitude columns;
        :param p1_str: source projection code;
        :param p2_str: target projection code;
        :return: ``df`` with projected coordinate columns.
        """
        p1 = pyproj.Proj(init=p1_str)
        p2 = pyproj.Proj(init=p2_str)
        lon = df[self.col_lon].values
        lat = df[self.col_lat].values
        x1, y1 = p1(lon, lat)
        # NOTE(review): `Proj(init=...)` and `pyproj.transform` are deprecated
        # in pyproj 2+, and whether `radians=True` is correct depends on what
        # `p1(lon, lat)` returns in the installed version -- confirm before
        # re-enabling the projection step.
        x2, y2 = pyproj.transform(p1, p2, x1, y1, radians=True)
        df[self.col_lon] = x2
        df[self.col_lat] = y2
        return df

    def _retrieve_neighbors(self, index_center, matrix):
        """Return the row indices that are neighbors of ``index_center``.

        ``matrix`` columns are read positionally: 0 and 1 are the two
        coordinates, 2 is the time stamp and 4 is the row index. The center
        point itself is never part of the result.
        """
        center_point = matrix[index_center, :]

        # filter by time
        window = timedelta(seconds=self.temporal_threshold)
        min_time = center_point[2] - window
        max_time = center_point[2] + window
        in_window = (matrix[:, 2] >= min_time) & (matrix[:, 2] <= max_time)
        candidates = matrix[in_window, :]

        # filter by distance (squared on both sides, so no sqrt is needed)
        dx = candidates[:, 0] - center_point[0]
        dy = candidates[:, 1] - center_point[1]
        squared_dist = dx * dx + dy * dy
        max_squared = self.spatial_threshold * self.spatial_threshold
        within_reach = candidates[squared_dist <= max_squared, 4].tolist()

        # Filtering (rather than list.remove) also copes with a center whose
        # coordinates or time are NaN/NaT and therefore fail both filters.
        return [idx for idx in within_reach if idx != index_center]

    def run(self, df):
        """
        INPUTS:
            df={o1,o2,...,on} Set of objects;
        OUTPUT:
            C = {c1,c2,...,ck} Set of clusters

        Returns a new frame holding the longitude, latitude and time columns
        plus an integer ``cluster`` column (``-1`` marks noise, clusters are
        numbered from 1). The input frame is not modified.
        """
        cluster_label = 0
        noise = -1
        unmarked = 777777
        stack = []

        # initial setup: working matrix columns are
        # [lon, lat, time, cluster, row index]
        df = df[[self.col_lon, self.col_lat, self.col_time]].assign(
            cluster=unmarked)
        matrix = df.assign(index=range(df.shape[0])).values

        # for each point in database
        for index in range(matrix.shape[0]):
            if matrix[index, 3] != unmarked:
                continue

            neighborhood = self._retrieve_neighbors(index, matrix)
            if len(neighborhood) < self.min_neighbors:
                matrix[index, 3] = noise
                continue

            # found a core point: open a new cluster
            cluster_label += 1
            matrix[index, 3] = cluster_label

            # assign core's label to its neighborhood
            for neig_index in neighborhood:
                matrix[neig_index, 3] = cluster_label
                stack.append(neig_index)  # append neighbors to stack

            # find new neighbors from core point neighborhood
            while stack:
                current_point_index = stack.pop()
                new_neighborhood = self._retrieve_neighbors(
                    current_point_index, matrix)

                # only core points keep expanding the cluster
                if len(new_neighborhood) < self.min_neighbors:
                    continue
                for neig_index in new_neighborhood:
                    if matrix[neig_index, 3] in (noise, unmarked):
                        matrix[neig_index, 3] = cluster_label
                        stack.append(neig_index)

        # The working matrix is object-typed when the time column holds
        # timestamps; cast so the label column comes back as integers.
        df['cluster'] = matrix[:, 3].astype(int)
        return df

In [3]:
def parse_dates(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' string into a ``datetime``."""
    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
    parsed = datetime.strptime(x, timestamp_format)
    return parsed

def plot_clusters(df, output_name):
    """Scatter-plot clustered points, one colour per cluster label.

    :param df: frame with ``cluster``, ``longitude`` and ``latitude`` columns;
        label ``-1`` is treated as noise and drawn in black.
    :param output_name: currently unused (saving the figure is disabled);
        kept so existing callers keep working.
    """
    labels = df['cluster'].values
    X = df[['longitude', 'latitude']].values

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=6)

    # The noise label is not a cluster, so leave it out of the reported count.
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
    plt.title('ST-DBSCAN: #n of clusters {}'.format(n_clusters))
    plt.show()
    # plt.savefig(output_name)


def test_time(df):
    '''
    Cluster ``df`` with ST-DBSCAN using this notebook's fixed settings and
    return the labelled frame produced by ``STDBSCAN.run``.

    The coordinates are used as stored in ``location_longitude`` /
    ``location_latitude``: the projection step below is disabled, so
    ``spatial_threshold=0.03`` is expressed in the units of those columns.
    Re-enabling the projection requires selecting the right epsg codes.
    '''
    clusterer = STDBSCAN(
        col_lat='location_latitude',
        col_lon='location_longitude',
        col_time='time',
        spatial_threshold=0.03,
        temporal_threshold=60,
        min_neighbors=1,
    )
    #df = st_dbscan.projection(df, p1_str='epsg:4326', p2_str='epsg:32630')
    return clusterer.run(df)


In [4]:
# These are the transition points of the outages.
outages = pd.read_parquet('part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet')
# `outage_time` is interpreted as epoch seconds (UTC), as before. Converting
# directly avoids the deprecated `datetime.utcfromtimestamp` and
# `infer_datetime_format`; flooring to whole seconds reproduces the
# truncation the old strftime('%Y-%m-%d %H:%M:%S') round-trip performed.
outages['time'] = pd.to_datetime(outages['outage_time'], unit='s').dt.floor('s')
outages.head()

In [5]:
if __name__ == '__main__':
    # Cluster the outage transition points and show how many points landed
    # in each label (-1 is noise).
    df = pd.DataFrame(test_time(outages))
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    print(df['cluster'].value_counts())

In [6]:
# Label every outage point with its cluster, then attach the sensor id
# (aligned on the shared row index).
clustered = pd.DataFrame(test_time(outages)).assign(core_id=outages['core_id'])

def find_range(lst):
    """Return the spread (largest minus smallest value) of ``lst``."""
    highest = max(lst)
    lowest = min(lst)
    return highest - lowest

# Convert the time stamps to integer epoch seconds (UTC). Guarded so that
# re-running this cell after the column has been converted does not fail.
if not pd.api.types.is_numeric_dtype(clustered['time']):
    clustered['time'] = clustered['time'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# One row per cluster; each cell holds the list of member values.
by_cluster = clustered.groupby('cluster')
month_clust = pd.DataFrame(by_cluster['time'].apply(lambda x: x.tolist()))
for target, source in [('latitude', 'location_latitude'),
                       ('longitude', 'location_longitude'),
                       ('ids', 'core_id')]:
    month_clust[target] = by_cluster[source].apply(lambda x: x.tolist()).values

# Drop the noise group by label. Dropping "the first row" removed a real
# cluster whenever the clustering produced no noise points.
month_clust = month_clust[month_clust.index != -1].copy()

# Series.apply also works when there are no clusters (np.vectorize raises
# on size-0 input).
month_clust['time_range'] = month_clust['time'].apply(find_range)
month_clust['lat_range'] = month_clust['latitude'].apply(find_range)
month_clust['long_range'] = month_clust['longitude'].apply(find_range)

month_clust.head()

In [7]:
# Number of points per cluster (aligned on the cluster label index), plus
# the earliest and latest time stamp of each cluster.
points_per_cluster = clustered.groupby(['cluster'])['time'].count()
month_clust['cluster_size'] = points_per_cluster
month_clust['min_time'] = [min(times) for times in month_clust['time']]
month_clust['max_time'] = [max(times) for times in month_clust['time']]
month_clust.head()

In [8]:
plt.hist(month_clust['time_range'], bins=30)
plt.title('STDBSCAN: A Distribution for the Range of Times in a Cluster')
plt.xlabel('Time Range in a Cluster')
# plt.hist is called without density/weights, so the bars are raw counts.
plt.ylabel('Number of Clusters')

In [9]:
plt.hist(month_clust['lat_range'])
plt.title('STDBSCAN: A Distribution for the Range of Latitude in a Cluster')
plt.xlabel('Latitude Range in a Cluster')
# plt.hist is called without density/weights, so the bars are raw counts.
plt.ylabel('Number of Clusters')

In [10]:
plt.hist(month_clust['long_range'])
plt.title('STDBSCAN: A Distribution for the Range of Longitude in a Cluster')
plt.xlabel('Longitude Range in a Cluster')
# plt.hist is called without density/weights, so the bars are raw counts.
plt.ylabel('Number of Clusters')

In [11]:
# This is all of the data from July 2018 (not just outage transition points!).
# Only use this for SAIFI (and convex hull calculations).
# I have confirmed that there are no duplicate points that read as powered and not powered.
pw_raw = pd.read_parquet('part-00000-602cb425-c6be-40be-8024-aeb92fcb4315-c000.gz.parquet')
pw = pw_raw.drop(columns=['product_id', 'millis', 'last_unplug_millis'])
pw.head()

In [12]:
# Now calculate SAIFI: clustered (non-noise) outage points divided by
# (number of distinct sensors) x (number of clusters).
# NOTE(review): SAIFI is usually interruptions / customers served; confirm
# that multiplying the denominator by the cluster count is intended.
st_SAIFI_num = int((clustered['cluster'] != -1).sum())
st_SAIFI_denom = pw['core_id'].drop_duplicates().shape[0] * month_clust.shape[0]
st_SAIFI = st_SAIFI_num / st_SAIFI_denom
st_SAIFI

In [16]:
# Check if powered points are within the time range of the clusters.
# We start by collecting the lat and long values of the sensors that reported
# within the time range of each outage; later these coordinates are compared
# against the convex hull of the outage.
# NOTE(review): the mask keeps rows where `is_powered` is False, although the
# variable and the comments talk about *powered* sensors -- confirm whether
# the `~` is intended.
powered = pw[~pw['is_powered']].copy()  # copy: we add a column to the selection
powered['time'] = powered['time'].apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Pull the columns out once instead of re-reading `.values` on every iteration.
report_times = powered['time'].to_numpy()
report_lat = powered['location_latitude'].to_numpy()
report_long = powered['location_longitude'].to_numpy()

within_time_lat = []
within_time_long = []
for window_start, window_end in zip(month_clust['min_time'].to_numpy(),
                                    month_clust['max_time'].to_numpy()):
    # Reports whose time stamp falls inside this cluster's [min, max] window.
    in_window = (report_times >= window_start) & (report_times <= window_end)
    within_time_lat.append(report_lat[in_window].tolist())
    within_time_long.append(report_long[in_window].tolist())

month_clust['within_time_lat'] = within_time_lat
month_clust['within_time_long'] = within_time_long
month_clust.head()

In [17]:
# For clusters of 2, duplicate the lat and long points so that these points
# can also be converted into a Polygon by geopandas.
# The lists are rebuilt row by row instead of writing through `Series.values`,
# which is a read-only view when pandas Copy-on-Write is active.
update_lat = []
update_long = []
update_within_lat = []
update_within_long = []

cluster_rows = zip(month_clust['latitude'], month_clust['longitude'],
                   month_clust['within_time_lat'], month_clust['within_time_long'],
                   month_clust['cluster_size'])
for lat, long, within_lat, within_long, size in cluster_rows:
    if len(within_lat) == 0:
        # No reports in the window: placeholder coordinates so a Polygon can
        # still be constructed for this row.
        within_lat, within_long = [0, 1, 2], [0, 1, 2]
    elif len(within_lat) < 3:
        # One or two reports: repeat them to reach at least three coordinates.
        within_lat, within_long = within_lat * 3, within_long * 3
    if size < 3:
        lat, long = lat * 2, long * 2

    update_lat.append(lat)
    update_long.append(long)
    update_within_lat.append(within_lat)
    update_within_long.append(within_long)

month_clust['latitude'] = update_lat
month_clust['longitude'] = update_long
month_clust['within_time_long'] = update_within_long
month_clust['within_time_lat'] = update_within_lat
month_clust.head()

In [18]:
# Create geodataframes to calculate the convex hull.
# First pair up the coordinates: one list of (longitude, latitude) tuples per
# cluster, for the in-window reports and for the outage points themselves.
power = month_clust.copy()
out = month_clust.copy()
powered_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(month_clust['within_time_long'], month_clust['within_time_lat'])
]
outage_poly = [
    list(zip(longs, lats))
    for longs, lats in zip(month_clust['longitude'], month_clust['latitude'])
]
    
def unique_coords(coords):
    """Return the distinct entries of ``coords`` in first-seen order (as an array)."""
    as_series = pd.Series(coords)
    return as_series.unique()

# Attach the coordinate lists to all three frames.
for frame in (power, out, month_clust):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# Must be a mapping: the previous `{'init', 'epsg:4326'}` was a *set*
# (comma instead of colon) and therefore not a CRS definition at all.
crs = {'init': 'epsg:4326'}

powered_poly = [geometry.Polygon(x, holes=None) for x in power['powered_poly']]
power = gpd.GeoDataFrame(power, crs=crs, geometry=(powered_poly))

outage_poly = [geometry.Polygon(x, holes=None) for x in out['outage_poly']]
out = gpd.GeoDataFrame(out, crs=crs, geometry=(outage_poly))

# Now that the polygons exist, collapse the coordinate lists to their distinct
# points (the padding added for small clusters is no longer needed).
for frame in (power, out, month_clust):
    for column in ('powered_poly', 'outage_poly'):
        frame[column] = frame[column].apply(unique_coords)

power['convex_area_powered'] = power.convex_hull
out['convex_area_outage'] = out.convex_hull

out.head()



In [19]:
#calculate the convex hull
def in_convex_hull(powered_coords, geom):
    """Return the distinct coordinates that lie inside the convex hull of ``geom``.

    :param powered_coords: iterable of coordinate pairs, each accepted by
        ``geometry.Point``;
    :param geom: geometry whose ``convex_hull`` is tested against;
    :return: array of the unique contained coordinates, in first-seen order.
    """
    hull = geom.convex_hull  # loop-invariant: compute the hull once
    contained = [coord for coord in powered_coords
                 if hull.contains(geometry.Point(coord))]
    # Explicit object dtype keeps the empty case from relying on pandas'
    # default dtype inference for an empty Series.
    return pd.Series(contained, dtype=object).unique()
        
# Assign straight to the column: rebinding the name `in_convex_hull` to the
# result list would shadow the function and break any later call to it.
out['powered_within_outage'] = [
    in_convex_hull(coords, geom)
    for coords, geom in zip(out['powered_poly'].values, out['geometry'].values)
]
out.head()


In [20]:
#plot size of outage vs. % out at within the convex hull
def outage_size(outage_coords):
    """Count the distinct entries in ``outage_coords``."""
    distinct = pd.Series(outage_coords).unique()
    return len(distinct)

# Count the reports found inside each outage hull, then express that count as
# a share of (reports inside the hull + sensors in the outage cluster).
out['powered_size_within_outage_area'] = [
    outage_size(coords) for coords in out['powered_within_outage']
]
inside_count = out['powered_size_within_outage_area']
out['percent_pow_within_outage'] = (inside_count / (inside_count + out['cluster_size'])) * 100

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='cluster_size', y='percent_pow_within_outage', data=out, ax=ax)
ax.set_title('STDBSCAN: Number of Sensors in Outage vs. Percent of Sensors Experiencing Outage Within the Convex Hull of the Outage')
ax.set_xlabel('Number of Sensors in an Outage')
ax.set_ylabel('Percentage of Sensors Powered within Convex Hull')

ax.set_ylim((-2.863820561337118, 60.00667770419427))

In [21]:
# Distribution of the per-cluster percentage computed in the previous cell.
fig, ax = plt.subplots()
ax.hist(out['percent_pow_within_outage'], bins=20)
ax.set_title('STDBSCAN: percentage of powered sensors within the convex hull of an outage')



In [25]:
# top_right = out[out['percent_pow_within_outage'] > 9]
# la = []
# lo = [] 
# for i in range(len(top_right['powered_within_outage'].values[0])): 
#     la.append(top_right['powered_within_outage'].values[0][i][1])
#     lo.append(top_right['powered_within_outage'].values[0][i][0])
# len(la)
# sns.scatterplot(x=lo, y=la, label='powered')
# sns.scatterplot(x=top_right['longitude'].values[0], y=top_right['latitude'].values[0], label='outage')
# plt.title('Top Right Point: Likely More than One Concurrent Outage')
# plt.xlabel('longitude')
# plt.ylabel('latitude')

In [26]:
# Now let's make some new dataframes so that we can calculate the distances
# between the sensors of 2-3 that were clustered together.
# `c` holds the number of rows per cluster label for every column.
c = clustered.groupby('cluster').count()
is_pair = c['time'] == 2
pair_index = c.loc[is_pair].index

# Built once and reused by every call; WGS84 is the ellipsoid behind EPSG:4326.
_WGS84_GEOD = pyproj.Geod(ellps='WGS84')


def calc_dist(dist1, dist2):
    """Return the distance in meters between two longitude/latitude points.

    The geodesic length on the WGS84 ellipsoid is used. The previous version
    projected both points through an unparameterized Albers equal-area
    projection, which does not preserve distances, and relied on the
    deprecated ``pyproj.transform`` / ``Proj(init=...)`` API.

    :param dist1: point exposing ``x`` (longitude) and ``y`` (latitude), EPSG:4326;
    :param dist2: second point, same convention;
    :return: distance in meters as a float.
    """
    _, _, meters = _WGS84_GEOD.inv(dist1.x, dist1.y, dist2.x, dist2.y)
    return meters

# The points are built from raw longitude/latitude, so every GeoSeries below
# is tagged EPSG:4326 (the previous `{'init': 'aea'}` is not a valid CRS
# definition and mislabelled lon/lat data).
lonlat_crs = {'init': 'epsg:4326'}


def _pairwise_distances(starts, ends):
    """Distance from each point in ``starts`` to the matching point in ``ends``."""
    return [calc_dist(a, b) for a, b in zip(starts, ends)]


#distances for clusters of 2:
pairs = clustered[clustered['cluster'].isin(pair_index)].copy()
gdf = gpd.GeoDataFrame(
    pairs, geometry=gpd.points_from_xy(pairs.location_longitude, pairs.location_latitude), crs=lonlat_crs)
pair_geoms = gdf.groupby('cluster')['geometry']
dist_1 = gpd.GeoSeries(pair_geoms.first(), crs=lonlat_crs)
dist_2 = gpd.GeoSeries(pair_geoms.last(), crs=lonlat_crs)
# Kept as a float array because later cells filter it with a boolean mask.
distances = np.array(_pairwise_distances(dist_1, dist_2), dtype=float)

#distances for clusters of 3:
trio_index = c[c['time'] == 3].index
trios = clustered[clustered['cluster'].isin(trio_index)].copy()
gdf_trios = gpd.GeoDataFrame(
    trios, geometry=gpd.points_from_xy(trios.location_longitude, trios.location_latitude), crs=lonlat_crs)
trio_geoms = gdf_trios.groupby('cluster')['geometry']
dist_1_t = gpd.GeoSeries(trio_geoms.first(), crs=lonlat_crs)
# Take the middle member through `apply` so the result is indexed by cluster
# like first()/last(); `nth(1)` keeps the original row order and labels on
# newer pandas, which would pair points from different clusters.
dist_2_t = gpd.GeoSeries(trio_geoms.apply(lambda members: members.iloc[1]), crs=lonlat_crs)
dist_3_t = gpd.GeoSeries(trio_geoms.last(), crs=lonlat_crs)

trios_distances = pd.DataFrame({
    '1->2': _pairwise_distances(dist_1_t, dist_2_t),
    '2->3': _pairwise_distances(dist_2_t, dist_3_t),
    '3->1': _pairwise_distances(dist_3_t, dist_1_t),
})
#only one of the columns had all trios within 550 m so I took the average to see that 6/10 are below an average distance of 550
trios_distances['avg'] = (trios_distances['2->3'] + trios_distances['1->2'] + trios_distances['3->1'])/3
trios_distances

In [27]:
# Pool the three pairwise distances of every trio into one flat list.
dist_for_3 = [
    dist
    for leg in ('1->2', '2->3', '3->1')
    for dist in trios_distances[leg].values
]
fig, ax = plt.subplots()
ax.hist(dist_for_3, bins=20)
ax.axvline(x=550, label='550m cutoff', c='r')
ax.set_title('STDBSCAN: Distances between sensors that are clustered as trios')
ax.legend()

In [28]:
# Histogram of the mean pairwise distance per trio, with the 550 cutoff marked.
fig, ax = plt.subplots()
ax.hist(trios_distances['avg'], bins=15)
ax.axvline(x=550, label='550m cutoff', c='r')
ax.set_title('STDBSCAN: Average distance between sensors that are clustered as trios')
ax.legend()

In [29]:
# Histogram of the distance between the two members of every pair cluster.
fig, ax = plt.subplots()
ax.hist(distances)
ax.axvline(x=550, label='550m cutoff', c='r')
ax.set_title('STDBSCAN: Distances between sensors that are clustered as pairs')
ax.legend()

In [30]:
# Now let's calculate the fraction to the left of the red line for each graph.
# Fraction under the cutoff for pairs:
np.count_nonzero(distances < 550) / len(distances)

In [31]:
# Fraction under the cutoff for trios (all three legs pooled):
dist_for_3 = np.array(dist_for_3)
np.count_nonzero(dist_for_3 < 550) / len(dist_for_3)

In [32]:
# Fraction of trios whose average leg distance is under the cutoff:
int((trios_distances['avg'] < 550).sum()) / len(trios_distances)

In [33]:
# Now let's do this same analysis with the logical grid distance as our distance metric.
# NOTE(review): absolute, user-specific path -- the cell only runs on the
# original author's machine; consider a path relative to the notebook.
# reset_index() turns the index read from the CSV into ordinary columns
# (later cells look rows up through `level_0` / `level_1`).
logical = pd.read_csv('/Users/emilypaszkiewicz17/gridwatch-data-analysis/grid_distance.csv').reset_index()
logical

In [34]:
# def retrieve_logic(lst):
#     logic = []
#     index=0
#     for p1 in lst:
#         nest = []
#         for p2 in lst: 
#             if list(logical[(logical['level_0'] == p1) & (logical['level_1'] == p2)]['level_0'].values) != [] :
#                 nest.append(logical[(logical['level_0'] == p1) & (logical['level_1'] == p2)]['logical_grid_distance'].values[0])
#         logic.append(nest)
#         index += 1 
#     return logic 

# def avg_grid(lst):
#     avg = []
#     for i in lst: 
#         avg.append(np.average(i))
#     return np.average(avg)

In [35]:
#beware: this takes forever to run!
#month_clust['avg_logical_dist'] = (np.vectorize(retrieve_logic)(month_clust['ids']))

In [38]:
# Copies, because new columns are added to these selections below.
two_ids = month_clust[month_clust['cluster_size'] == 2].copy()
three_ids = month_clust[month_clust['cluster_size'] == 3].copy()

# (level_0, level_1) -> logical_grid_distance, built once instead of scanning
# the whole table for every lookup. drop_duplicates keeps the first row per
# pair, matching the previous `.values[0]`.
logical_lookup = (
    logical.drop_duplicates(subset=['level_0', 'level_1'])
    .set_index(['level_0', 'level_1'])['logical_grid_distance']
    .to_dict()
)


def _logical_distance(id_a, id_b):
    """Logical grid distance recorded for the ordered pair (id_a, id_b)."""
    try:
        return logical_lookup[(id_a, id_b)]
    except KeyError:
        raise KeyError(
            'no logical_grid_distance row for level_0={!r}, level_1={!r}'.format(id_a, id_b))


pair_logical_dist = [_logical_distance(ids[0], ids[1]) for ids in two_ids['ids']]

trio_logical_dist_1 = [_logical_distance(ids[0], ids[1]) for ids in three_ids['ids']]
trio_logical_dist_2 = [_logical_distance(ids[1], ids[2]) for ids in three_ids['ids']]
trio_logical_dist_3 = [_logical_distance(ids[2], ids[0]) for ids in three_ids['ids']]

two_ids['logical_distance'] = pair_logical_dist
three_ids['log_dist_1'] = trio_logical_dist_1
three_ids['log_dist_2'] = trio_logical_dist_2
three_ids['log_dist_3'] = trio_logical_dist_3

In [39]:
# Fraction of pair clusters whose two sensors are at logical grid distance 1.
int((two_ids['logical_distance'] == 1).sum()) / len(two_ids)

In [41]:
# Pool the three logical distances of every trio, then take the fraction equal to 1.
log_dist_for_3 = pd.Series([
    dist
    for leg in ('log_dist_1', 'log_dist_2', 'log_dist_3')
    for dist in three_ids[leg]
])
int((log_dist_for_3 == 1).sum()) / len(log_dist_for_3)

In [43]:
# Attempt at plotting the time distribution of each cluster: one
# distribution plot per cluster, all drawn on the same axes.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases.
for cluster_times in month_clust['time'].values:
    sns.distplot(cluster_times)
plt.title('STDBSCAN Clustering Distributions')

TRIMODAL DISTRIBUTION

In [53]:
# Number of clusters of each size. `size()` counts rows (clusters) directly;
# `nunique()` on the list-valued `ids` column merged clusters made of the same
# sensors and requires hashing lists. The count column keeps the name `ids`
# so existing references to `clust_sizes['ids']` still work.
clust_sizes = month_clust.groupby('cluster_size').size().reset_index(name='ids')
sns.barplot(x='cluster_size', y='ids', data=clust_sizes)
plt.ylabel('Number of Clusters')
plt.xlabel('Cluster Size')
plt.title('Cluster Size vs. Number of Clusters of this Size')

In [55]:
# Preview the first five rows of the outage GeoDataFrame.
out.head(n=5)

In [56]:
# Preview the first five rows of the labelled outage points.
clustered.head(n=5)