In [76]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timezone
from datetime import timedelta
import pyproj
import seaborn as sns

In [77]:
# Run this cell to define the STDBSCAN class

class STDBSCAN(object):
    """
    ST-DBSCAN: density-based clustering of points that are close in both
    space and time.

    Typical use: build an instance with the column names and thresholds,
    optionally call :meth:`projection` to convert degrees to projected
    coordinates, then call :meth:`run` to obtain a labelled frame.
    """

    def __init__(self, col_lat, col_lon, col_time, spatial_threshold=500.0,
                 temporal_threshold=60.0, min_neighbors=15):
        """
        Python st-dbscan implementation.
        :param col_lat: Latitude column name;
        :param col_lon:  Longitude column name;
        :param col_time: Date time column name;
        :param spatial_threshold: Maximum geographical coordinate (spatial)
             distance value (meters);
        :param temporal_threshold: Maximum non-spatial distance value (seconds);
        :param min_neighbors: Minimum number of points within Eps1 and Eps2
             distance;
        """
        self.col_lat = col_lat
        self.col_lon = col_lon
        self.col_time = col_time
        self.spatial_threshold = spatial_threshold
        self.temporal_threshold = temporal_threshold
        self.min_neighbors = min_neighbors

    def projection(self, df, p1_str='epsg:4326', p2_str='epsg:3395'):
        """
        Convert geographic (longitude, latitude) values to projected (x, y)
        coordinates with pyproj.

        The longitude and latitude columns of ``df`` are overwritten in place
        with the projected x and y values, and the same frame is returned.
        The caller must pick EPSG codes that suit the data; with the default
        target (EPSG:3395) x and y are in meters.

        :param df: frame holding the longitude/latitude columns;
        :param p1_str: source projection, as a pyproj init string;
        :param p2_str: target projection, as a pyproj init string;
        :return: ``df`` with its coordinate columns replaced.
        """
        # NOTE(review): Proj(init=...) and pyproj.transform are deprecated in
        # pyproj 2.x in favour of pyproj.Transformer -- confirm against the
        # installed pyproj version before upgrading.
        p1 = pyproj.Proj(init=p1_str)
        p2 = pyproj.Proj(init=p2_str)
        lon = df[self.col_lon].values
        lat = df[self.col_lat].values
        x1, y1 = p1(lon, lat)
        x2, y2 = pyproj.transform(p1, p2, x1, y1, radians=True)
        df[self.col_lon] = x2
        df[self.col_lat] = y2

        return df

    def _retrieve_neighbors(self, index_center, matrix):
        """
        Return the row positions of every point within both thresholds of
        the point at ``index_center``, excluding that point itself.

        ``matrix`` is the array built by :meth:`run`; its columns are
        0 = x/longitude, 1 = y/latitude, 2 = time, 3 = cluster label,
        4 = row position.
        """
        center_point = matrix[index_center, :]

        # filter by time
        window = timedelta(seconds=self.temporal_threshold)
        min_time = center_point[2] - window
        max_time = center_point[2] + window
        matrix = matrix[(matrix[:, 2] >= min_time) &
                        (matrix[:, 2] <= max_time), :]

        # filter by distance (squared distances avoid a square root)
        dx = matrix[:, 0] - center_point[0]
        dy = matrix[:, 1] - center_point[1]
        within = (dx * dx + dy * dy) <= (
            self.spatial_threshold * self.spatial_threshold)

        # Exclude the centre by filtering rather than list.remove(): a row
        # with NaN coordinates or a NaT time never matches itself, and
        # remove() would raise ValueError. Such rows simply get no neighbours.
        return [i for i in matrix[within, 4].tolist() if i != index_center]

    def run(self, df):
        """
        Cluster the rows of ``df``.

        INPUTS:
            df={o1,o2,...,on} Set of objects;
        OUTPUT:
            C = {c1,c2,...,ck} Set of clusters

        :return: a new frame with the longitude, latitude and time columns
            plus an integer ``cluster`` column. Clusters are numbered from 1
            and noise points are labelled -1. The input frame is not
            modified.
        """
        cluster_label = 0
        noise = -1
        unmarked = 777777  # sentinel label for points not visited yet
        stack = []

        # initial setup
        df = df[[self.col_lon, self.col_lat, self.col_time]]
        df = df.assign(cluster=unmarked)
        # matrix columns: 0 = lon, 1 = lat, 2 = time, 3 = cluster, 4 = row
        # position. The position column only lives in the matrix.
        matrix = df.assign(index=range(df.shape[0])).values

        # for each point in database
        for index in range(matrix.shape[0]):
            if matrix[index, 3] != unmarked:
                continue

            neighborhood = self._retrieve_neighbors(index, matrix)

            if len(neighborhood) < self.min_neighbors:
                # Provisional: a later core point may still claim it.
                matrix[index, 3] = noise
                continue

            # found a core point: open a new cluster
            cluster_label += 1
            matrix[index, 3] = cluster_label

            # assign core's label to its neighborhood
            for neig_index in neighborhood:
                matrix[neig_index, 3] = cluster_label
                stack.append(neig_index)  # append neighbors to stack

            # find new neighbors from core point neighborhood
            while len(stack) > 0:
                current_point_index = stack.pop()
                new_neighborhood = self._retrieve_neighbors(
                    current_point_index, matrix)

                # current_point is a new core
                if len(new_neighborhood) >= self.min_neighbors:
                    for neig_index in new_neighborhood:
                        if matrix[neig_index, 3] in (noise, unmarked):
                            matrix[neig_index, 3] = cluster_label
                            stack.append(neig_index)

        # The matrix mixes numbers and timestamps, so it is object dtype;
        # cast the labels back to integers for a properly typed column.
        df['cluster'] = matrix[:, 3].astype(int)
        return df

In [90]:
def parse_dates(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' string into a naive datetime."""
    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
    parsed = datetime.strptime(x, timestamp_format)
    return parsed

def plot_clusters(df, output_name, col_lon='longitude', col_lat='latitude'):
    """
    Scatter-plot clustered points, one colour per cluster label.

    :param df: frame with a 'cluster' column plus the two coordinate columns;
    :param output_name: currently unused (saving the figure is disabled);
    :param col_lon: name of the x-axis (longitude) column;
    :param col_lat: name of the y-axis (latitude) column.

    Points labelled -1 are treated as noise and drawn in black. The title
    reports the number of clusters, not counting the noise label.
    """
    labels = df['cluster'].values
    X = df[[col_lon, col_lat]].values

    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=6)

    # Noise (-1) is a label but not a cluster, so leave it out of the count.
    n_clusters = len(unique_labels - {-1})
    plt.title('ST-DBSCAN: #n of clusters {}'.format(n_clusters))
    plt.show()
    # plt.savefig(output_name)


def test_time(df, spatial_threshold=500, temporal_threshold=60,
              min_neighbors=2):
    '''
    Cluster outage records with ST-DBSCAN and return the labelled frame.

    :param df: frame with 'location_latitude', 'location_longitude' and
        'time' columns;
    :param spatial_threshold: spatial radius passed to STDBSCAN;
    :param temporal_threshold: time window in seconds passed to STDBSCAN;
    :param min_neighbors: minimum neighbour count for a core point;
    :return: the frame produced by STDBSCAN.run (coordinates, time and a
        'cluster' column).

    NOTE(review): the projection step is disabled, so coordinates reach
    STDBSCAN.run in whatever units the columns hold. STDBSCAN documents
    spatial_threshold in meters; if the columns are decimal degrees
    (TODO confirm), a threshold of 500 matches every pair of points and the
    clustering is driven by time alone. To cluster in meters, project first
    with an EPSG code suited to the data, e.g.
    st_dbscan.projection(df, p1_str='epsg:4326', p2_str='epsg:32630').
    '''
    st_dbscan = STDBSCAN(col_lat='location_latitude',
                         col_lon='location_longitude',
                         col_time='time',
                         spatial_threshold=spatial_threshold,
                         temporal_threshold=temporal_threshold,
                         min_neighbors=min_neighbors)
    return st_dbscan.run(df)


In [166]:
# These are the transition points of the outages.
outages = pd.read_parquet('part-00000-3c7aa0ea-41c7-4705-bafc-5662f2051563-c000.gz.parquet')
# 'outage_time' holds epoch seconds. Convert it in one vectorised call to
# naive UTC timestamps, truncated to whole seconds, instead of formatting
# each row as a string and parsing it back.
outages['time'] = pd.to_datetime(outages['outage_time'], unit='s').dt.floor('s')
outages.head()

In [168]:
if __name__ == '__main__':
    # Cluster every outage point and show how many points fall under each
    # label (-1 is noise).
    df = pd.DataFrame(test_time(outages))
    # Use the Series method; the top-level pd.value_counts is deprecated in
    # recent pandas releases.
    print(df['cluster'].value_counts())

In [169]:
# Label every outage transition point with its ST-DBSCAN cluster.
clustered = pd.DataFrame(data=test_time(outages))
clustered.head(n=5)

In [170]:
def find_range(lst):
    """Return the spread of ``lst``: its largest value minus its smallest."""
    highest = max(lst)
    lowest = min(lst)
    return highest - lowest

# Convert timestamps to integer epoch seconds (naive values are read as UTC).
# Guarded so that re-running this cell does not fail once the column has
# already been converted.
if pd.api.types.is_datetime64_any_dtype(clustered['time']):
    clustered['time'] = clustered['time'].apply(
        lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# One row per cluster label, each cell holding the list of member values.
by_cluster = clustered.groupby(['cluster'])
month_clust = pd.DataFrame(by_cluster['time'].apply(lambda x: x.tolist()))
month_clust['latitude'] = by_cluster['location_latitude'].apply(lambda x: x.tolist()).values
month_clust['longitude'] = by_cluster['location_longitude'].apply(lambda x: x.tolist()).values

# Drop the noise group by its label (-1). Dropping "the first row" would
# discard a real cluster whenever the data contains no noise points.
month_clust = month_clust.drop(index=-1, errors='ignore')

# Spread (max - min) of each cluster in time, latitude and longitude.
month_clust['time_range'] = month_clust['time'].apply(find_range)
month_clust['lat_range'] = month_clust['latitude'].apply(find_range)
month_clust['long_range'] = month_clust['longitude'].apply(find_range)

month_clust.head()

In [195]:
# Histogram of per-cluster time spans (max - min of each cluster's 'time' list).
plt.hist(month_clust['time_range'], bins=30)
plt.title('STDBSCAN: A Distribution for the Range of Times in a Cluster')
plt.xlabel('Time Range in a Cluster')
# plt.hist plots raw counts by default, so label the axis as a count.
plt.ylabel('Number of Clusters');

In [193]:
# Histogram of per-cluster latitude spans (max - min of each cluster's latitudes).
plt.hist(month_clust['lat_range'])
plt.title('STDBSCAN: A Distribution for the Range of Latitude in a Cluster')
plt.xlabel('Latitude Range in a Cluster')
# plt.hist plots raw counts by default, so label the axis as a count.
plt.ylabel('Number of Clusters');

In [194]:
# Histogram of per-cluster longitude spans (max - min of each cluster's longitudes).
plt.hist(month_clust['long_range'])
plt.title('STDBSCAN: A Distribution for the Range of Longitude in a Cluster')
plt.xlabel('Longitude Range in a Cluster')
# plt.hist plots raw counts by default, so label the axis as a count.
plt.ylabel('Number of Clusters');

In [197]:
# Preview the first rows of the clustered frame.
clustered.head(n=5)

In [198]:
# Full July 2018 dataset: every record, not only the outage transition points.
# Only needed for the SAIFI calculation.
pw = pd.read_parquet(
    'part-00000-602cb425-c6be-40be-8024-aeb92fcb4315-c000.gz.parquet'
)
pw.head(n=5)

In [199]:
# SAIFI estimate: number of clustered (non-noise) outage points divided by
# (number of distinct core_id values x number of clusters).
st_SAIFI_num = int((clustered['cluster'] != -1).sum())
st_SAIFI_denom = len(month_clust) * len(pw['core_id'].unique())
st_SAIFI = st_SAIFI_num / st_SAIFI_denom
st_SAIFI

In [120]:
# Now try it on 2 days of data, to repeat the analysis performed in
# evaluating_clustering_algorithm.
TWO_DAYS_SECONDS = 2 * 24 * 60 * 60  # 172800; 'outage_time' is in epoch seconds

# Series.min() is vectorised and skips NaN, unlike the builtin min().
first_outage_time = outages['outage_time'].min()
days = test_time(outages[outages['outage_time'] <= first_outage_time + TWO_DAYS_SECONDS])
days.head()

In [192]:
# Rename the cluster column to 'labels' (later cells group by this name)
# and split out the noise points (label -1).
days = days.rename(columns={'cluster': 'labels'})
unlabeled = days[days['labels'] == -1]

plt.figure(figsize=(10, 10))
plt.scatter(unlabeled['location_longitude'], unlabeled['location_latitude'], c='y', label='noise')
# STDBSCAN.run numbers clusters from 1, so label 0 never occurs; plot 1-4.
for cluster_id in range(1, 5):
    members = days[days['labels'] == cluster_id]
    plt.scatter(members['location_longitude'], members['location_latitude'],
                label='cluster {}'.format(cluster_id))
plt.title('STDBSCAN Clustered Outages from 7/1/18 - 7/2/18')
plt.legend()
plt.xlabel('longitude')
# Axis limits are left to matplotlib's autoscaling: the earlier xlim/ylim
# calls used names (left, right, top, bottom) that are not defined anywhere
# in this notebook, and ylim was given its arguments in (top, bottom) order.
plt.ylabel('latitude');

In [186]:
def find_range(lst):
    """Return max(lst) - min(lst), i.e. the spread of the values in ``lst``."""
    # NOTE: same definition as the find_range in the earlier cell.
    upper = max(lst)
    lower = min(lst)
    return upper - lower

# NOTE(review): the epoch-second conversion below is disabled, so
# 'outage_times' keeps whatever dtype days['time'] currently has.
#days['time'] = days['time'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# One row per cluster label, each cell holding the list of member values.
by_label = days.groupby(['labels'])
days_clust = pd.DataFrame(by_label['time'].apply(lambda x: x.tolist())).rename(columns={'time': 'outage_times'})
days_clust['latitude'] = by_label['location_latitude'].apply(lambda x: x.tolist()).values
days_clust['longitude'] = by_label['location_longitude'].apply(lambda x: x.tolist()).values

# Drop the noise group by its label (-1). Dropping "the first row" would
# discard a real cluster whenever the data contains no noise points.
days_clust = days_clust.drop(index=-1, errors='ignore')

# Spread (max - min) of each cluster in time, latitude and longitude.
days_clust['time_range'] = days_clust['outage_times'].apply(find_range)
days_clust['lat_range'] = days_clust['latitude'].apply(find_range)
days_clust['long_range'] = days_clust['longitude'].apply(find_range)

days_clust.head()

In [162]:
# Outage times of the third cluster row. The column was renamed to
# 'outage_times' when days_clust was built, so 'time' raises KeyError.
days_clust['outage_times'].iloc[2]

In [196]:
# Overlay the outage-time distribution of each of the first four clusters.
# NOTE(review): cluster_3 uses hard-coded timestamps rather than
# days_clust['outage_times'].values[2] -- confirm they still match the data.
cluster_samples = [
    (days_clust['outage_times'].values[3], 'cluster_4'),
    ([1530449581, 1530449581, 1530449582], 'cluster_3'),
    (days_clust['outage_times'].values[1], 'cluster_2'),
    (days_clust['outage_times'].values[0], 'cluster_1'),
]
for sample, tag in cluster_samples:
    sns.distplot(sample, label=tag)
plt.legend()
plt.title('STDBSCAN Clustering Distributions')