In [1]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import math
import pyproj
import descartes
from descartes import PolygonPatch
import geopandas as gpd 
import shapely.geometry as geometry
import shapefile 
from functools import partial 
import shapely.ops as ops 
import fiona

In [2]:
# Raw sensor readings: rename the nested location columns to outage_* and drop
# the columns this analysis does not use.
readings_file = 'part-00000-e918c4d9-9091-44a0-912f-f9ebdad81161-c000.gz.parquet'
coordinate_names = {
    'a_location.location_latitude': 'outage_latitude',
    'a_location.location_longitude': 'outage_longitude',
}
unused_columns = ['outage_location', 'is_powered', 'core_id']
pw = pd.read_parquet(readings_file)
pw = pw.rename(columns=coordinate_names).drop(columns=unused_columns)

# Shapefile drawn underneath the sensor scatter plots.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
shapefile_path = '/Users/emilypaszkiewicz17/Desktop/research/map_w_coordinates/GHA-4_admin_SHP/GHA-4.shp'
street_map = gpd.read_file(shapefile_path)

In [3]:
# Row count first, then a preview of the first few readings.
print(pw.shape[0])
pw.head()

In [4]:
# Snapshot of one timestamp: powered sensors in blue, outage reports in red.
time0 = pw.loc[pw['time'] == '2018-07-01 12:53:00']
ax = plt.gca()
ax.scatter(time0['powered_longitude'], time0['powered_latitude'], c='b', label='powered')
ax.scatter(time0['outage_longitude'], time0['outage_latitude'], c='r', label='outage')
ax.set_title('Powered Sensors vs Outages 2018-07-01 12:53:00')
ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [5]:
# Same snapshot, drawn on top of the shapefile map.
fig, ax = plt.subplots(figsize=(20, 20))
street_map.plot(ax=ax)
time0 = pw.loc[pw['time'] == '2018-07-01 12:53:00']
ax.scatter(time0['powered_longitude'], time0['powered_latitude'], c='y', label='powered')
ax.scatter(time0['outage_longitude'], time0['outage_latitude'], c='r', label='outage')
ax.set_title('Powered Sensors vs Outages 2018-07-01 12:53:00')
ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [6]:
# Zoom the snapshot in on the window where the outage reports sit.
ax = plt.gca()
ax.scatter(time0['powered_longitude'], time0['powered_latitude'], c='b', label='powered')
ax.scatter(time0['outage_longitude'], time0['outage_latitude'], c='r', label='outage')
ax.set_title('Powered Sensors vs Outages 2018-07-01 12:53:00')
ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_ylim(5.6, 5.63)
ax.set_xlim(-0.3, -0.24)

In [7]:
def plot_sensor_outages(df, time, ylim=None, xlim=None, ax=None):
    """Scatter-plot powered sensors (blue) and outage reports (red) at one timestamp.

    Parameters
    ----------
    df : DataFrame
        Must have a 'time' column and one coordinate value per row in the
        columns 'powered_longitude', 'powered_latitude', 'outage_longitude'
        and 'outage_latitude'.
    time : value compared for equality against ``df['time']``.
    ylim, xlim : optional
        Latitude / longitude bounds passed to ``set_ylim`` / ``set_xlim``.
        ``None`` (the default) leaves the autoscaled limits alone.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current pyplot axes, which is what
        the function always used before this parameter existed.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()
    time_frame = df[df['time'] == time]
    ax.scatter(time_frame['powered_longitude'], time_frame['powered_latitude'], c='b', label='powered')
    ax.scatter(time_frame['outage_longitude'], time_frame['outage_latitude'], c='r', label='outage')
    ax.set_title('Powered Sensors vs Outages: {}'.format(time))
    ax.legend()
    ax.set_xlabel('longitude')
    ax.set_ylabel('latitude')
    # Only touch the limits when the caller asked for a window.
    if ylim is not None:
        ax.set_ylim(ylim)
    if xlim is not None:
        ax.set_xlim(xlim)
    return ax


In [8]:
# Drop, per timestamp, every row whose powered location is also reported as an
# outage location at that same timestamp.
pw['powered_pair'] = list(zip(pw['powered_longitude'], pw['powered_latitude']))
pw['outage_pair'] = list(zip(pw['outage_longitude'], pw['outage_latitude']))

# Collect the surviving rows of each timestamp and concatenate once at the end.
# Growing a frame with DataFrame.append inside the loop copies everything on
# every iteration, and DataFrame.append no longer exists in pandas 2.x.
# sort=False keeps timestamps in order of first appearance, like iterating
# over pw['time'].unique().
kept_frames = []
for _, current_time in pw.groupby('time', sort=False):
    unique_outages = list(current_time['outage_pair'].unique())
    kept_frames.append(current_time[~current_time['powered_pair'].isin(unique_outages)])

# pd.concat raises on an empty list, so fall back to an empty slice of pw.
pw_no_duplicates = pd.concat(kept_frames) if kept_frames else pw.iloc[0:0]

pw = pw_no_duplicates
# Preview only; displaying the whole frame floods the notebook output.
pw.head()

In [20]:
# Re-draw the snapshot now that powered/outage overlaps have been removed.
time0 = pw.loc[pw['time'] == '2018-07-01 12:53:00']
ax = plt.gca()
ax.scatter(time0['powered_longitude'], time0['powered_latitude'], c='b', label='powered')
ax.scatter(time0['outage_longitude'], time0['outage_latitude'], c='r', label='outage')
ax.set_title('Powered Sensors vs Outages 2018-07-01 12:53:00')
ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [21]:
# pw_ta contains, for every timestamp (rounded to the nearest minute), the
# lists of powered and outage coordinates recorded at that time.
# Column order matters: time, powered lon/lat, outage lon/lat.
coordinate_columns = ['powered_longitude', 'powered_latitude', 'outage_longitude', 'outage_latitude']
readings_by_time = pw.groupby('time')
pw_ta = pd.DataFrame(
    {column: readings_by_time[column].apply(lambda values: values.to_list())
     for column in coordinate_columns}
).reset_index()
pw_ta.head()

In [22]:
# Two working copies of the per-timestamp frame; they become GeoDataFrames below.
powered = pw_ta.copy()
outage = pw_ta.copy()

# For each timestamp, pair up the longitude and latitude lists into
# (lon, lat) tuples -- one list of pairs per row.
powered_poly = [
    list(zip(lons, lats))
    for lons, lats in zip(pw_ta['powered_longitude'], pw_ta['powered_latitude'])
]
outage_poly = [
    list(zip(lons, lats))
    for lons, lats in zip(pw_ta['outage_longitude'], pw_ta['outage_latitude'])
]
    
def unique_coords(coords):
    """Return the distinct entries of ``coords`` as an array, in first-seen order."""
    as_series = pd.Series(coords)
    return pd.unique(as_series)

# Attach the raw (lon, lat) pair lists to all three frames.
for frame in (powered, outage, pw_ta):
    frame['powered_poly'] = powered_poly
    frame['outage_poly'] = outage_poly

# WGS84 lon/lat. This must be a mapping: the previous {'init', 'epsg:4326'}
# was a *set* literal (comma instead of colon), which is not a CRS definition.
# NOTE(review): recent geopandas/pyproj prefer the plain string 'EPSG:4326'.
crs = {'init': 'epsg:4326'}

# One polygon per timestamp built from that timestamp's coordinate pairs;
# it is only used below through its convex hull.
powered_poly = [geometry.Polygon(pairs) for pairs in powered['powered_poly']]
powered = gpd.GeoDataFrame(powered, crs=crs, geometry=powered_poly)

outage_poly = [geometry.Polygon(pairs) for pairs in outage['outage_poly']]
outage = gpd.GeoDataFrame(outage, crs=crs, geometry=outage_poly)

# De-duplicate the pair lists row by row. Series.map applies the function once
# per row and, unlike np.vectorize, also works on an empty column.
for frame in (powered, outage, pw_ta):
    for column in ('powered_poly', 'outage_poly'):
        frame[column] = frame[column].map(unique_coords)

# Convex hull of each row's geometry (replaced by its area in a later cell).
powered['convex_area_powered'] = powered.convex_hull
outage['convex_area_outage'] = outage.convex_hull

outage.head()


In [23]:
def convex_area(a_geom):
    """Return the area of the convex hull of ``a_geom`` after projecting it
    from lon/lat (EPSG:4326) to an Albers equal-area projection.

    The standard parallels of the projection are taken from the geometry's own
    latitude extent. An empty geometry has area 0.
    """
    if a_geom.is_empty:
        return 0.0
    # shapely bounds are (minx, miny, maxx, maxy), i.e. for lon/lat data
    # (min lon, min lat, max lon, max lat). The standard parallels are
    # latitudes, and PROJ spells them lat_1 / lat_2.
    _, min_lat, _, max_lat = a_geom.bounds
    # NOTE(review): pyproj.transform and Proj(init=...) are deprecated in
    # pyproj 2.x+; kept here to match the API the notebook already uses.
    to_equal_area = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(proj='aea', lat_1=min_lat, lat_2=max_lat),
    )
    return ops.transform(to_equal_area, a_geom).convex_hull.area
# Replace the hull geometries with their projected areas. The hulls are read
# from .convex_hull (derived from the active geometry column) rather than from
# the column being overwritten, so this cell can be re-run safely.
powered['convex_area_powered'] = [convex_area(hull) for hull in powered.convex_hull]
outage['convex_area_outage'] = [convex_area(hull) for hull in outage.convex_hull]

pw_ta.head()

In [24]:
# Give each frame the area column computed on the other one, so both carry
# the powered and the outage hull areas.
for receiver, donor, column in (
    (outage, powered, 'convex_area_powered'),
    (powered, outage, 'convex_area_outage'),
):
    receiver[column] = donor[column]


In [25]:
def in_convex_hull(powered_coords, geom):
    """Return the distinct coordinates from ``powered_coords`` that lie inside
    the convex hull of ``geom``.

    ``powered_coords`` is an iterable of coordinate tuples accepted by
    ``shapely.geometry.Point``; ``geom`` is a shapely geometry. The result is
    an array of the unique contained coordinates in first-seen order
    (empty when none are contained).
    """
    # The hull does not depend on the point being tested: build it once.
    hull = geom.convex_hull
    inside = [coord for coord in powered_coords if hull.contains(geometry.Point(coord))]
    # dtype=object keeps the tuples intact and avoids the default-dtype
    # warning pandas emits for an empty Series.
    return pd.Series(inside, dtype=object).unique()
        
# Powered coordinates that fall inside each timestamp's outage hull.
# The result gets its own name: storing it in `in_convex_hull` would replace
# the function of the same name with a list for the rest of the session.
powered_within_outage = [
    in_convex_hull(coords, geom)
    for coords, geom in zip(outage['powered_poly'], outage['geometry'])
]
outage['powered_within_outage'] = powered_within_outage
outage.head()


In [26]:
# All powered and outage coordinates recorded at one timestamp.
outage_time = '2018-07-02 09:05:00'
time8 = outage[outage['time'] == outage_time]
time_0 = outage.iloc[0]
# Each cell holds the full coordinate list for the timestamp, hence .values[0].
plt.scatter(time8['powered_longitude'].values[0], time8['powered_latitude'].values[0], c='b', label='powered')
plt.scatter(time8['outage_longitude'].values[0], time8['outage_latitude'].values[0], c='r', label='outage')
# The title is built from the timestamp that is actually plotted
# (it previously showed 2018-07-01 12:53:00).
plt.title('Powered Sensors vs Outages {}'.format(outage_time))
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')

In [27]:
# Outage sensors at time8 plus only those powered sensors that fall inside
# the outage convex hull.
plt.scatter(time8['outage_longitude'].values[0], time8['outage_latitude'].values[0], c='r', label='outage')
# .values[0] selects the array of (lon, lat) pairs for this timestamp; without
# it, zip(*...) unpacks the one-element column and scatter receives one
# positional argument per point.
within_pairs = time8['powered_within_outage'].values[0]
within_lons, within_lats = zip(*within_pairs) if len(within_pairs) else ((), ())
plt.scatter(within_lons, within_lats, c='b', label='powered')
# Title taken from the plotted row (it previously showed 2018-07-01 12:53:00).
plt.title('Powered Sensors vs Outages {}'.format(time8['time'].values[0]))
plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.ylim(5.61, 5.67)
plt.xlim(-0.32, -0.27)

In [28]:
# From here on pw_ta refers to the `outage` frame. This is an alias, not a
# copy: columns added to pw_ta below are added to `outage` as well.
pw_ta = outage

In [29]:
# Outage hull area as a percentage of (outage hull area + powered hull area).
outage_area = pw_ta['convex_area_outage']
combined_area = outage_area + pw_ta['convex_area_powered']
pw_ta['percent_area_not_powered'] = (outage_area / combined_area) * 100
pw_ta.head()

In [30]:
# Histogram of the outage-area percentage, leaving out timestamps where it is
# exactly zero.
nonzero_area_share = pw_ta.loc[pw_ta['percent_area_not_powered'] != 0, 'percent_area_not_powered']
ax = plt.gca()
ax.hist(nonzero_area_share)
ax.set_title('Percentage of Total Area not Powered')
ax.set_xlabel('(Area Not Powered / (Area Not Powered + Area Powered))*100')

In [31]:
# Distinct outage / powered longitudes per timestamp. Only the two columns
# that are used get counted, not every column including the tuple columns.
unique = pw.groupby('time')[['outage_longitude', 'powered_longitude']].nunique()
unique['outage_proportion'] = unique['outage_longitude'] / (unique['outage_longitude'] + unique['powered_longitude'])

# Look the proportion up by timestamp instead of pasting `.values` in by
# position, so a differently ordered pw_ta cannot silently get mismatched rows.
pw_ta['percent_not_powered'] = pw_ta['time'].map(unique['outage_proportion']) * 100
pw_ta.head()

In [33]:
# Plot the size of an outage against the share of sensors inside the outage's
# convex hull.

# Scratch check of the unique-count logic on the row labelled 1; it is not the
# last expression of the cell, so its value is not displayed or stored.
len(pd.Series(pw_ta.loc[:, 'outage_poly'][1]).unique())
def outage_size(outage_coords):
    """Count the distinct entries in ``outage_coords``."""
    distinct = pd.unique(pd.Series(outage_coords))
    return distinct.shape[0]

# Number of distinct outage sensors and of distinct powered sensors inside the
# outage hull, per timestamp. Series.map calls the function once per row and,
# unlike np.vectorize, also works on an empty column.
pw_ta['outage_size'] = pw_ta['outage_poly'].map(outage_size)
pw_ta['powered_size_within_outage_area'] = pw_ta['powered_within_outage'].map(outage_size)
# NOTE: despite the column name, this is the share of sensors inside the hull
# that are *powered*: powered / (powered + outage) * 100.
pw_ta['percent_out_within_outage'] = (pw_ta['powered_size_within_outage_area'] / (pw_ta['powered_size_within_outage_area'] + pw_ta['outage_size']))*100

plt.figure(figsize=(10,10))
sns.scatterplot(x='outage_size', y='percent_out_within_outage', data=pw_ta)
# Title now agrees with the y-axis label and with the quantity computed above.
plt.title('Number of Sensors in Outage vs. Percent of Sensors Powered Within the Convex Hull of the Outage')
plt.xlabel('Number of Sensors in an Outage')
plt.ylabel('Percentage of Sensors Powered within Convex Hull')

#pick these points out and plot them 

In [34]:
# Pick out the extreme points of the scatter plot above to see what they represent.
# Series.max()/min() skip NaN and run vectorised; the builtin max()/min()
# iterate in Python and give order-dependent answers when NaN is present.
percent_within = pw_ta['percent_out_within_outage']

# Highest percentage overall.
top_left = pw_ta[percent_within == percent_within.max()]

# Among the rows with the lowest percentage, the one(s) with the most outage sensors.
zero_in_outage = pw_ta[percent_within == percent_within.min()]
bottom_right = zero_in_outage[zero_in_outage['outage_size'] == zero_in_outage['outage_size'].max()]

# Rows whose percentage lies strictly between 60 and 61.
top_right = pw_ta[(percent_within > 60) & (percent_within < 61)]


In [35]:
# Powered vs outage sensors for the first "top left" timestamp.
top_left_row = top_left.iloc[0]
ax = sns.scatterplot(x=top_left_row['powered_longitude'], y=top_left_row['powered_latitude'], label='powered')
sns.scatterplot(x=top_left_row['outage_longitude'], y=top_left_row['outage_latitude'], label='outage', ax=ax)
ax.set_title('Top Left Point: Unrelated Concurrent Outages')
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [36]:
# Powered vs outage sensors for the first "top right" timestamp.
top_right_row = top_right.iloc[0]
ax = sns.scatterplot(x=top_right_row['powered_longitude'], y=top_right_row['powered_latitude'], label='powered')
sns.scatterplot(x=top_right_row['outage_longitude'], y=top_right_row['outage_latitude'], label='outage', ax=ax)
ax.set_title('Top Right Point: Likely More than One Concurrent Outage')
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [37]:
# Powered vs outage sensors for the first "bottom right" timestamp.
bottom_right_row = bottom_right.iloc[0]
ax = sns.scatterplot(x=bottom_right_row['powered_longitude'], y=bottom_right_row['powered_latitude'], label='powered')
sns.scatterplot(x=bottom_right_row['outage_longitude'], y=bottom_right_row['outage_latitude'], label='outage', ax=ax)
ax.set_title('Bottom Right Point: Likely a Singular Related Outage')
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [38]:
# Distribution of percent_out_within_outage across all timestamps.
ax = plt.gca()
ax.hist(pw_ta['percent_out_within_outage'], bins=40)
ax.set_title('Percentage of Area Powered Within Convex Hull of Outage Sensors for Sensors in July 2018')
ax.set_xlabel('Percent of Sensors Experiencing Outage')

In [39]:
# Same distribution, restricted to timestamps below 20 percent.
below_20_mask = pw_ta['percent_out_within_outage'] < 20
les_than_20 = pw_ta.loc[below_20_mask]

ax = plt.gca()
ax.hist(les_than_20['percent_out_within_outage'], bins=40)
ax.set_title('Percentage of Area Powered Within Convex Hull of Outage Sensors for Sensors in July 2018')
ax.set_xlabel('Percent of Sensors Experiencing Outage')

In [40]:
# Zoom in on the timestamps where fewer than 3% of sensors report an outage.
# Note that this is bimodal: both low and medium voltage outages are clearly visible.
below_3_mask = pw_ta['percent_not_powered'] < 3
less_than_3_percent = pw_ta.loc[below_3_mask]

ax = plt.gca()
ax.hist(less_than_3_percent['percent_not_powered'], bins=40)
ax.set_title('Percentage of Area not Powered for Sensors in July 2018')
ax.set_xlabel('Percent of Sensors Experiencing Outage')

In [24]:
# Zoom in on the timestamps where fewer than 1% of sensors report an outage.
# Stored under its own name: reusing `less_than_3_percent` mislabelled this
# subset and overwrote the 3% subset from the previous cell.
less_than_1_percent = pw_ta[pw_ta['percent_not_powered'] < 1]

plt.hist(less_than_1_percent['percent_not_powered'])
plt.title('Percentage of Area not Powered for Sensors in July 2018')
plt.xlabel('Percent of Sensors Experiencing Outage')

In [25]:
# Outage hull area against the percentage of sensors not powered, one point
# per timestamp.
ax = plt.gca()
ax.scatter(pw_ta['convex_area_outage'], pw_ta['percent_not_powered'])
ax.set_title('Outage Area vs. Percent of Sensors Experiencing Outage')
ax.set_xlabel('Convex Area of Outage')
ax.set_ylabel('Percentage of Sensors Not Powered')

In [26]:
# Percentage of sensors reporting an outage over time for the week of 7/21/2018.
week3 = pw_ta[(pw_ta['time'] > '2018-07-21 00:00:00') & (pw_ta['time'] < '2018-07-28 00:00:00')]

# Plot against real datetimes. pd.to_datetime leaves datetime data unchanged;
# if 'time' holds strings (TODO confirm the dtype), this avoids treating every
# timestamp as its own category, which is slow and gives unreadable tick labels.
fig, ax = plt.subplots()
ax.scatter(pd.to_datetime(week3['time']), week3['percent_not_powered'])
ax.set_xlabel('time', labelpad=24)
ax.set_ylabel('percentage of sensors that were off')
# The filter above selects July 2018, so the title says 2018 (was 2017).
ax.set_title('Percentage of outages vs. time for week of 7/21/2018')
fig.autofmt_xdate()

In [27]:
# Every reading from 2018-07-01 00:00:00 through 2018-07-02 00:00:00 inclusive.
from_start = pw['time'] >= '2018-07-01 00:00:00'
until_end = pw['time'] <= '2018-07-02 00:00:00'
july1 = pw.loc[from_start & until_end]

ax = plt.gca()
ax.scatter(july1['powered_longitude'], july1['powered_latitude'], c='b', label='powered')
ax.scatter(july1['outage_longitude'], july1['outage_latitude'], c='r', label='outage')
ax.set_title('Powered Sensors vs Outages: 2018-07-01')
ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

In [28]:
# KDE over the distinct outage locations; each location counts once, so this
# ignores how regularly it had an outage.
location_keys = ['outage_longitude', 'outage_latitude']
unique_outage_locations = pw.groupby(location_keys).nunique()
times_per_location = unique_outage_locations['time']
density = times_per_location.reset_index().set_index('time')
sns.kdeplot(density['outage_longitude'], density['outage_latitude'], cmap='Reds', shade=True, label='outages')
plt.gca().set_title('Kernel Density Estimation Plot of Outage Locations in July 2018')

In [29]:
# One row per distinct (outage location, timestamp) combination.
# This is the same set of rows the earlier groupby -> apply(pd.Series) ->
# merge -> melt -> dropna chain produced, without building a frame as wide as
# the busiest location's timestamp list. Rows with a missing location or
# timestamp are dropped, as they were before; row order and index differ.
outage_regularity = (
    pw[['outage_longitude', 'outage_latitude', 'time']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
outage_regularity.head()

In [30]:
# KDE weighted by how often each location had an outage: one point per
# (location, timestamp) row of outage_regularity.
regular_lons = outage_regularity['outage_longitude']
regular_lats = outage_regularity['outage_latitude']
sns.kdeplot(regular_lons, regular_lats, cmap='Reds', shade=True, label='outages')
ax = plt.gca()
ax.set_title('Kernel Density Estimation Plot of the Frequency of Outages in July 2018')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

In [31]:
# Same frequency-weighted KDE, zoomed in on the densest region near the top.
regular_lons = outage_regularity['outage_longitude']
regular_lats = outage_regularity['outage_latitude']
sns.kdeplot(regular_lons, regular_lats, cmap='Reds', shade=True, label='outages')
ax = plt.gca()
ax.set_title('Kernel Density Estimation Plot of the Regularity of Outages in July 2018')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_ylim(5.665, 5.685)
ax.set_xlim(-0.295, -0.255)

In [32]:
# Warning: this takes a long time to run, and the plot is too noisy to derive
# much meaning from it.
sns.kdeplot(pw['powered_longitude'], pw['powered_latitude'], cmap='Blues')
# The data set and every other title in this notebook are July 2018 (was 2019).
plt.title('KDE Plot of Powered Sensors in July 2018')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

In [33]:
# KDE of powered sensors vs. outages at one timestamp: 2018-07-01 12:53:00.
# De-duplicate the outage coordinates as (lon, lat) pairs. Calling .unique()
# on each axis separately breaks the pairing and can return arrays of
# different lengths. Rows without an outage coordinate are dropped.
time0_outages = time0[['outage_longitude', 'outage_latitude']].dropna().drop_duplicates()
sns.kdeplot(time0['powered_longitude'], time0['powered_latitude'], cmap='Blues', label='powered')
sns.kdeplot(time0_outages['outage_longitude'], time0_outages['outage_latitude'], cmap='Reds', label='outage')
plt.title('KDE Plot of Powered Sensors vs. Outages on 2018-07-01 12:53:00')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

In [34]:
# KDE of powered sensors vs. outages over one day: July 1 2018.
# Several sensors share a latitude or a longitude, so .unique() cannot be
# applied per axis: it would drop real sensors and mis-pair the coordinates.
# Both layers are therefore de-duplicated as (lon, lat) pairs.
# TODO: look into the count discrepancy explored in the cells below this plot.
july1_powered = july1[['powered_longitude', 'powered_latitude']].dropna().drop_duplicates()
july1_outages = july1[['outage_longitude', 'outage_latitude']].dropna().drop_duplicates()
sns.kdeplot(july1_powered['powered_longitude'], july1_powered['powered_latitude'], cmap='Blues', label='powered')
sns.kdeplot(july1_outages['outage_longitude'], july1_outages['outage_latitude'], cmap='Reds', label='outage')
plt.title('KDE Plot of Powered Sensors vs. Outages on 2018-07-01')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

In [None]:
# Distinct powered (lon, lat) pairs seen on July 1, split into x and y sequences.
july1_pair_lons, july1_pair_lats = zip(*july1['powered_pair'].unique())
plt.scatter(july1_pair_lons, july1_pair_lats)

In [None]:
# Number of distinct powered longitudes on July 1.
july1['powered_longitude'].unique().size

In [None]:
# Number of readings on July 1.
july1.shape[0]

In [None]:
# Number of distinct outage latitudes on July 1.
july1['outage_latitude'].unique().size

In [None]:
# Per-timestamp count of distinct values in every column of the July 1 readings.
july1.groupby('time').nunique()

Next steps:
    1) define a smaller bounding box of lat and long coords and plot within that region
    2) figure out what further analysis can be done with pw_ta
    3) make sure this is transferable to larger sets of data
    4) learn GIS
    5) remediate the overplotting in the scatter plots
    6) look into making maps with geopandas

In [None]:
# Manual cross-check: which powered coordinates at time8 fall strictly inside
# the *bounding box* of the outage geometry (a looser test than hull containment).

# shapely bounds are (minx, miny, maxx, maxy) = (min lon, min lat, max lon, max lat).
# They do not change per point, so read them once.
min_long, min_lat, max_long, max_lat = time8['geometry'].values[0].bounds

inside_pairs = []
for point in time8['powered_poly'].values[0]:
    long = point[0]
    lat = point[1]
    if (min_long < long < max_long) and (min_lat < lat < max_lat):
        inside_pairs.append((long, lat))

# De-duplicate whole (lon, lat) pairs, keeping first-seen order. De-duplicating
# longitudes and latitudes separately drops sensors that share one coordinate
# and mis-pairs the rest when the two lists are zipped back together.
check_inside = list(dict.fromkeys(inside_pairs))
in_long = [pair[0] for pair in check_inside]
in_lat = [pair[1] for pair in check_inside]

check_inside
    
#ok down here, we may need to zip together the outage_poly list with the new long/lat pairs 
#once you have a good 'check_inside' list, you can go ahead and confirm that the new convex hull is the same as the old