In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
from scipy.stats import ttest_ind
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
import hdbscan

In [94]:
# df=pd.read_csv('data/2014/accident.csv')

In [211]:
# df.info()

In [96]:
# def getDistFromRow(dftmp,loc):
#     location=['LONGITUD','LATITUDE']
#     return np.linalg.norm(dftmp.loc[:,location]-loc,axis=1)
# 
# def LookupLocations(state,county,city,dflookup):
#     if dflookup is None:
#         dflookup=pd.read_csv(basefile,sep='|')
#     return dflookup.loc[(dflookup.STATE_NUMERIC==state) & (dflookup.COUNTY_NUMERIC==county),['PRIM_LONG_DEC','PRIM_LAT_DEC']].median()
# def fillLocations(dftmp,basefile='data/NationalFile_20190501.txt'):
#     dflup=pd.read_csv(basefile,sep='|')
#     state=dftmp.columns.tolist('STATE')
#     county=dftmp.columns.tolist('COUNTY')
#     city=dftmp.columns.tolist('CITY')
#     for i,v in dftmp.loc[(dftmp.LATITUDE>=99) & (dftmp.LONGITUD>=999),:].iterrows():
#         loc=LookupLocations(state,county,city,dflup)

### Helper Function Below

In [208]:
# Data Clean up - ONLY THESE FUNCTIONS ARE USED


def cleanLocs(dftmp):
    return dftmp.loc[(dftmp.latitude<=77) & (dftmp.longitud<=999),:]

def getalldata(filename='accident.csv'):
    years=range(2015,2018)
    dftmp=pd.read_csv('data/2014/'+filename)
    dftmp['id']=dftmp.ST_CASE.apply(lambda x: "2014."+str(int(x)))
    for i in years:
        dfz=pd.read_csv('data/'+str(i)+'/'+filename)
        dfz['id']=dfz.ST_CASE.apply(lambda x: str(i) + "." + str(int(x)))
        dftmp=dftmp.append(dfz,sort=False) 
    dftmp.columns=[ i.lower() for i in dftmp.columns.tolist()]
    return dftmp


# Please ignore this function for now - need to be edited to figure out how we'll deal with multiple columns
def mergedata(dfmain,filename='distract.csv',columns=[]):
    years=range(2015,2018)
    dftmp=pd.read_csv('data/2014/'+filename.lower())
    dftmp.columns=[i.lower for i in dftmp.columns.tolist()]
    dftmp['id']=dftmp.ST_CASE.apply(lambda x: "2014."+str(int(x)))
    for i in years:
        dfz=pd.read_csv('data/'+str(i)+'/'+filename)
        dfz['id']=dfz.ST_CASE.apply(lambda x: str(i) + "." + str(int(x)))
        dftmp=dftmp.append(dfz)
    dfmain=dfmain.set_index('id')
    dftmp=dftmp.set_index('id')
    return dfmain.merge(dftmp)


def cyclic(dftmp,columns=['ARR_HOUR'],periods=[24]):
    for c,p in zip(columns,periods):
        tgt=dftmp[c]
        if min(tgt)!=0:
            tgt=tgt-min(tgt)
        dftmp[c+'_sin_cycle']=np.sin(2*np.pi*tgt/p)
        dftmp[c+'_cos_cycle']=np.cos(2*np.pi*tgt/p)
        
def create_plot_labels(dforig):
    dftmp=dforig.copy()
    dftmp['plot_labels']=dftmp.sig*dftmp.cluster
    return dftmp

def removeNoHourAndMinutes(dforig):
    dftmp=dforig.copy()
    crit=(dftmp.hour<24) & (dftmp.minute<60)
    dftmp=dftmp.loc[crit,:]
    return dftmp

def createTimestamp(dforig):
    dftmp=dforig.copy()
    dftmp['tstamp']=[pd.Timestamp(year=v.year,month=v.month,day=v.day,hour=v.hour,minute=v.minute) for i,v in df.iterrows()]
    return dftmp
    
def removeUneededAccidentColumns(dforig):
    dftmp=dforig.copy()
    return dftmp.loc[:,['id','tstamp','state','longitud','latitude','peds','day','month','year','day_week',
                         'hour', 'minute','route','harm_ev','man_coll','reljct2','typ_int',
                         'lgt_cond','weather','cf1','drunk_dr']]

def calculateTopNCatPct(dforig,N=5):
    l=[]
    DO_NOT_CALC=['id','tstamp','longitud','latitude','day','month','year','day_week','hour','minute']
    for i in dforig.columns.tolist():
        if i not in DO_NOT_CALC:
            l.append([i,(dforig[i].value_counts()/dforig.shape[0])[0:N].sum()])
    return pd.DataFrame(l,columns=['variable','percent'])

def binarizeVariables(dforig,variable_list=['peds','route'],TopN=5,drop_original_variables=True):
    dftmp=dforig.copy()
    for col in variable_list:
        for value in dftmp[col].value_counts().index[0:TopN].tolist():
            dftmp[col+'_'+str(value)]= dftmp[col]==value
    if drop_original_variables:
        dftmp=dftmp.drop(variable_list,axis=1)
    return dftmp

In [None]:
# Clustering Functions

class convexhull(ConvexHull):
    def in_hull(self,p):
        """
        Test if points in `p` are in `hull`

        `p` should be a `NxK` coordinates of `N` points in `K` dimensions
        `hull` is either a scipy.spatial.Delaunay object or the `MxK` array of the 
        coordinates of `M` points in `K`dimensions for which Delaunay triangulation
        will be computed
        """
        hull=Delaunay(self.points)

        return hull.find_simplex(p)>=0

def cluster_all_points(dfsrc,filter_rows,LongitudeLatitude=LOCS,hdbscan_params={'min_cluster_size':30,'gen_min_span_tree':True, 'metric':'manhattan'}):
    '''
    This function takes a table with longitude, latitude, clusters them using hdbscan, then generate the cluster boundaries
    using a convex hull and label all points in the convex hull to the appropriate cluster. Note that cluster = -1 indicates that it is not
    in a cluster. The returned dataframe will have 1 additonal field - 'cluster', indicating the cluster it is in.
    
    dfsrc - the pandas datframe will all the information
    filter_rows- a boolean list that should have the same number of data points indicating which rows are considered "Active"
                (ie. if in the dataframe of dfsrc, if a row is considered "on", then it should be True, else False)
    LongitudeLatitude - list of [longitude, latitude] parameter
    hdbscan - the list of parametrs for hdbscan
    
    Example - dfret=cluster_all_points(df,df.WK_ZONE==1)
    '''
    import hdbscan
    dftmp=dfsrc.copy()
    dftgt=dftmp.loc[filter_rows,LOCS]
    clusterer = hdbscan.HDBSCAN(**hdbscan_params)
    clusterer.fit(dftgt)
    dftgt['cluster']=clusterer.labels_ 
    dftmp['cluster']=-1
    dftmp.loc[list(dftgt.index),'cluster']=dftgt['cluster']
    # Note that at this point, only the points found to be in a cluster and the filter_row == TRUE have an active cluster number, not we need to find
    # clusters for points which have filter_row == FALSE but is inside the cluster. We do this using a convex hull.
    for i in dftmp['cluster'].unique():
        if i!=-1:
            dftgt=dftmp.loc[dftmp['cluster']==i,LongitudeLatitude]
            hull=convexhull(dftgt)
            hull.close()
            dftmp.loc[hull.in_hull(dftmp.loc[:,LongitudeLatitude].values),'cluster']=i
    # Now, all the points of the dataframe are labeled
    return dftmp

from scipy.stats import ttest_ind
def ttest(dfsrc,label,clusterfield='cluster',notinclusternumber=-1):
    '''
    This function takes a pandas dataframe with at least 1 field - clusterfield (the field indicating the cluster which the
    data point is in). The it performs a ttest for significance for each cluster against data in non-clusters and returns
    the tstat and pvalue for the test in addtional fields. For each cluster, it will have the same tstat and pvalue. Note that
    the cluster is only tested again points not in cluster (ie. cluster = -1). The worries about validity of the t-test if some
    data points are int the significance range should be assuaged by the argument that if a given cluster is deemed insignificant,
    then the cluster shouldn't be different mean the non-clustered; therefore, the test against a sample of the total population
    is still valid (similar to bootstrapping...)

    dfsrc - pandas dataframe
    label - list of booleans - should be the same length as dfsrc - it labels the data points to be true or not true
    clusterfield - the field parameter to use if the cluster field is not cluster
    notinclusternumber - the cluster number to use to indicate in the clusterfield that it's not in a cluster
    
    Example: ttest(df,df.WRK_ZONE==1)
    '''
    dftmp=dfsrc.copy()
    dftmp['label']=label
    dfnotincluster=dftmp.loc[dftmp[clusterfield]==notinclusternumber,'label']
    dftmp['tstat']=-1
    dftmp['pvalue']=-1
    for i in dftmp[clusterfield].unique():
        if i!=notinclusternumber:
            dfcluster=dftmp.loc[dftmp[clusterfield]==i,'label']
            tstat,pvalue=ttest_ind(dfcluster,dfnotincluster)
            dftmp.loc[dftmp[clusterfield]==i,['tstat']]=tstat
            dftmp.loc[dftmp[clusterfield]==i,['pvalue']]=pvalue
    return dftmp

def signficant_clusters(dfsrc,number_of_hypothesis=1, alpha=0.05):
    '''
    The function returns a dataframe with the field 'pvalue'. It returns a dataframe with a boolean field 'b_sig' indicating if it's significant for not.
    The function adjusts for bonferroni correction.
    
    dfsrc - the dataframe
    run_number - what test number is this (used for bonferroni correction)
    alpha - alpha level
    
    Addendum:
    With respect to FWER (family wise error rate) control, the Bonferroni correction can be conservative if there are a large number of tests 
    and/or the test statistics are positively correlated.
    
    The correction comes at the cost of increasing the probability of producing false negatives, i.e., reducing statistical power.[9]

    There is not a definitive consensus on how to define a family in all cases, and adjusted test results may vary depending on the 
    number of tests included in the family of hypotheses.[citation needed]

    Note that these criticisms apply to FWER control in general, and are not specific to the Bonferroni correction.
    
    We may want to look into False Discovery Rate (Benjamini-Hochberg Procedure) correction procedure if bonferroni is too conservative and the
    test are correlated with each other
    
    https://www.statisticshowto.datasciencecentral.com/benjamini-hochberg-procedure/

    '''
    dftmp=dfsrc.copy()
    bonferroni_correction=alpha/number_of_hypothesis
    alpha_critical=1-(1-bonferroni_correction)**number_of_hypothesis
    dftmp['sig']=dftmp['pvalue']<=alpha_critical
    return dftmp

### Binarize, cluster, and test for significance example - workflow

In [218]:
df=getalldata()
df=removeNoHourAndMinutes(df)
df=removeUneededAccidentColumns(df)
df=cleanLocs(df)
df=createTimestamp(df)

In [219]:
calculateTopNCatPct(df)

Unnamed: 0,variable,percent
0,state,0.360543
1,peds,0.999492
2,route,0.909624
3,harm_ev,0.732488
4,man_coll,0.982728
5,reljct2,0.968195
6,typ_int,0.9974
7,lgt_cond,0.990491
8,weather,0.978527
9,cf1,0.977035


In [221]:
df=binarizeVariables(df)
df.head()

Unnamed: 0,id,tstamp,state,longitud,latitude,peds,day,month,year,day_week,...,peds_0,peds_1,peds_2,peds_3,peds_4,route_3,route_6,route_2,route_4,route_1
0,2014.10001,2014-01-01 01:15:00,1,-86.0,35.0,0,1,1,2014,4,...,True,False,False,False,False,False,False,False,True,False
1,2014.10002,2014-01-01 13:30:00,1,-88.0,34.0,0,1,1,2014,4,...,True,False,False,False,False,False,False,False,True,False
2,2014.10003,2014-01-01 03:07:00,1,-88.0,33.0,0,1,1,2014,4,...,True,False,False,False,False,False,False,True,False,False
3,2014.10004,2014-01-02 09:00:00,1,-86.0,33.0,0,2,1,2014,5,...,True,False,False,False,False,False,False,True,False,False
4,2014.10005,2014-01-02 16:30:00,1,-87.0,34.0,0,2,1,2014,5,...,True,False,False,False,False,False,False,False,True,False


In [None]:
hdbscan_params={'min_cluster_size':100,'gen_min_span_tree':True, 'metric':'manhattan'}
dfwk=cluster_all_points(df,df.peds_0==1,hdbscan_params=hdbscan_params)
dfwk=ttest(dfwk,dfwk.peds_0==1)
dfwk=signficant_clusters(dfwk,1)

In [213]:
### End Example

Unnamed: 0,variable,percent
0,state,0.360543
1,peds,0.999492
2,route,0.909624
3,harm_ev,0.732488
4,man_coll,0.982728
5,reljct2,0.968195
6,typ_int,0.9974
7,lgt_cond,0.990491
8,weather,0.978527
9,cf1,0.977035


In [17]:
df=getalldata()
df=mergedata(df)
df=cleanDFLocs(df)

In [18]:
from bokeh.sampledata import us_states
import bokeh.models as bmo
from bokeh.palettes import d3
import bokeh.plotting as bpl
import bokeh.models as bmo
from bokeh.palettes import d3

bpl.output_notebook()

def plot_map(dfsrc,color='navy',title="US Scatter Map"):
   
    dftmp=dfsrc.copy()
   
    if type(color) is not str:
        dftmp['cluster']=[str(i) for i in color]
        
    source = bpl.ColumnDataSource(dftmp)

    us_st = us_states.data.copy()

    # del us_states["HI"]
    # del us_states["AK"]

    # separate latitude and longitude points for the borders
    #   of the states.
    state_xs = [us_st[code]["lons"] for code in us_st]
    state_ys = [us_st[code]["lats"] for code in us_st]

    # init figure
    p = bpl.figure(title=title, 
               toolbar_location="left", plot_width=1100, plot_height=700, x_range=(-130,-65),y_range=(22,53))

    # Draw state lines
    p.patches(state_xs, state_ys, fill_alpha=0.0,
        line_color="#884444", line_width=1.5)

    if len(dftmp['cluster'].unique())==1:
        color='navy'
    
    # The scatter markers
    if type(color) is not str:
        palette = d3['Category10'][len(dftmp['cluster'].unique())]
        color_map = bmo.CategoricalColorMapper(factors=dftmp['cluster'].unique(), palette=palette)
        p.scatter(x='longitud',y='latitude',color={'field':'cluster','transform':color_map},legend='cluster',source=source,alpha=0.5)
    else:
        p.scatter(x='logitude',y='latitude',color=color,source=source,alpha=0.5)
    bpl.show(p)

In [20]:
# plot_map(df)

In [21]:
LOCS=['logitude','latitude']
#plot_map(df.loc[df.accident_WRK_ZONE==1,LOCS])

In [26]:
import hdbscan

dfwk=df.loc[df.WRK_ZONE==1,LOCS]
clusterer = hdbscan.HDBSCAN(min_cluster_size=300, gen_min_span_tree=True,metric='manhattan')
clusterer.fit(dfwk)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=True, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='manhattan', min_cluster_size=300, min_samples=None, p=None,
    prediction_data=False)

In [27]:
plot_map(dfwk,clusterer.labels_,title='Fatal Accident Road condition Clusters')

In [28]:
def cluster_all_points(dfsrc,filter_rows,LongitudeLatitude=LOCS,hdbscan_params={'min_cluster_size':30,'gen_min_span_tree':True, 'metric':'manhattan'}):
    '''
    This function takes a table with longitude, latitude, clusters them using hdbscan, then generate the cluster boundaries
    using a convex hull and label all points in the convex hull to the appropriate cluster. Note that cluster = -1 indicates that it is not
    in a cluster. The returned dataframe will have 1 additonal field - 'cluster', indicating the cluster it is in.
    
    dfsrc - the pandas datframe will all the information
    filter_rows- a boolean list that should have the same number of data points indicating which rows are considered "Active"
                (ie. if in the dataframe of dfsrc, if a row is considered "on", then it should be True, else False)
    LongitudeLatitude - list of [longitude, latitude] parameter
    hdbscan - the list of parametrs for hdbscan
    
    Example - dfret=cluster_all_points(df,df.WK_ZONE==1)
    '''
    import hdbscan
    dftmp=dfsrc.copy()
    dftgt=dftmp.loc[filter_rows,LOCS]
    clusterer = hdbscan.HDBSCAN(**hdbscan_params)
    clusterer.fit(dftgt)
    dftgt['cluster']=clusterer.labels_ 
    dftmp['cluster']=-1
    dftmp.loc[list(dftgt.index),'cluster']=dftgt['cluster']
    # Note that at this point, only the points found to be in a cluster and the filter_row == TRUE have an active cluster number, not we need to find
    # clusters for points which have filter_row == FALSE but is inside the cluster. We do this using a convex hull.
    for i in dftmp['cluster'].unique():
        if i!=-1:
            dftgt=dftmp.loc[dftmp['cluster']==i,LongitudeLatitude]
            hull=convexhull(dftgt)
            hull.close()
            dftmp.loc[hull.in_hull(dftmp.loc[:,LongitudeLatitude].values),'cluster']=i
    # Now, all the points of the dataframe are labeled
    return dftmp


In [35]:
from scipy.stats import ttest_ind
def ttest(dfsrc,label,clusterfield='cluster',notinclusternumber=-1):
    '''
    This function takes a pandas dataframe with at least 1 field - clusterfield (the field indicating the cluster which the
    data point is in). The it performs a ttest for significance for each cluster against data in non-clusters and returns
    the tstat and pvalue for the test in addtional fields. For each cluster, it will have the same tstat and pvalue. Note that
    the cluster is only tested again points not in cluster (ie. cluster = -1). The worries about validity of the t-test if some
    data points are int the significance range should be assuaged by the argument that if a given cluster is deemed insignificant,
    then the cluster shouldn't be different mean the non-clustered; therefore, the test against a sample of the total population
    is still valid (similar to bootstrapping...)

    dfsrc - pandas dataframe
    label - list of booleans - should be the same length as dfsrc - it labels the data points to be true or not true
    clusterfield - the field parameter to use if the cluster field is not cluster
    notinclusternumber - the cluster number to use to indicate in the clusterfield that it's not in a cluster
    
    Example: ttest(df,df.WRK_ZONE==1)
    '''
    dftmp=dfsrc.copy()
    dftmp['label']=label
    dfnotincluster=dftmp.loc[dftmp[clusterfield]==notinclusternumber,'label']
    dftmp['tstat']=-1
    dftmp['pvalue']=-1
    for i in dftmp[clusterfield].unique():
        if i!=notinclusternumber:
            dfcluster=dftmp.loc[dftmp[clusterfield]==i,'label']
            tstat,pvalue=ttest_ind(dfcluster,dfnotincluster)
            dftmp.loc[dftmp[clusterfield]==i,['tstat']]=tstat
            dftmp.loc[dftmp[clusterfield]==i,['pvalue']]=pvalue
    return dftmp

In [36]:
def signficant_clusters(dfsrc,number_of_hypothesis=1, alpha=0.05):
    '''
    The function returns a dataframe with the field 'pvalue'. It returns a dataframe with a boolean field 'b_sig' indicating if it's significant for not.
    The function adjusts for bonferroni correction.
    
    dfsrc - the dataframe
    run_number - what test number is this (used for bonferroni correction)
    alpha - alpha level
    
    Addendum:
    With respect to FWER (family wise error rate) control, the Bonferroni correction can be conservative if there are a large number of tests 
    and/or the test statistics are positively correlated.
    
    The correction comes at the cost of increasing the probability of producing false negatives, i.e., reducing statistical power.[9]

    There is not a definitive consensus on how to define a family in all cases, and adjusted test results may vary depending on the 
    number of tests included in the family of hypotheses.[citation needed]

    Note that these criticisms apply to FWER control in general, and are not specific to the Bonferroni correction.
    
    We may want to look into False Discovery Rate (Benjamini-Hochberg Procedure) correction procedure if bonferroni is too conservative and the
    test are correlated with each other
    
    https://www.statisticshowto.datasciencecentral.com/benjamini-hochberg-procedure/

    '''
    dftmp=dfsrc.copy()
    bonferroni_correction=alpha/number_of_hypothesis
    alpha_critical=1-(1-bonferroni_correction)**number_of_hypothesis
    dftmp['sig']=dftmp['pvalue']<=alpha_critical
    return dftmp

In [214]:
# Workflow example
# z=cluster_all_points(df,df.WRK_ZONE==1)
# z=ttest(z,z.WRK_ZONE==1)
# z=signficant_clusters(z,2)

In [215]:
# z.cluster.value_counts()

## Workflow Example

In [216]:
df=getalldata()
df=mergedata(df)
df=cleanDFLocs(df)


FileNotFoundError: [Errno 2] File b'data/2015/distract.csv' does not exist: b'data/2015/distract.csv'

In [None]:
dfwk.cluster.value_counts()

In [None]:
dfwk=create_plot_labels(dfwk)

In [None]:
plot_map(dfwk,dfwk.plot_labels,title='Fatal Accident Road condition Clusters')

In [57]:
dfwk.head().T

Unnamed: 0,0,1,2,3,4
ST_CASE,10001,10001,10001,10001,10001
accident_ARR_HOUR,1,1,1,1,2
accident_ARR_MIN,35,35,35,35,58
accident_CF1,0,0,0,0,0
accident_CF2,0,0,0,0,0
accident_CF3,0,0,0,0,0
accident_CITY,0,0,0,0,0
accident_COUNTY,71,71,71,71,127
accident_DAY,1,1,1,1,1
accident_DAY_WEEK,4,4,4,4,5


## End Example

In [122]:
#plot_map(dftmp,dftmp.cluster)

In [125]:
z=dfwk.loc[dfwk.cluster==3,LOCS]
hull=convexhull(z)
#hull=spatial.ConvexHull(z)
hull.close()

In [126]:
points=z.loc[:,LOCS].values

In [127]:
# plt.plot(points[hull.vertices,0], points[hull.vertices,1], 'r--', lw=2)
# plt.plot(points[hull.vertices[0],0], points[hull.vertices[0],1], 'ro')
# plt.plot(points[:,0],points[:,1],'bo')
# plt.show()

In [128]:
z=df.loc[hull.in_hull(df.loc[:,LOCS].values),:]

In [129]:
from scipy.stats import ttest_ind

In [137]:
ttest_ind(z.accident_WRK_ZONE,dfwk.loc[dfwk.cluster==-1,'accident_WRK_ZONE'])

KeyError: 'accident_WRK_ZONE'

In [144]:
LOCS=['accident_LONGITUD', 'accident_LATITUDE','cluster']
z.loc[:,LOCS].head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,accident_LONGITUD,accident_LATITUDE,cluster
0,-85.981408,34.623722,
1,-87.772117,34.397428,
2,-87.525911,33.197172,
3,-86.307831,33.196383,
4,-86.784592,34.180189,


In [139]:
df.loc[:,LOCS].head()

Unnamed: 0,accident_LONGITUD,accident_LATITUDE
0,-85.981408,34.623722
1,-87.772117,34.397428
2,-87.525911,33.197172
3,-86.307831,33.196383
4,-86.784592,34.180189


In [102]:
a=df.loc[~hull.in_hull(df.loc[:,LOCS].values),:]

In [103]:
a

Unnamed: 0,accident_STATE,ST_CASE,accident_VE_TOTAL,accident_VE_FORMS,accident_PVH_INVL,accident_PEDS,accident_PERNOTMVIT,accident_PERMVIT,accident_PERSONS,accident_COUNTY,...,accident_NOT_MIN,accident_ARR_HOUR,accident_ARR_MIN,accident_HOSP_HR,accident_HOSP_MN,accident_CF1,accident_CF2,accident_CF3,accident_FATALS,accident_DRUNK_DR
0,1.0,10001.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,71.0,...,99.0,1.0,35.0,99.0,99.0,0.0,0.0,0.0,1.0,1.0
1,1.0,10002.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,59.0,...,99.0,13.0,50.0,99.0,99.0,0.0,0.0,0.0,1.0,1.0
2,1.0,10003.0,2.0,2.0,0.0,0.0,0.0,7.0,7.0,125.0,...,99.0,3.0,10.0,99.0,99.0,0.0,0.0,0.0,2.0,0.0
3,1.0,10004.0,3.0,3.0,0.0,0.0,0.0,5.0,5.0,121.0,...,99.0,9.0,15.0,99.0,99.0,0.0,0.0,0.0,1.0,0.0
4,1.0,10005.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,43.0,...,99.0,12.0,45.0,88.0,88.0,0.0,0.0,0.0,1.0,0.0
5,1.0,10006.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,35.0,...,99.0,6.0,15.0,88.0,88.0,0.0,0.0,0.0,1.0,0.0
6,1.0,10007.0,3.0,2.0,1.0,0.0,0.0,2.0,2.0,97.0,...,99.0,18.0,48.0,88.0,88.0,0.0,0.0,0.0,1.0,1.0
7,1.0,10008.0,2.0,2.0,0.0,0.0,0.0,4.0,4.0,13.0,...,99.0,6.0,30.0,99.0,99.0,0.0,0.0,0.0,1.0,0.0
8,1.0,10009.0,3.0,3.0,0.0,0.0,0.0,5.0,5.0,127.0,...,99.0,15.0,51.0,88.0,88.0,0.0,0.0,0.0,1.0,0.0
9,1.0,10010.0,2.0,2.0,0.0,0.0,0.0,6.0,6.0,101.0,...,99.0,18.0,2.0,99.0,99.0,0.0,0.0,0.0,2.0,0.0


In [110]:
df.loc[~hull.in_hull(df.loc[:,LOCS].values),'accident_WRK_ZONE'].mean()

0.03680029558470349

In [32]:
~df.distract_MDRDSTRD.isna()

0          True
1         False
2         False
3         False
4          True
5         False
6         False
7         False
8          True
9         False
10        False
11        False
12         True
13        False
14        False
15        False
16         True
17        False
18        False
19        False
20         True
21        False
22        False
23        False
24         True
25        False
26        False
27        False
28         True
29        False
          ...  
762718    False
762719    False
762720    False
762721    False
762722    False
762723    False
762724    False
762725    False
762726    False
762727    False
762728    False
762729    False
762730    False
762731    False
762732    False
762733    False
762734    False
762735    False
762736    False
762737    False
762738    False
762739    False
762740    False
762741    False
762742    False
762743    False
762744    False
762745    False
762746    False
762747    False
Name: distract_MDRDSTRD,

10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10001.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10002.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10003.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10004.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10005.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10006.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0
10007.0



KeyboardInterrupt



In [64]:
dfwk.plot_labels.value_counts()

-1     243657
 0     129906
 9     108548
 6      37293
 18     33936
 3      32556
 23     31629
 21     23022
 25     22222
 15     20722
 12     15998
 17     15622
 14     15566
 4      11938
 10      9873
 11      7215
Name: plot_labels, dtype: int64