In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

  from numpy.core.umath_tests import inner1d


In [2]:
data = pd.read_pickle('../../data/processed/all_samples.pickle')
data['datetime'] = pd.to_datetime(data.date)
data['day'] = data.datetime.dt.weekday_name
data = pd.get_dummies(data, prefix='day', columns=['day'])

In [3]:
features = ['hour',
            'daylight_yn',
            'holiday_yn',
            'rush_hour_yn',
            'temp',
            'wind_speed',
            'precipitation',
            'road_length',
            'class_freeway',
            'class_local',
            'class_major',
            'class_other',
            'class_unimproved',
            'day_Monday',
            'day_Tuesday',
            'day_Wednesday',
            'day_Thursday',
            'day_Friday',
            'day_Saturday',
            'day_Sunday']

labels = 'accident_yn'

In [4]:
X = data[features]
y = data[labels]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [8]:
f = open('../../data/processed/rf_random.pickle', 'rb')
rf = pickle.load(f)
f.close()

In [9]:
model = rf.best_estimator_

In [38]:
df = X_test.copy()

In [39]:
df['predicted'] = model.predict(X_test)

In [47]:
results = df.join(y_test)

In [55]:
df_compare = results.join(data[['segment_id', 'datetime']])

In [71]:
target_columns = ['segment_id',
                  'datetime',
                  'hour',
                  'daylight_yn',
                  'temp',
                  'wind_speed',
                  'predicted',
                  'accident_yn']

In [72]:
df_output = df_compare[target_columns].reset_index(drop=True)

In [73]:
rename_columns = ['segment_id',
                  'date',
                  'hour',
                  'daylight_yn',
                  'temp',
                  'wind_speed',
                  'predicted',
                  'actual']

In [74]:
df_output.columns = rename_columns

In [76]:
street_info = pd.read_pickle('../../data/interim/features/streets_by_nhood.pickle')

In [87]:
street_info = street_info.set_index('segment_id')

In [90]:
street_class = street_info[['class_Freeway', 'class_Local', 'class_Major Arterial', 'class_Other', 'class_Umimproved']].idxmax(axis=1)

In [94]:
street_class.name = 'class'

In [96]:
streets = street_info.join(street_class)

In [99]:
streets = streets[['fullname', 'class', 'shape_leng', 'geometry']]

In [107]:
streets['class'] = streets['class'].str[6:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [108]:
streets

Unnamed: 0_level_0,fullname,class,shape_leng,geometry
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
832.0_1.0,Avenue A,Local,933.169106,"(LINESTRING (-90.116510797142 30.008607463472,..."
832.0_2.0,Avenue B,Local,699.617547,(LINESTRING (-90.11746336900001 30.00866154986...
797.0_3.0,Rue Andr?e,Local,221.205674,LINESTRING (-89.99432737558099 29.915348012997...
797.0_4.0,Rue Colette,Local,449.876776,(LINESTRING (-89.99589918472201 29.91541454288...
797.0_5.0,Rue Delphine,Local,511.208548,LINESTRING (-90.00047936777899 29.917374968875...
797.0_6.0,Rue Denise,Local,451.475945,"LINESTRING (-89.996629116007 29.917237275266, ..."
832.0_7.0,Rue Le Ville,Local,136.086814,LINESTRING (-90.11534220490999 30.018289487643...
797.0_8.0,Rue Michelle,Local,451.588233,"LINESTRING (-89.998737710192 29.917325564985, ..."
797.0_9.0,Rue Mignon,Local,451.038027,LINESTRING (-89.99962584102499 29.917359156567...
797.0_10.0,Rue Nadine,Local,221.380264,"LINESTRING (-89.995119220647 29.91538009168, -..."


In [132]:
# True Positive
df_output.loc[(df_output['predicted'] == 1) & (df_output['actual'] == 1), 'prediction'] = 'True Positive'

# False Positive
df_output.loc[(df_output['predicted'] == 1) & (df_output['actual'] == 0), 'prediction'] = 'False Positive'

# False Negative
df_output.loc[(df_output['predicted'] == 0) & (df_output['actual'] == 1), 'prediction'] = 'False Negative'

# True Negative
df_output.loc[(df_output['predicted'] == 0) & (df_output['actual'] == 0), 'prediction'] = 'True Negative'

In [137]:
# True Positive
df_output['tp'] = np.where((df_output['predicted'] == 1) & (df_output['actual'] == 1), 1, 0)

# False Positive
df_output['fp'] = np.where((df_output['predicted'] == 1) & (df_output['actual'] == 0), 1, 0)

# False Negative
df_output['fn'] = np.where((df_output['predicted'] == 0) & (df_output['actual'] == 1), 1, 0)

# True Negative
df_output['tn'] = np.where((df_output['predicted'] == 0) & (df_output['actual'] == 0), 1, 0)

In [139]:
df_output.head()

Unnamed: 0,segment_id,date,hour,daylight_yn,temp,wind_speed,predicted,actual,prediction,tp,fp,fn,tn
0,812.0_2334.0,2012-11-27,21,0,52.0,9.0,1,1,True Positive,1,0,0,0
1,812.0_594.0,2017-02-04,4,0,46.0,13.0,0,0,True Negative,0,0,0,1
2,796.0_106.0,2012-02-18,17,1,58.0,14.0,0,0,True Negative,0,0,0,1
3,839.0_1034.0,2016-06-29,5,1,80.0,3.0,0,0,True Negative,0,0,0,1
4,839.0_2099.0,2017-09-29,10,1,85.0,6.0,1,1,True Positive,1,0,0,0


In [141]:
street_accident_predictions = pd.merge(streets, df_output, how='left', on='segment_id')

In [142]:
street_accident_predictions.columns

Index(['segment_id', 'fullname', 'class', 'shape_leng', 'geometry', 'date',
       'hour', 'daylight_yn', 'temp', 'wind_speed', 'predicted', 'actual',
       'prediction', 'tp', 'fp', 'fn', 'tn'],
      dtype='object')

In [159]:
street_date_groups = street_accident_predictions[['segment_id', 'date', 'tp', 'fp', 'fn', 'tn']]

In [160]:
street_date_groups['year'] = street_date_groups.date.dt.year
street_date_groups['month'] = street_date_groups.date.dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [164]:
street_date_groups = street_date_groups[['segment_id', 'tp', 'fp', 'fn', 'tn', 'year', 'month']]

In [172]:
street_predictions = street_date_groups[['segment_id', 'tp', 'fp', 'fn', 'tn']].groupby('segment_id').sum()

In [175]:
street_output = streets.join(street_predictions)

In [178]:
street_output = street_output.reset_index()

In [179]:
street_output.to_pickle('../../data/processed/street_predictions.pickle')