In [67]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Explicit dtypes for every typed column that is loaded. 'CRASH DATE' and
# 'CRASH TIME' are intentionally absent: they are read as-is and converted
# right after loading.
dtypes = {
    'COLLISION_ID': 'int64',
    'BOROUGH': 'string',
    'ZIP CODE': 'string',
    'LATITUDE': 'float64',
    'LONGITUDE': 'float64',
    'LOCATION': 'string',
    'ON STREET NAME': 'string',
    'CROSS STREET NAME': 'string',
    'OFF STREET NAME': 'string',
    'NUMBER OF PERSONS INJURED': 'float64',
    'NUMBER OF PERSONS KILLED': 'float64',
    'CONTRIBUTING FACTOR VEHICLE 1': 'string',
}

# Load the two date/time columns plus every column that has a declared dtype.
# (The order of names in `usecols` does not affect the resulting frame.)
crash_data = pd.read_csv(
    'nyc_crashes.csv',
    usecols=['CRASH DATE', 'CRASH TIME', *dtypes],
    dtype=dtypes,
)

# Convert the date column with pd.to_datetime, and reduce the '%H:%M' time
# strings to their time-of-day component via `.dt.time`.
crash_data = crash_data.assign(**{
    'CRASH DATE': lambda df: pd.to_datetime(df['CRASH DATE']),
    'CRASH TIME': lambda df: pd.to_datetime(df['CRASH TIME'], format='%H:%M').dt.time,
})

# TOTAL CRASHES GROUPED BY LONGITUDE AND LATITUDE
# crash_data['NUMBER OF CRASHES'] = 1
# lat_long_crashes = crash_data.dropna(subset = ['LATITUDE','LONGITUDE'])
# lat_long_crashes = crash_data.groupby(['LATITUDE', 'LONGITUDE'])['NUMBER OF CRASHES'].sum().reset_index()

# Latitude / longitude limits used below to decide whether a coordinate lies
# in the NYC area.
nyc_lat_min, nyc_lat_max = 40.4774, 40.9176
nyc_lon_min, nyc_lon_max = -74.2591, -73.7004

# X and y dataset: the coordinate, casualty-count and contributing-factor
# columns, cleaned in three steps.
Xandy = (
    crash_data[[
        'LATITUDE',
        'LONGITUDE',
        'NUMBER OF PERSONS INJURED',
        'NUMBER OF PERSONS KILLED',
        'CONTRIBUTING FACTOR VEHICLE 1',
    ]]
    # 1. Keep only rows in which all five fields are present.
    .dropna()
    # 2. Drop rows whose latitude AND longitude are both 0, i.e. keep a row
    #    when at least one of the two is non-zero.
    .loc[lambda df: df['LATITUDE'].ne(0.000000) | df['LONGITUDE'].ne(0.000000)]
    # 3. Drop coordinates outside the NYC limits (both bounds inclusive).
    .loc[lambda df: df['LATITUDE'].between(nyc_lat_min, nyc_lat_max)
                    & df['LONGITUDE'].between(nyc_lon_min, nyc_lon_max)]
)

# Decision Tree Algorithm
# A fixed random_state makes fitting reproducible: scikit-learn randomly
# permutes features at each split, so without a seed the fitted tree (and any
# score computed from it) can differ between runs on the same data.
model = DecisionTreeClassifier(random_state=42)

# Predictions for injured and killed
# Dataset for injured and killed: one row per distinct (LATITUDE, LONGITUDE)
# pair, holding the summed injured and killed counts of every crash recorded
# at that coordinate.
injured_killed = (
    Xandy
    .groupby(['LATITUDE', 'LONGITUDE'])[
        ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']
    ]
    .sum()
    .reset_index()
)

# Input dataset for injured and killed: the coordinates only.
X = injured_killed[['LATITUDE', 'LONGITUDE']]

# Output datasets: summed injured counts and summed killed counts.
y_injured = injured_killed['NUMBER OF PERSONS INJURED']
y_killed = injured_killed['NUMBER OF PERSONS KILLED']

# Predict injured dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y_injured, test_size=0.2)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# injured_score = accuracy_score(y_test, predictions)
# print(injured_score)

# # Predict killed dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y_killed, test_size=0.2)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# killed_score = accuracy_score(y_test, predictions)
# print(killed_score)

# Predictions for contributing factor
# Dataset for contributing factor: one row per distinct (LATITUDE, LONGITUDE)
# pair, labelled with the most frequent contributing factor at that coordinate.
contributing_factor = (
    Xandy[['LATITUDE', 'LONGITUDE', 'CONTRIBUTING FACTOR VEHICLE 1']]
    # Remove contributing factors that are unspecified.
    .loc[lambda df: ~df['CONTRIBUTING FACTOR VEHICLE 1'].str.contains('Unspecified')]
    .groupby(['LATITUDE', 'LONGITUDE'])['CONTRIBUTING FACTOR VEHICLE 1']
    # mode() can return several tied values; [0] keeps the first one returned.
    .agg(lambda factors: factors.mode()[0])
    .reset_index()
    # Rename column into primary cause of crash.
    .rename(columns={'CONTRIBUTING FACTOR VEHICLE 1': 'PRIMARY CAUSE OF CRASH'})
)

print(contributing_factor)

# Input dataset for contributing factors: the coordinates only.
# NOTE: this rebinds `X`; from here on it is aligned with
# `y_contributingfactor`, not with `y_injured` / `y_killed`.
X = contributing_factor[['LATITUDE', 'LONGITUDE']]

# Output dataset for contributing factors.
y_contributingfactor = contributing_factor['PRIMARY CAUSE OF CRASH']

# # Predict contributing factors dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y_contributingfactor, test_size=0.2)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# contributing_factor_score = accuracy_score(y_test, predictions)
# print(contributing_factor_score)

# for column in crash_data.columns:
#     print(column, ':', pd.api.types.infer_dtype(crash_data[column]))

         LATITUDE  LONGITUDE          PRIMARY CAUSE OF CRASH
0       40.499479 -74.241728             Alcohol Involvement
1       40.499672 -74.237915                    Unsafe Speed
2       40.499840 -74.239920   Failure to Yield Right-of-Way
3       40.499842 -74.239917                 Fatigued/Drowsy
4       40.500023 -74.239020  Driver Inattention/Distraction
...           ...        ...                             ...
225070  40.912170 -73.900770                    Unsafe Speed
225071  40.912223 -73.901674   Failure to Yield Right-of-Way
225072  40.912468 -73.902596             Alcohol Involvement
225073  40.912468 -73.902589  Driver Inattention/Distraction
225074  40.912884 -73.902500  Driver Inattention/Distraction

[225075 rows x 3 columns]
