# Initial Exploration

Over 300 columns, and over 600k rows of accident reports. Here I will outline my work towards understanding the reports and test ideas as they come.

In [1]:
%%capture
%load_ext sql
%sql sqlite:///../data/interim/collisions.db

In [2]:
%%sql
ped_acc <<
SELECT DEC_LAT, DEC_LONG, PED_DEATH_COUNT, PED_COUNT, COUNTY
FROM CRASH
WHERE PED_COUNT > 0

 * sqlite:///../data/interim/collisions.db
Done.
Returning data to local variable ped_acc


In [3]:
import pandas

df = ped_acc.DataFrame()
df.shape

(86185, 5)

In [4]:
import plotly.express as px

fig = px.density_mapbox(df, lat='DEC_LAT', lon='DEC_LONG', z='PED_DEATH_COUNT',
                        hover_name='COUNTY', opacity=0.5)
fig.update_layout(mapbox_style="open-street-map")
fig.show()

## What are some predictors that an accident involving a pedestrian becomes fatal?

In [5]:
%%sql
pred_flags <<
SELECT CRASH.PED_DEATH_COUNT, FLAG.*
FROM CRASH
INNER JOIN FLAG ON CRASH.CRN=FLAG.CRN

 * sqlite:///../data/interim/collisions.db
Done.
Returning data to local variable pred_flags


In [6]:
flags = pred_flags.DataFrame()
flags.shape

(2534904, 112)

In [7]:
flags.dropna(axis='index', how='any', inplace=True)
flags.isnull().sum().sum()

0

In [8]:
target = 'PED_DEATH_COUNT'

X = flags.drop(columns=[target])
y = flags[target] > 0

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [11]:
from sklearn.metrics import f1_score

f1_score(y_test, logreg.predict(X_test))

0.0