### Packages and Import

In [1]:
### Importing packages for data analysis

import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_curve, auc
import seaborn as sns

# Read both CSV files and discard the 'Unnamed: 0' column from each
# (presumably a row index saved by an earlier to_csv call — TODO confirm).
df_big = pd.read_csv("big_data.csv").drop(columns=['Unnamed: 0'])
df_small = pd.read_csv("small_data.csv").drop(columns=['Unnamed: 0'])

### Define X and Y, prep variables for OHE

In [2]:
# y is the INJURY_BAD column; X is every remaining column of df_big.
y = df_big['INJURY_BAD']
X = df_big.drop('INJURY_BAD', axis=1)

In [3]:
# Columns that are passed to the one-hot encoder; every other column of X
# is carried through unchanged.
cat_columns = [
    'age_bins',
    'SAFE_PEDAL_ACTION',
    'WEATHER_CAT',
    'road_surf_bins',
    'SPEED_RATING',
    'physical_bins',
    'hour_bins',
    'SEX',
]

### Train/Test Split

In [4]:
# Hold out a test split with a fixed seed. test_size=0.25 is the value
# train_test_split falls back to when no size is given; it is spelled out
# here so the ratio is visible to the reader.
# NOTE(review): the split is not stratified on y — confirm that is intended,
# given that the minority class is later oversampled with SMOTE.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=312
)

### Preprocessing - OHE and SMOTEing

In [5]:
### OHE Encoding

# The encoder is fitted on the training split only and then applied to the
# test split. With handle_unknown='ignore', a category that was not seen
# during fit is encoded as all zeros instead of raising.
try:
    # scikit-learn >= 1.2 names the dense-output flag `sparse_output`;
    # the older `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept the `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train[cat_columns])
X_test_encoded = ohe.transform(X_test[cat_columns])

### Combine back together

# Column order of the result: one-hot columns first, then the columns of X
# that are not listed in cat_columns. The result is a plain NumPy array, so
# the original column names are no longer attached to it.
X_train_processed = np.concatenate([X_train_encoded, X_train.drop(cat_columns, axis=1)], axis=1)
X_test_processed = np.concatenate([X_test_encoded, X_test.drop(cat_columns, axis=1)], axis=1)

### SMOTE the minority class

# Only the training data is resampled; the test split is not passed to SMOTE.
# NOTE(review): plain SMOTE interpolates between rows, so the one-hot columns
# of synthetic samples may take fractional values — confirm this is acceptable.
smote = SMOTE(random_state=312)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train_processed,
    y=y_train,
)

### Fit to a decision tree

In [17]:
# Train a decision tree (depth capped at 10, fixed seed) on the resampled
# training data. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
tree = DecisionTreeClassifier(max_depth=10, random_state=312)
tree.fit(X=X_train_resampled, y=y_train_resampled)

DecisionTreeClassifier(max_depth=10, random_state=312)

### Visualizing Tree

In [20]:
# Score the tree on the data it was trained on. These labels are the
# SMOTE-resampled ones, so the numbers are not directly comparable to
# metrics computed on the un-resampled test split.
y_pred_train = tree.predict(X_train_resampled)

# Printed in order: accuracy, recall, precision, F1.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
    print(metric(y_train_resampled, y_pred_train))

0.7883227640498579
0.6166630220861579
0.939060939060939
0.744456177402323


In [21]:
# Score the tree on the held-out test split, which was encoded with the
# fitted encoder but never resampled.
y_pred_test = tree.predict(X_test_processed)

# Printed in order: accuracy, recall, precision, F1.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
    print(metric(y_test, y_pred_test))

0.854876615746181
0.07262569832402235
0.13829787234042554
0.09523809523809525


### Feature Importances
