In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
columns = [
    "Severity", "State", "Temperature(F)", "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)", 
    "Weather_Condition", "Sunrise_Sunset", "year", "date"
]

target = ["Severity"]

In [5]:
# Load the data
file_path = Path('PreCOVID_accidents.csv', index_col=False)
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()
df.head()

Unnamed: 0,Severity,State,Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [6]:
# Check dtypes, factors should be float64
df.dtypes


Severity              object
State                 object
Temperature(F)       float64
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Sunrise_Sunset        object
year                   int64
date                  object
dtype: object

In [7]:
# Get the number of rows
df.shape[0]

277041

In [8]:
# Drop NA rows; making sure the clean data dropped NAs
df = df.dropna(axis=0, how="any")
df.head()

Unnamed: 0,Severity,State,Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [9]:
# Check the number of rows again; Data cleanup already dropped NAs
df.shape[0]

277041

In [10]:
# Get count of unique values in the 'Severity' column
print(df['Severity'].value_counts())

short_delay    228852
long_delay      48189
Name: Severity, dtype: int64


In [11]:
# Rename columns to get rid of () with measurement for plotting
df.rename(columns = {'Temperature(F)':'Temperature', 'Wind_Speed(mph)':'Wind_Speed', 'Precipitation(in)':'Precipitation', 'Visibility(mi)': 'Visibility'}, inplace = True)
df.head()                          

Unnamed: 0,Severity,State,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset,year,date
0,long_delay,NJ,89.0,10.0,5.0,0.0,clear_weather,Day,2019,10/2/19
1,long_delay,LA,54.0,10.0,5.0,0.0,clear_weather,Day,2019,11/1/19
2,long_delay,AZ,41.0,10.0,10.0,0.0,clear_weather,Night,2020,2/16/20
3,long_delay,TX,79.0,10.0,25.0,0.0,bad_weather,Day,2019,6/4/19
4,long_delay,VA,37.0,10.0,0.0,0.0,clear_weather,Night,2019,10/19/19


In [12]:
# Drop State, year and date columns to reduce factors for NOW
df.drop('State', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)
df.head()

Unnamed: 0,Severity,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
0,long_delay,89.0,10.0,5.0,0.0,clear_weather,Day
1,long_delay,54.0,10.0,5.0,0.0,clear_weather,Day
2,long_delay,41.0,10.0,10.0,0.0,clear_weather,Night
3,long_delay,79.0,10.0,25.0,0.0,bad_weather,Day
4,long_delay,37.0,10.0,0.0,0.0,clear_weather,Night


In [13]:
# Transform Sunrise_Sunset and Weather_Condition to binary input
df.Sunrise_Sunset = df.Sunrise_Sunset.replace({'Day': 1, 'Night': 0}) 
df.Weather_Condition = df.Weather_Condition.replace({'clear_weather': 1, 'bad_weather': 0})
df.head()

Unnamed: 0,Severity,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
0,long_delay,89.0,10.0,5.0,0.0,1,1
1,long_delay,54.0,10.0,5.0,0.0,1,1
2,long_delay,41.0,10.0,10.0,0.0,1,0
3,long_delay,79.0,10.0,25.0,0.0,0,1
4,long_delay,37.0,10.0,0.0,0.0,1,0


In [14]:
# Get count of unique values in the 'Weather_Condition' column
print(df['Weather_Condition'].value_counts())

1    228911
0     48130
Name: Weather_Condition, dtype: int64


In [17]:
# Calculate Correlations
df.corr()

Unnamed: 0,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
Temperature,1.0,0.288617,-0.008902,-0.02855,0.282423,0.296735
Visibility,0.288617,1.0,-0.017753,-0.198338,0.606688,0.057301
Wind_Speed,-0.008902,-0.017753,1.0,0.056201,-0.119258,0.172319
Precipitation,-0.02855,-0.198338,0.056201,1.0,-0.270177,-0.011266
Weather_Condition,0.282423,0.606688,-0.119258,-0.270177,1.0,0.039278
Sunrise_Sunset,0.296735,0.057301,0.172319,-0.011266,0.039278,1.0


In [20]:
# Create our target
y = df["Severity"]
X = df.drop(columns="Severity")


### **SVM - Data Scalar**

In [21]:
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [22]:
# Scale the data
#data_scaled = data_scaler.fit_transform(X)
X[['Temperature', 'Visibility', 'Wind_Speed', 'Precipitation', 'Sunrise_Sunset',
        'Weather_Condition']] = StandardScaler().fit_transform(X[['Temperature', 'Visibility', 'Wind_Speed', 'Precipitation', 'Sunrise_Sunset',
        'Weather_Condition']]) 

In [23]:
# Look at Scaled data
X[:5]

Unnamed: 0,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
0,1.618838,0.370301,-0.356655,-0.144057,0.458537,0.723719
1,-0.251451,0.370301,-0.356655,-0.144057,0.458537,0.723719
2,-0.946129,0.370301,0.547149,-0.144057,0.458537,-1.381752
3,1.08447,0.370301,3.258561,-0.144057,-2.180848,0.723719
4,-1.159877,0.370301,-1.26046,-0.144057,0.458537,-1.381752


In [24]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [25]:
X.describe()

Unnamed: 0,Temperature,Visibility,Wind_Speed,Precipitation,Weather_Condition,Sunrise_Sunset
count,277041.0,277041.0,277041.0,277041.0,277041.0,277041.0
mean,1.740464e-14,-2.973242e-14,-1.307284e-13,-5.41094e-15,-4.788316e-14,-4.588583e-13
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-4.579833,-2.911543,-1.26046,-0.1440573,-2.180848,-1.381752
25%,-0.6255086,0.3703012,-0.7181771,-0.1440573,0.4585372,-1.381752
50%,0.06917009,0.3703012,-0.1758947,-0.1440573,0.4585372,0.7237188
75%,0.7104119,0.3703012,0.5471485,-0.1440573,0.4585372,0.7237188
max,2.901322,26.62506,29.28812,211.585,0.4585372,0.7237188


In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({'short_delay': 171638, 'long_delay': 36142})

In [27]:
model.fit(X_train, y_train)

SVC(kernel='linear')

In [28]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()


Unnamed: 0,Prediction,Actual
0,short_delay,long_delay
1,short_delay,long_delay
2,short_delay,short_delay
3,short_delay,short_delay
4,short_delay,short_delay


In [29]:
# Calculate the Accuracy Score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8260637299490333

In [30]:
# Display the confusion matrix

from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual long_delay", "Actual short_delay"], columns=["Predicted long_delay", "Predicted short_delay"])
cm_df

Unnamed: 0,Predicted long_delay,Predicted short_delay
Actual long_delay,0,12047
Actual short_delay,0,57214


In [31]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  long_delay       0.00      0.00      0.00     12047
 short_delay       0.83      1.00      0.90     57214

    accuracy                           0.83     69261
   macro avg       0.41      0.50      0.45     69261
weighted avg       0.68      0.83      0.75     69261

