In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
import warnings
warnings.simplefilter('ignore')

In [67]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
df = pd.read_csv('/content/drive/MyDrive/UK_Accident.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1504150 entries, 0 to 1504149
Data columns (total 33 columns):
 #   Column                                       Non-Null Count    Dtype  
---  ------                                       --------------    -----  
 0   Unnamed: 0                                   1504150 non-null  int64  
 1   Accident_Index                               1504150 non-null  object 
 2   Location_Easting_OSGR                        1504049 non-null  float64
 3   Location_Northing_OSGR                       1504150 non-null  float64
 4   Longitude                                    1504049 non-null  float64
 5   Latitude                                     1504150 non-null  float64
 6   Police_Force                                 1504150 non-null  int64  
 7   Accident_Severity                            1504150 non-null  int64  
 8   Number_of_Vehicles                           1504150 non-null  int64  
 9   Number_of_Casualties                         1

In [69]:
df.drop(columns=['Unnamed: 0', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
                 'Local_Authority_(Highway)', 'LSOA_of_Accident_Location', 'Special_Conditions_at_Site',
                 'Carriageway_Hazards', 'Year'], inplace=True)

df.dropna(subset=['Longitude', 'Time', 'Pedestrian_Crossing-Human_Control',
                  'Pedestrian_Crossing-Physical_Facilities', 'Junction_Control'], inplace=True)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1503890 entries, 0 to 1504149
Data columns (total 25 columns):
 #   Column                                       Non-Null Count    Dtype  
---  ------                                       --------------    -----  
 0   Accident_Index                               1503890 non-null  object 
 1   Longitude                                    1503890 non-null  float64
 2   Latitude                                     1503890 non-null  float64
 3   Police_Force                                 1503890 non-null  int64  
 4   Accident_Severity                            1503890 non-null  int64  
 5   Number_of_Vehicles                           1503890 non-null  int64  
 6   Number_of_Casualties                         1503890 non-null  int64  
 7   Date                                         1503890 non-null  object 
 8   Day_of_Week                                  1503890 non-null  int64  
 9   Time                                         1

In [71]:
df.shape

(1503890, 25)

In [72]:
df.drop(columns=['Date', 'Time', 'Accident_Index'], inplace=True)

In [80]:
cat_cols=[feature for feature in df.columns if df[feature].dtype=='O']
for feature in cat_cols:
    print(f'The {feature} has following number of {len(df[feature].unique())}')

The Road_Type has following number of 6
The Junction_Control has following number of 5
The Pedestrian_Crossing-Human_Control has following number of 3
The Pedestrian_Crossing-Physical_Facilities has following number of 6
The Light_Conditions has following number of 5
The Weather_Conditions has following number of 9
The Road_Surface_Conditions has following number of 6
The Did_Police_Officer_Attend_Scene_of_Accident has following number of 2


In [81]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
for feature in cat_cols:
    df[feature]=labelencoder.fit_transform(df[feature])

In [82]:
X = df.drop(columns=['Accident_Severity'], axis=1)
y = df[['Accident_Severity']]

In [83]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

oversample = RandomOverSampler()

X, y = oversample.fit_resample(X, y)
x_train, x_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=123)

In [86]:
clf = RandomForestClassifier(n_estimators=8, n_jobs=None, random_state=123)
clf.fit(x_train, y_train)

In [102]:
pred = clf.predict(x_test)
clf.score(x_test, y_test)

0.9454100754796384

In [100]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

Mean Absolute Error: 0.05679097994362465
Mean Squared Error: 0.06119309079015077
Root Mean Squared Error: 0.24737237273016316


In [89]:
feature_importances = clf.feature_importances_
feature_names = X.columns.to_list()

In [92]:
feature_importances_with_names = zip(feature_names, feature_importances)

# Sort the features by importance (optional)
sorted_features = sorted(feature_importances_with_names, key=lambda x: x[1], reverse=True)

# Print the feature names and their importances
for feature, importance in sorted_features:
    print(f"{feature:<50} {importance:.5f}")

Longitude                                          0.17869
Latitude                                           0.17688
1st_Road_Number                                    0.10130
Local_Authority_(District)                         0.09941
Day_of_Week                                        0.08036
Police_Force                                       0.04598
Number_of_Vehicles                                 0.03602
Speed_limit                                        0.03460
Number_of_Casualties                               0.02945
2nd_Road_Number                                    0.02606
Did_Police_Officer_Attend_Scene_of_Accident        0.02485
1st_Road_Class                                     0.02463
Light_Conditions                                   0.02297
Weather_Conditions                                 0.02077
2nd_Road_Class                                     0.01956
Road_Surface_Conditions                            0.01927
Pedestrian_Crossing-Physical_Facilities            0.017

# สรุปความสำคัญของฟีเจอร์ 5 ลำดับ
1. Longitude และ Latitude  อาจตีความได้ว่าการเกิดอุบัติเหตุนั้นเกิดขึ้นในจุดเดิมบ่อยครั้ง
2. 1st_Road_Number และ Local_Authority_(District) รหัสของถนนที่ และเขตอำเภอ บ่งบอกว่าเกิดอุบัติเหตุที่ถนนเดิมบ่อย
3. Day_of_Week วันส่งผลต่อความรุนแรงอุบัติเหตุ
4. Police_Force มีตำรวจในที่เกิดเหตุส่งผลต่อความรุนแรง สรุปได้ว่าถ้ามีตำรวจความรุนแรงจะน้อยลง
5. Number_of_Vehicles จำนวนรถ ยิ่งรถเยอะความรุนแรงการเกิดอุบัติเหตุยิ่งสูง



