# Imports

In [1]:
import numpy as np
import pandas as pd
import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Create the directory if it doesn't exist
save_dir = '/content/drive/My Drive/XAI/processedData'
os.makedirs(save_dir, exist_ok=True)

In [None]:
month_names = {
    1: 'Jan',
    2: 'Feb',
    3: 'Mar',
    4: 'Apr',
    5: 'May',
    6: 'Jun',
    7: 'Jul',
    8: 'Aug',
    9: 'Sep',
    10: 'Oct',
    11: 'Nov',
    12: 'Dec'
}

day_names = {
    1: 'Mon',
    2: 'Tues',
    3: 'Wed',
    4: 'Thur',
    5: 'Fri',
    6: 'Sat',
    7: 'Sun'
}

# Determine season
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

def get_required_df(x, y, y_pred):
  # Create a new DataFrame with month and season
  new_df = pd.DataFrame()

  new_df['month'] = x['Start_Month'].astype(int).map(month_names)
  new_df['season'] = x['Start_Month'].apply(get_season)
  new_df['Accident_Severity'] = y
  new_df['Accident_Severity_pred'] = y_pred

  return new_df

def get_required_df_2(x, y, y_pred):
  # Create a new DataFrame with month and season
  new_df = pd.DataFrame()

  new_df['month'] = x['Month'].astype(int).map(month_names)
  new_df['season'] = x['Month'].apply(get_season)
  new_df['day_of_week'] = x['Day_of_week'].astype(int).map(day_names)
  new_df['Accident_Severity'] = y
  new_df['Accident_Severity_pred'] = y_pred

  return new_df

def get_required_df_3(x, y, y_pred):
  # Create a new DataFrame with month and season
  new_df = pd.DataFrame()

  new_df['month'] = x['Month'].astype(int).map(month_names)
  new_df['season'] = x['Month'].apply(get_season)
  # new_df['day_of_week'] = x['Day_of_Week'].astype(int).map(day_names)
  new_df['Accident_Severity'] = y
  new_df['Accident_Severity_pred'] = y_pred
  new_df[['Longitude', 'Latitude']] = x[['Longitude', 'Latitude']]

  return new_df

# USA

In [None]:
from google.colab import files
files.upload()

In [6]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
!kaggle datasets download -d sobhanmoosavi/us-accidents

In [8]:
import zipfile
zip_ref = zipfile.ZipFile('/content/us-accidents.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [9]:
# Load and read the file
df = pd.read_csv("/content/US_Accidents_March23.csv", nrows=200000)
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [10]:
print(df.shape)

# Find columns with null percentage greater than 80%
null_percentage_threshold = 20
high_null_columns = df.columns[df.isnull().mean() * 100 > null_percentage_threshold]

print("\nColumns with null percentage greater than 80%:")
print(high_null_columns)

# Remove columns with null percentage greater than 80%
df = df.drop(columns=high_null_columns)

print("\n",df.shape)

(200000, 46)

Columns with null percentage greater than 80%:
Index(['End_Lat', 'End_Lng', 'Wind_Chill(F)', 'Precipitation(in)'], dtype='object')

 (200000, 42)


In [11]:
# Drop ID column
df.drop(columns = ['ID',
                   'Description',
                   'Airport_Code',
                   'County',
                   'City',
                   'Country',
                   'Wind_Direction',
                   'Amenity',
                   'Weather_Timestamp',
                   'Timezone',
                   'Give_Way',
                   'Station',
                   'Stop',
                   'No_Exit'], inplace = True)

In [12]:
# removing rows with null values
print(df.shape)
df.dropna(inplace = True)
print(df.shape)

(200000, 28)
(159096, 28)


In [13]:
# Assuming 'Start_Time' and 'End_Time' are in datetime format
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['End_Time'] = pd.to_datetime(df['End_Time'])

# Create new columns
df['Start_Day'] = df['Start_Time'].dt.day
df['Start_Hour'] = df['Start_Time'].dt.hour
df['Start_Month'] = df['Start_Time'].dt.month
df['Start_Year'] = df['Start_Time'].dt.year

df['End_Day'] = df['End_Time'].dt.day
df['End_Hour'] = df['End_Time'].dt.hour
df['End_Month'] = df['End_Time'].dt.month
df['End_Year'] = df['End_Time'].dt.year

# Drop the original columns
df = df.drop(columns=['Start_Time', 'End_Time'])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159096 entries, 2 to 199999
Data columns (total 34 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Source                 159096 non-null  object 
 1   Severity               159096 non-null  int64  
 2   Start_Lat              159096 non-null  float64
 3   Start_Lng              159096 non-null  float64
 4   Distance(mi)           159096 non-null  float64
 5   Street                 159096 non-null  object 
 6   State                  159096 non-null  object 
 7   Zipcode                159096 non-null  object 
 8   Temperature(F)         159096 non-null  float64
 9   Humidity(%)            159096 non-null  float64
 10  Pressure(in)           159096 non-null  float64
 11  Visibility(mi)         159096 non-null  float64
 12  Wind_Speed(mph)        159096 non-null  float64
 13  Weather_Condition      159096 non-null  object 
 14  Bump                   159096 non-null  b

In [15]:
# encoding categorical data
categorical_columns = df.select_dtypes(include=['object','bool']).columns
categorical_columns

Index(['Source', 'Street', 'State', 'Zipcode', 'Weather_Condition', 'Bump',
       'Crossing', 'Junction', 'Railway', 'Roundabout', 'Traffic_Calming',
       'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight'],
      dtype='object')

In [16]:
from sklearn import preprocessing
label_encoders = {}

# Encoding categorical columns
for col in categorical_columns:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Street,State,Zipcode,Temperature(F),Humidity(%),...,Nautical_Twilight,Astronomical_Twilight,Start_Day,Start_Hour,Start_Month,Start_Year,End_Day,End_Hour,End_Month,End_Year
2,0,2,39.063148,-84.032608,0.01,16533,14,8392,36.0,100.0,...,0,0,8,6,2,2016,8,7,2,2016
3,0,3,39.747753,-84.205582,0.01,8164,14,8530,35.1,96.0,...,0,0,8,7,2,2016,8,7,2,2016
4,0,2,39.627781,-84.188354,0.01,10294,14,8590,36.0,89.0,...,0,0,8,7,2,2016,8,8,2,2016
5,0,3,40.10059,-82.925194,0.01,19039,14,8235,37.9,97.0,...,0,0,8,7,2,2016,8,8,2,2016
6,0,2,39.758274,-84.230507,0.0,11740,14,8536,34.0,100.0,...,0,0,8,7,2,2016,8,8,2,2016


In [17]:
# Separate features (X) and target variable (y)

y = df['Severity'].copy()
X = df.drop('Severity', axis=1).copy()
column_names = X.columns

column_names

Index(['Source', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Street', 'State',
       'Zipcode', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Speed(mph)', 'Weather_Condition', 'Bump',
       'Crossing', 'Junction', 'Railway', 'Roundabout', 'Traffic_Calming',
       'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight', 'Start_Day', 'Start_Hour',
       'Start_Month', 'Start_Year', 'End_Day', 'End_Hour', 'End_Month',
       'End_Year'],
      dtype='object')

In [18]:
y = y-1
y.unique()

array([1, 2, 0, 3])

In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

X_train:  (127276, 33)
y_train:  (127276,)
X_test:  (31820, 33)
y_test:  (31820,)


In [20]:
model = xgb.XGBClassifier()

In [21]:
# Train the model
import time

start_time = time.time()
model.fit(X_train, y_train, verbose=True)
end_time = time.time()

elapsed_time = end_time - start_time
print("Time taken for model fitting:", elapsed_time, "seconds")

Time taken for model fitting: 7.2807793617248535 seconds


In [22]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9202702702702703


In [24]:
new_df_US = get_required_df(X_test, y_test, y_pred)
new_df_US.head(5)

# Save the DataFrame to a CSV file using os.path.join
new_df_US.to_csv(os.path.join(save_dir, 'new_df_US.csv'), index=False)

# Ethiopia

In [34]:
# Load and read the file|
df = pd.read_csv("/content/drive/My Drive/Research Internship/dataset/RTA Dataset.csv")
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [35]:
print(df.shape)

# Find columns with null percentage greater than 80%
null_percentage_threshold = 20
high_null_columns = df.columns[df.isnull().mean() * 100 > null_percentage_threshold]

print("\nColumns with null percentage greater than 80%:")
print(high_null_columns)

# Remove columns with null percentage greater than 80%
df = df.drop(columns=high_null_columns)

print("\n",df.shape)

(12316, 32)

Columns with null percentage greater than 80%:
Index(['Service_year_of_vehicle', 'Defect_of_vehicle', 'Work_of_casuality',
       'Fitness_of_casuality'],
      dtype='object')

 (12316, 28)


In [36]:
# Drop
df.drop(columns = [
    'Owner_of_vehicle',
    'Number_of_casualties',
    'Sex_of_casualty',
    'Age_band_of_casualty',
], inplace = True)

In [37]:
# removing rows with null values
print(df.shape)
df.dropna(inplace = True)
print(df.shape)

(12316, 24)
(8664, 24)


In [38]:
df['Time'] = pd.to_datetime(df['Time'])

# Create new columns
df['Day'] = df['Time'].dt.day
df['Hour'] = df['Time'].dt.hour
df['Month'] = df['Time'].dt.month
df['Year'] = df['Time'].dt.year

# Drop the original columns
df = df.drop(columns=['Time'])

In [39]:
# Dictionary to map day names to integers
day_to_int = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}

# Convert the 'dayofweek' column to integers using the map function
df['Day_of_week'] = df['Day_of_week'].map(day_to_int)

In [40]:
# # encoding categorical data
categorical_columns = df.select_dtypes(include=['object','bool']).columns
categorical_columns

Index(['Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
       'Vehicle_driver_relation', 'Driving_experience', 'Type_of_vehicle',
       'Area_accident_occured', 'Lanes_or_Medians', 'Road_allignment',
       'Types_of_Junction', 'Road_surface_type', 'Road_surface_conditions',
       'Light_conditions', 'Weather_conditions', 'Type_of_collision',
       'Vehicle_movement', 'Casualty_class', 'Casualty_severity',
       'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity'],
      dtype='object')

In [41]:
from sklearn import preprocessing
label_encoders = {}

# Encoding categorical columns
for col in categorical_columns:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head(10)

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,...,Vehicle_movement,Casualty_class,Casualty_severity,Pedestrian_movement,Cause_of_accident,Accident_severity,Day,Hour,Month,Year
1,1,1,1,4,0,3,11,6,4,5,...,2,3,3,5,16,2,24,17,4,2024
3,7,0,1,4,0,2,11,6,6,6,...,2,2,2,5,1,2,24,1,4,2024
7,5,0,1,4,0,1,0,9,6,5,...,10,3,3,5,12,2,24,17,4,2024
8,5,0,1,4,0,3,5,4,6,5,...,2,2,2,0,1,2,24,17,4,2024
9,5,0,1,4,0,0,0,9,4,5,...,10,1,2,5,9,1,24,17,4,2024
10,6,0,1,0,2,0,10,9,6,5,...,9,3,3,5,0,1,24,14,4,2024
11,6,1,1,0,0,5,0,6,4,5,...,2,0,2,5,11,1,24,14,4,2024
12,4,0,1,4,0,0,11,6,0,0,...,2,3,3,5,10,2,24,17,4,2024
13,4,1,1,4,0,2,5,6,6,5,...,12,3,3,5,12,2,24,17,4,2024
14,4,1,1,4,0,3,0,6,4,0,...,2,0,2,5,10,1,24,17,4,2024


In [42]:
# Separate features (X) and target variable (y)

y = df['Accident_severity']  # Labels
X = df.drop(columns=['Accident_severity'])  # Features
column_names = X.columns

column_names

Index(['Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Number_of_vehicles_involved', 'Vehicle_movement',
       'Casualty_class', 'Casualty_severity', 'Pedestrian_movement',
       'Cause_of_accident', 'Day', 'Hour', 'Month', 'Year'],
      dtype='object')

In [43]:
y.unique()

array([2, 1, 0])

In [44]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

X_train:  (6931, 26)
y_train:  (6931,)
X_test:  (1733, 26)
y_test:  (1733,)


In [45]:
model_xgboost_normal = xgb.XGBClassifier()

In [46]:
start_time = time.time()
model_xgboost_normal.fit(X_train, y_train, verbose=True)
end_time = time.time()

elapsed_time = end_time - start_time
print("Time taken for model fitting:", elapsed_time, "seconds")

Time taken for model fitting: 0.8028266429901123 seconds


In [47]:
# Make predictions on the test set
y_pred = model_xgboost_normal.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.845931909982689


In [51]:
new_df_Ethiopoa = get_required_df_2(X_test, y_test, y_pred)

# Save the DataFrame to a CSV file using os.path.join
new_df_Ethiopoa.to_csv(os.path.join(save_dir, 'new_df_Ethiopoa.csv'), index=False)

new_df_Ethiopoa.head(5)

Unnamed: 0,month,season,day_of_week,Accident_Severity,Accident_Severity_pred
1349,Apr,Spring,Tues,2,2
3784,Apr,Spring,Tues,2,2
6404,Apr,Spring,Sat,2,2
1480,Apr,Spring,Fri,2,2
1144,Apr,Spring,Fri,2,2


# UK

In [105]:
# Load and read the file
df = pd.read_csv('/content/drive/My Drive/Research Internship/dataset/UK_Accident.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,...,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location,Year
0,0,200501BS00001,525680.0,178240.0,-0.19117,51.489096,1,2,1,1,...,Zebra crossing,Daylight: Street light present,Raining without high winds,Wet/Damp,,,1,Yes,E01002849,2005
1,1,200501BS00002,524170.0,181650.0,-0.211708,51.520075,1,3,1,1,...,Pedestrian phase at traffic signal junction,Darkness: Street lights present and lit,Fine without high winds,Dry,,,1,Yes,E01002909,2005
2,2,200501BS00003,524520.0,182240.0,-0.206458,51.525301,1,3,2,1,...,No physical crossing within 50 meters,Darkness: Street lights present and lit,Fine without high winds,Dry,,,1,Yes,E01002857,2005
3,3,200501BS00004,526900.0,177530.0,-0.173862,51.482442,1,3,1,1,...,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,E01002840,2005
4,4,200501BS00005,528060.0,179040.0,-0.156618,51.495752,1,3,1,1,...,No physical crossing within 50 meters,Darkness: Street lighting unknown,Fine without high winds,Wet/Damp,,,1,Yes,E01002863,2005


In [77]:
# Data Cleaning
df.isnull().sum()

Unnamed: 0                                           0
Accident_Index                                       0
Location_Easting_OSGR                              101
Location_Northing_OSGR                               0
Longitude                                          101
Latitude                                             0
Police_Force                                         0
Accident_Severity                                    0
Number_of_Vehicles                                   0
Number_of_Casualties                                 0
Date                                                 0
Day_of_Week                                          0
Time                                               117
Local_Authority_(District)                           0
Local_Authority_(Highway)                            0
1st_Road_Class                                       0
1st_Road_Number                                      0
Road_Type                                            0
Speed_limi

In [106]:
all_columns = df.columns
all_columns

Index(['Unnamed: 0', 'Accident_Index', 'Location_Easting_OSGR',
       'Location_Northing_OSGR', 'Longitude', 'Latitude', 'Police_Force',
       'Accident_Severity', 'Number_of_Vehicles', 'Number_of_Casualties',
       'Date', 'Day_of_Week', 'Time', 'Local_Authority_(District)',
       'Local_Authority_(Highway)', '1st_Road_Class', '1st_Road_Number',
       'Road_Type', 'Speed_limit', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location', 'Year'],
      dtype='object')

In [107]:
print(df.shape)

# Find columns with null percentage greater than 80%
null_percentage_threshold = 80
high_null_columns = df.columns[df.isnull().mean() * 100 > null_percentage_threshold]

print("\nColumns with null percentage greater than 80%:")
print(high_null_columns)

# Remove columns with null percentage greater than 80%
df = df.drop(columns=high_null_columns)

print("\n",df.shape)

(1504150, 33)

Columns with null percentage greater than 80%:
Index(['Special_Conditions_at_Site', 'Carriageway_Hazards'], dtype='object')

 (1504150, 31)


In [108]:
# Drop
df.drop(columns = [
    'Unnamed: 0',
    'Accident_Index',
    'Police_Force',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'LSOA_of_Accident_Location',
    'Local_Authority_(District)',
    'Local_Authority_(Highway)',
], inplace = True)

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1504150 entries, 0 to 1504149
Data columns (total 24 columns):
 #   Column                                   Non-Null Count    Dtype  
---  ------                                   --------------    -----  
 0   Location_Easting_OSGR                    1504049 non-null  float64
 1   Location_Northing_OSGR                   1504150 non-null  float64
 2   Longitude                                1504049 non-null  float64
 3   Latitude                                 1504150 non-null  float64
 4   Accident_Severity                        1504150 non-null  int64  
 5   Number_of_Vehicles                       1504150 non-null  int64  
 6   Number_of_Casualties                     1504150 non-null  int64  
 7   Date                                     1504150 non-null  object 
 8   Day_of_Week                              1504150 non-null  int64  
 9   Time                                     1504033 non-null  object 
 10  1st_Road_Class    

In [110]:
# removing rows with null values
print(df.shape)
df.dropna(inplace = True)
print(df.shape)

(1504150, 24)
(901196, 24)


In [112]:
# Convert 'Date' column to datetime with the specified format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Create new columns
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

# Drop the original columns
df = df.drop(columns=['Date'])

In [113]:
# Dictionary to map day names to integers
day_to_int = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}

# Convert the 'dayofweek' column to integers using the map function
df['Day_of_Week'] = df['Day_of_Week'].map(day_to_int)

In [114]:
# encoding categorical data
categorical_columns = df.select_dtypes(include=['object','bool']).columns
categorical_columns

Index(['Time', 'Road_Type', 'Junction_Control',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions'],
      dtype='object')

In [115]:
from sklearn import preprocessing
label_encoders = {}

# Encoding categorical columns
for col in categorical_columns:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,Time,1st_Road_Class,...,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Year,Day,Month
1,524170.0,181650.0,-0.211708,51.520075,3,1,1,,1055,4,...,0,2,3,2,1,0,1,2005,5,1
6,524220.0,180830.0,-0.211277,51.512695,3,2,1,,1239,5,...,0,2,2,2,1,0,1,2005,13,1
8,527350.0,177650.0,-0.167342,51.48342,3,2,2,,1362,3,...,304,2,3,2,1,0,1,2005,15,1
9,524550.0,180810.0,-0.206531,51.512443,3,2,5,,959,4,...,0,2,0,4,1,0,1,2005,15,1
10,526240.0,178900.0,-0.182872,51.494902,3,1,1,,41,3,...,325,2,3,2,1,0,1,2005,16,1


In [116]:
# Separate features (X) and target variable (y)

y = df['Accident_Severity'].copy()
X = df.drop('Accident_Severity', axis=1).copy()
column_names = X.columns

column_names

Index(['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude',
       'Latitude', 'Number_of_Vehicles', 'Number_of_Casualties', 'Day_of_Week',
       'Time', '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Control', '2nd_Road_Class', '2nd_Road_Number',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions', 'Urban_or_Rural_Area',
       'Year', 'Day', 'Month'],
      dtype='object')

In [117]:
y.unique()
y = y-1
y.unique()

array([2, 1, 0])

In [118]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

X_train:  (720956, 24)
y_train:  (720956,)
X_test:  (180240, 24)
y_test:  (180240,)


In [119]:
model = xgb.XGBClassifier()

In [120]:
# Train the model
import time

start_time = time.time()
model.fit(X_train, y_train, verbose=True)
end_time = time.time()

elapsed_time = end_time - start_time
print("Time taken for model fitting:", elapsed_time, "seconds")

Time taken for model fitting: 77.40510940551758 seconds


In [121]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8710108743897026


In [122]:
new_df_uk = get_required_df_3(X_test, y_test, y_pred)

# Save the DataFrame to a CSV file using os.path.join
new_df_uk.to_csv(os.path.join(save_dir, 'new_df_uk.csv'), index=False)

new_df_uk.head(5)

Unnamed: 0,month,season,Accident_Severity,Accident_Severity_pred,Longitude,Latitude
41057,Dec,Winter,2,2,-2.223483,53.403991
895329,Oct,Fall,2,2,0.162341,51.574568
1432266,Jan,Winter,2,2,-0.813786,53.065317
1252582,Jun,Summer,2,2,-2.331315,53.401943
1298387,Dec,Winter,2,2,0.11051,52.21299
