# Machine Learning 

#### 1. Encoding

In [23]:
#import necessary files used for encoding 
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#### 2. Settings

In [24]:
#settings for encoding
RAW = "road_accident_dataset.csv"   
TARGET = "Accident Severity"         #tv will be handled separately  
OUT_X = "X_encoded.csv"
OUT_y = "y_labels.csv"
OUT_MAPPING = "label_mapping.json"

#Load dataset
df = pd.read_csv(RAW)

#### 3. Split features and target

In [25]:
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found.")
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(str)

#### 4. Identify column types

In [26]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist() #cat_cols are columns containing text (need encoding)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist() #num_cols will keep as they are

#### 5. One-hot encode categorical columns

In [27]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat = ohe.fit_transform(X[cat_cols]) if cat_cols else np.empty((len(X), 0))

#### 6. Combine numeric + encoded categorical columns

In [28]:
ohe_cols = ohe.get_feature_names_out(cat_cols)
X_num = X[num_cols].reset_index(drop=True)
X_cat_df = pd.DataFrame(X_cat, columns=ohe_cols, index=X.index)
X_enc = pd.concat([X_num, X_cat_df], axis=1)

# Fix column names for XGBoost
X_enc.columns = [col.replace("[","_").replace("]","_").replace("<","_").replace(">","_") for col in X_enc.columns]

**Observation**: What we get now is a fully numeric dataset :)

#### 7. Encode target labels

In [29]:

le = LabelEncoder()
y_encoded = le.fit_transform(y) 

#### 8. Save output files

In [30]:
X_enc.to_csv(OUT_X, index=False)
pd.DataFrame({"y": y_encoded}).to_csv(OUT_y, index=False)

# Save target label mapping 
label_map = {int(i): cls for i, cls in enumerate(le.classes_)}
with open(OUT_MAPPING, "w") as f:
    json.dump(label_map, f, indent=2)

### 9. Data Splitting

In [31]:
from sklearn.model_selection import train_test_split
X = X_enc  # fully encoded features (numeric + one-hot)
y = y_encoded

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

In [33]:
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

Training samples: 92400, Testing samples: 39600


### 10. Model Training 

Random Forest 

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


XGB

In [35]:
from xgboost import XGBClassifier
X_train.columns = [col.replace("[","_").replace("]","_").replace("<","_").replace(">","_") for col in X_train.columns]
X_test.columns = [col.replace("[","_").replace("]","_").replace("<","_").replace(">","_") for col in X_test.columns]

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### 11. Model Testing

In [36]:
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

### 12. Model Evaluation

In [37]:
from sklearn.metrics import classification_report, confusion_matrix
print("\n--- Random Forest Classification Report ---")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

print("\n--- XGBoost Classification Report ---")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))


--- Random Forest Classification Report ---
              precision    recall  f1-score   support

       Minor       0.34      0.36      0.35     13219
    Moderate       0.33      0.33      0.33     13201
      Severe       0.33      0.31      0.32     13180

    accuracy                           0.33     39600
   macro avg       0.33      0.33      0.33     39600
weighted avg       0.33      0.33      0.33     39600


--- XGBoost Classification Report ---
              precision    recall  f1-score   support

       Minor       0.33      0.33      0.33     13219
    Moderate       0.33      0.33      0.33     13201
      Severe       0.33      0.33      0.33     13180

    accuracy                           0.33     39600
   macro avg       0.33      0.33      0.33     39600
weighted avg       0.33      0.33      0.33     39600

