# Machine Learning 

#### 1. Encoding

In [1]:
# Import the libraries needed for encoding
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#### 2. Settings

In [4]:
# Encoding configuration: input file and label column.
# The label column is encoded separately from the feature columns.
RAW, TARGET = "road_accident_dataset.csv", "Accident Severity"

# Output artefacts: encoded features, encoded labels, label-id mapping.
OUT_X, OUT_y, OUT_MAPPING = "X_encoded.csv", "y_labels.csv", "label_mapping.json"

# Read the raw CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=RAW)

#### 3. Split features and target

In [5]:
# Fail fast if the label column is missing from the loaded frame.
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found.")

# Labels are cast to str; the features are every remaining column.
y = df[TARGET].astype(str)
X = df.drop(TARGET, axis=1)

#### 4. Identify column types

In [7]:
# num_cols: numeric columns, kept as they are.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# cat_cols: every remaining column, to be one-hot encoded.
# Taking the complement of num_cols (rather than only dtype "object") means
# bool, category and string-dtype columns are no longer silently dropped
# from the encoded output. For a frame with only object and numeric columns
# the result is the same as before.
# NOTE(review): datetime columns would also land here; parse them into
# numeric features beforehand if the dataset has any.
_numeric_names = set(num_cols)
cat_cols = [col for col in X.columns if col not in _numeric_names]

#### 5. One-hot encode categorical columns

In [9]:
# Dense one-hot encoder; categories unseen at fit time are ignored on transform.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

if cat_cols:
    # Fit on the categorical columns and encode them in one pass.
    X_cat = ohe.fit_transform(X[cat_cols])
else:
    # Nothing to encode: keep a zero-width block with one row per sample.
    X_cat = np.empty((len(X), 0))

#### 6. Combine numeric + encoded categorical columns

In [None]:
# Names of the one-hot columns. The encoder is only fitted when cat_cols is
# non-empty, so skip get_feature_names_out() otherwise: calling it on an
# unfitted encoder raises NotFittedError.
if cat_cols:
    ohe_cols = ohe.get_feature_names_out(cat_cols)
else:
    ohe_cols = np.array([], dtype=object)

# Give both halves the same fresh 0..n-1 index so concat pairs rows by
# position. Previously the numeric half had a reset index while the encoded
# half kept X.index, which misaligns rows (and introduces NaNs) whenever X
# does not carry the default RangeIndex.
X_num = X[num_cols].reset_index(drop=True)
X_cat_df = pd.DataFrame(X_cat, columns=ohe_cols)

# Final feature table: numeric columns followed by the one-hot columns.
X_enc = pd.concat([X_num, X_cat_df], axis=1)

**Observation**: The combined table `X_enc` is now a fully numeric dataset :)

#### 7. Encode target labels

In [None]:

# Learn the set of class labels from y, then map each label to its integer id.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

#### 8. Save output files

In [14]:
# Write the encoded features and the integer labels, without the index column.
X_enc.to_csv(OUT_X, index=False)
y_frame = pd.DataFrame({"y": y_encoded})
y_frame.to_csv(OUT_y, index=False)

# Persist the integer-id -> original-label mapping so predictions can be decoded.
label_map = dict(enumerate(le.classes_))
mapping_text = json.dumps(label_map, indent=2)
with open(OUT_MAPPING, "w") as f:
    f.write(mapping_text)