In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Load and prepare data
df = pd.read_excel('translated_file_2020to25.xlsx')
df['Date Occured'] = pd.to_datetime(df['Date Occured'], errors='coerce')
df['month'] = df['Date Occured'].dt.month
df['weekday'] = df['Date Occured'].dt.weekday
df['year'] = df['Date Occured'].dt.year

# Fill full_text field for NLP
df['full_text'] = (
    df['Title_EN'].fillna('') + ' ' +
    df['Description_EN'].fillna('') + ' ' +
    df['Actions taken_EN'].fillna('')
)

# Label encode target columns
label_encoders = {}
for col in ['Type of Injury', 'Kind of event hazard']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Save label encoders
joblib.dump(label_encoders, 'incident_label_encoders.pkl')

# Save preprocessed data for modeling
df.to_csv('preprocessed_data_2020to25.csv', index=False)
print("✅ Preprocessing done and encoders saved.")

✅ Preprocessing done and encoders saved.


In [3]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Load preprocessed file and encoders
df = pd.read_csv('preprocessed_data_2020to25.csv')
label_encoders = joblib.load('incident_label_encoders.pkl')

# Features & targets
X = df[['Reporting Site', 'Sub Type', 'month', 'weekday', 'full_text']]
y = df[['Type of Injury', 'Kind of event hazard']]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
text_transformer = TfidfVectorizer(max_features=200)
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'full_text'),
        ('cat', cat_transformer, ['Reporting Site', 'Sub Type']),
        ('num', 'passthrough', ['month', 'weekday'])
    ]
)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Train model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train_transformed, y_train)

# Predict
y_pred = model.predict(X_test_transformed)

# Decode predictions
y_pred_df = pd.DataFrame(y_pred, columns=y.columns)
y_test_df = y_test.reset_index(drop=True)

for col in y_pred_df.columns:
    y_pred_df[col] = label_encoders[col].inverse_transform(y_pred_df[col])
    y_test_df[col] = label_encoders[col].inverse_transform(y_test_df[col])

# Evaluate
print("\n📊 Classification Report for Type of Injury:")
print(classification_report(y_test_df['Type of Injury'], y_pred_df['Type of Injury']))

print("\n📊 Classification Report for Kind of Event Hazard:")
print(classification_report(y_test_df['Kind of event hazard'], y_pred_df['Kind of event hazard'], zero_division=0))

# Save model and encoders
joblib.dump(model, 'incident_predictor_model.pkl')
joblib.dump(preprocessor, 'incident_preprocessor.pkl')
print("\n✅ Model, encoder, and preprocessor saved.")


📊 Classification Report for Type of Injury:
                             precision    recall  f1-score   support

                          -       0.85      0.91      0.88        32
        Abrasion/Irritation       0.00      0.00      0.00         1
  Asphyxiation, suffocation       1.00      1.00      1.00         1
Burn (2nd, 3rd, 4th degree)       0.00      0.00      0.00         2
   Cut / Laceration / Wound       0.25      0.50      0.33         2
   Foreign body (ingressed)       0.00      0.00      0.00         1
                  Fractures       0.55      0.73      0.63        15
      Loss of consciousness       0.00      0.00      0.00         1
               Multi Injury       0.00      0.00      0.00         0
                      Other       0.00      0.00      0.00         1
                       Pain       0.50      0.20      0.29         5
            Strain / Sprain       0.00      0.00      0.00         1

                   accuracy                           0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Verify the contents of the encoded file

import pandas as pd
import joblib

# Load predicted encoded file
df_verify = pd.read_excel('Incident Predictions 2.xlsx')

# Load the label encoders that were used to encode the original training data
label_encoders = joblib.load('incident_label_encoders.pkl')

# Preview data
df_verify[['Predicted Type of Injury', 'Predicted Kind of event hazard']].head(10)

Unnamed: 0,Predicted Type of Injury,Predicted Kind of event hazard
0,0,0
1,0,22
2,0,2
3,0,2
4,0,2
5,0,4
6,0,22
7,0,21
8,0,0
9,0,21


In [7]:
# Check the available keys in encoder
print("Encoder keys:", label_encoders.keys())

# Check the mapping for Type of Injury
injury_encoder = label_encoders['Type of Injury']
print("\nType of Injury classes:", list(enumerate(injury_encoder.classes_)))

# Check the mapping for Kind of event hazard
hazard_encoder = label_encoders['Kind of event hazard']
print("\nKind of event hazard classes:", list(enumerate(hazard_encoder.classes_)))

Encoder keys: dict_keys(['Type of Injury', 'Kind of event hazard'])

Type of Injury classes: [(0, '-'), (1, 'Abrasion/Irritation'), (2, 'Amputation'), (3, 'Asphyxiation, suffocation'), (4, 'Bruising / Contusion'), (5, 'Burn (2nd, 3rd, 4th degree)'), (6, 'Concussion'), (7, 'Contact / Exposure (chemicals)'), (8, 'Crush'), (9, 'Cut / Laceration / Wound'), (10, 'Dislocation'), (11, 'Foreign body (ingressed)'), (12, 'Fractures'), (13, 'Inflammation'), (14, 'Irritation'), (15, 'Loss of consciousness'), (16, 'Multi Injury'), (17, 'Musculoskeletal Disorder'), (18, 'Other'), (19, 'Pain'), (20, 'Scald / Burn (1st degree)'), (21, 'Shock'), (22, 'Strain / Sprain')]

Kind of event hazard classes: [(0, '-'), (1, 'Chemical / Gas - Exposed to or in contact with'), (2, 'Chemical / Gas - Inhalation'), (3, 'Chemical hazards'), (4, 'Collision with Car, Truck, 2 wheels'), (5, 'Collision with industrial truck, mobile equipment'), (6, 'Confined Space'), (7, 'Crushing hazard'), (8, 'Cutting, puncturing hazard

In [19]:
# Decode a few entries manually to check
decoded_injury = injury_encoder.inverse_transform(df_verify['Predicted Type of Injury'].astype(int).head(10))
decoded_hazard = hazard_encoder.inverse_transform(df_verify['Predicted Kind of event hazard'].astype(int).head(10))

print("Decoded injuries:", decoded_injury)
print("Decoded hazards:", decoded_hazard)

Decoded injuries: ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
Decoded hazards: ['-' 'Vehicle - Car accident' 'Chemical / Gas - Inhalation'
 'Chemical / Gas - Inhalation' 'Chemical / Gas - Inhalation'
 'Collision with Car, Truck, 2 wheels' 'Vehicle - Car accident'
 'Vehicle - 2 or 3 wheelers accident' '-'
 'Vehicle - 2 or 3 wheelers accident']
