In [1]:
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV

# 1.Loading the Train and Test Datasets

In [2]:
# Locations of the joblib-serialized datasets (per the file names, already encoded).
# NOTE(review): these are machine-specific absolute paths — adjust before running elsewhere.
train_path = r"C:\Users\Lenovo\project 4\Encoded_Train_Data.joblib"
test_path = r"C:\Users\Lenovo\project 4\Encoded_Test_Data.joblib"

# Deserialize both datasets; the train file is read first, then the test file.
train_data, test_data = joblib.load(train_path), joblib.load(test_path)

# 2.Splitting

In [3]:
# Name of the label column; every remaining column is used as a feature.
target_column = 'IncidentGrade'

# Separate the feature matrix (X) from the label vector (y) for each dataset.
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

# Print the resulting shapes as a quick sanity check on the split.
shape_lines = [
    f"Training Features Shape: {X_train.shape}",
    f"Training Target Shape: {y_train.shape}",
    f"Test Features Shape: {X_test.shape}",
    f"Test Target Shape: {y_test.shape}",
]
print("\n".join(shape_lines))

Training Features Shape: (34600, 79)
Training Target Shape: (34600,)
Test Features Shape: (34494, 79)
Test Target Shape: (34494,)


# 3.Model Training

In [4]:
# Shared seed so the split and every stochastic model are reproducible.
RANDOM_STATE = 42

# Hold out 20% of the training data as a validation set for model selection.
# The split is derived from the untouched `train_data` frame rather than from
# the current `X_train`/`y_train`, so re-running this cell reproduces the same
# split instead of re-splitting (and shrinking) an already-split training set.
# NOTE(review): the split is not stratified; consider `stratify=` if the class
# proportions of the validation set matter.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns=[target_column]),
    train_data[target_column],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Candidate classifiers, keyed by the display name used in the printed results.
# The tree-based models are seeded so that the comparison is repeatable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Fit each model on the training split and record its validation accuracy.
model_accuracies = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    model_accuracies[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy:.4f}")

# Identify the best model: the one with the highest validation accuracy
# (on ties, the first one in `models` order wins).
best_model_name = max(model_accuracies, key=model_accuracies.get)
print(f"\nBest Model: {best_model_name} with Accuracy: {model_accuracies[best_model_name]:.4f}")

Logistic Regression Accuracy: 0.6509
Decision Tree Accuracy: 0.6743
Random Forest Accuracy: 0.6756
Gradient Boosting Accuracy: 0.6712
SVM Accuracy: 0.6952
KNN Accuracy: 0.6694

Best Model: SVM with Accuracy: 0.6952


# 3.1.Saving the Best Model using Joblib

In [5]:
# Persist the winning estimator under a file name built from its display name.
best_model_file = f"{best_model_name}_Model.joblib"
best_model = models[best_model_name]
joblib.dump(best_model, best_model_file)
print(f"Best model saved as {best_model_file}")

Best model saved as SVM_Model.joblib


# 4.Loading the saved SVM model

In [6]:
# Reload the persisted model from disk so the evaluation uses the saved artifact.
# NOTE(review): this file name is hard-coded, while the save step derives its
# name from best_model_name; the two only match when SVM was selected — confirm.
svm_model_file = 'SVM_Model.joblib'
SVM_model = joblib.load(svm_model_file)

# 5.Evaluate Performance on Validation Set

In [7]:
# Score the reloaded SVM on the held-out validation split.
val_predictions = SVM_model.predict(X_val)

# Per-class precision / recall / F1 table.
# NOTE(review): the display names are matched positionally to the label values,
# so this assumes the encoded classes are ordered TP, BP, FP — confirm against
# the encoding step.
class_names = ['TP', 'BP', 'FP']
print("Classification Report:")
print(classification_report(y_val, val_predictions, target_names=class_names))

# Macro-averaged summary metrics, computed in the order F1, precision, recall.
macro_f1, precision, recall = (
    metric(y_val, val_predictions, average='macro')
    for metric in (f1_score, precision_score, recall_score)
)

print(f"Macro-F1 Score: {macro_f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Classification Report:
              precision    recall  f1-score   support

          TP       0.65      0.91      0.76      3014
          BP       0.69      0.39      0.50      1430
          FP       0.79      0.61      0.69      2476

    accuracy                           0.70      6920
   macro avg       0.71      0.64      0.65      6920
weighted avg       0.71      0.70      0.68      6920

Macro-F1 Score: 0.6486822902053254
Precision: 0.7110703560222132
Recall: 0.6363493704743425


# 6.Hyperparameter Tuning using RandomizedSearchCV

In [8]:
# Hyperparameter search space for the SVC.
# Note: `degree` is only used by the 'poly' kernel and `gamma` is not used by
# the 'linear' kernel, so some sampled combinations are functionally identical
# and the reported best `degree`/`gamma` may be irrelevant to the best kernel.
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [3, 4, 5]  # Degree for polynomial kernel
}

# Fresh, unfitted estimator used as the template for the search. It gets its
# own name so that the fitted `SVM_model` loaded earlier is not replaced by an
# unfitted one (which would break re-running the earlier evaluation cell).
# SVC is already imported in the notebook's import cell.
svm_estimator = SVC()

# Randomized search: 20 sampled combinations, 3-fold CV, scored by macro-F1.
random_search = RandomizedSearchCV(
    svm_estimator,
    param_distributions=param_grid,
    n_iter=20,  # Number of combinations to try
    scoring='f1_macro',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Fit on the training split only; the validation split stays held out.
random_search.fit(X_train, y_train)

# Best parameters and the corresponding mean cross-validated macro-F1.
print("Best Parameters:", random_search.best_params_)
print("Best Macro-F1 Score:", random_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 5, 'C': 10}
Best Macro-F1 Score: 0.6371333386291709


# 6.1.Saving the Best Tuned Model using Joblib

In [10]:
# Write the best estimator found by the search to disk for later reuse.
tuned_model_file = 'Tuned_SVM_Model_RandomizedSearch.joblib'
joblib.dump(random_search.best_estimator_, tuned_model_file)
print(f"Best tuned model saved as '{tuned_model_file}'")

Best tuned model saved as 'Tuned_SVM_Model_RandomizedSearch.joblib'


# 7.Loading the Tuned Model

In [11]:
# Reload the tuned model from the same relative path the save step writes to,
# instead of a machine-specific absolute path. This guarantees the model that
# gets evaluated is the one produced by this run, wherever the notebook runs.
tuned_model = joblib.load('Tuned_SVM_Model_RandomizedSearch.joblib')

# 8.Evaluate on Validation Set

In [12]:
# Score the tuned model on the held-out validation split.
val_predictions = tuned_model.predict(X_val)

# Per-class precision / recall / F1 table.
# NOTE(review): the display names are matched positionally to the label values,
# so this assumes the encoded classes are ordered TP, BP, FP — confirm against
# the encoding step.
class_names = ['TP', 'BP', 'FP']
print("Validation Set Classification Report:")
print(classification_report(y_val, val_predictions, target_names=class_names))

# Macro-averaged summary metrics, computed in the order F1, precision, recall.
macro_f1, precision, recall = (
    metric(y_val, val_predictions, average='macro')
    for metric in (f1_score, precision_score, recall_score)
)

print(f"Macro-F1 Score (Validation): {macro_f1}")
print(f"Precision (Validation): {precision}")
print(f"Recall (Validation): {recall}")

Validation Set Classification Report:
              precision    recall  f1-score   support

          TP       0.69      0.82      0.75      3014
          BP       0.63      0.45      0.53      1430
          FP       0.73      0.67      0.70      2476

    accuracy                           0.69      6920
   macro avg       0.68      0.65      0.66      6920
weighted avg       0.69      0.69      0.68      6920

Macro-F1 Score (Validation): 0.6576224139008566
Precision (Validation): 0.6802794863967948
Recall (Validation): 0.6488417857078165


# 9.Final Evaluation on Test Set

In [13]:
# Final check: score the tuned model on the separate test dataset.
test_predictions = tuned_model.predict(X_test)

# Per-class precision / recall / F1 table.
# NOTE(review): the display names are matched positionally to the label values,
# so this assumes the encoded classes are ordered TP, BP, FP — confirm against
# the encoding step.
class_names = ['TP', 'BP', 'FP']
print("Test Set Classification Report:")
print(classification_report(y_test, test_predictions, target_names=class_names))

# Macro-averaged summary metrics, computed in the order F1, precision, recall.
macro_f1_test, precision_test, recall_test = (
    metric(y_test, test_predictions, average='macro')
    for metric in (f1_score, precision_score, recall_score)
)

print(f"Macro-F1 Score (Test): {macro_f1_test}")
print(f"Precision (Test): {precision_test}")
print(f"Recall (Test): {recall_test}")

Test Set Classification Report:
              precision    recall  f1-score   support

          TP       0.67      0.80      0.73     14874
          BP       0.54      0.40      0.46      6630
          FP       0.71      0.65      0.68     12990

    accuracy                           0.67     34494
   macro avg       0.64      0.61      0.62     34494
weighted avg       0.66      0.67      0.66     34494

Macro-F1 Score (Test): 0.6211796039888307
Precision (Test): 0.6394621297730483
Recall (Test): 0.6147857455515605
