# Import the Dataset and Explore the Data <a name="import-the-dataset-and-explore-the-data"></a>


## 1.1 Importing Libraries <a name="11-importing-libraries"></a>


In [1]:
!pip install xgboost



In [2]:
!pip install lightgbm



In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

%config InlineBackend.figure_format = 'retina'

## Loading the previous Datasets <a name="12-loading-and-reading-the-dataset"></a>


In [4]:
X_train = pd.read_csv('X_train_nb2.csv', index_col=0)
X_val = pd.read_csv('X_val_nb2.csv', index_col=0)
X_test = pd.read_csv('X_test_nb2.csv', index_col=0)
X_test_original = pd.read_csv('test_data.csv', index_col=0)

In [5]:
y_train = pd.read_csv("y_train_nb2.csv", index_col=0)
y_val = pd.read_csv("y_val.csv", index_col=0)
#uploading it as a 1-column dataframe while preserving its index, to make sure it aligns

In [6]:
y_train = y_train.iloc[:, 0]
y_val = y_val.iloc[:, 0]

In [7]:
def export_predictions(y_pred_test, X_test_original, filename="final_submission.csv"):
    
    # Convert predictions to a pandas Series and ensure integer type
    y_pred_test = pd.Series(y_pred_test)

    y_pred_test = y_pred_test.astype(int)
    
    # Replace prediction codes with their corresponding labels
    y_pred_test = y_pred_test.replace({
        1: '1. CANCELLED',
        2: '2. NON-COMP',
        3: '3. MED ONLY',
        4: '4. TEMPORARY',
        5: '5. PPD SCH LOSS',
        6: '6. PPD NSL',
        7: '7. PTD',
        8: '8. DEATH'
    })
    
    X_test_original = X_test_original.reset_index(drop=True)
    y_pred_test = y_pred_test.reset_index(drop=True)
    
    # Combine 'Claim Identifier' and predictions into a new DataFrame
    submission = pd.DataFrame({
        'Claim Identifier': X_test_original['Claim Identifier'],
        'Claim Injury Type': y_pred_test
    })
    
    # Export to CSV
    submission.to_csv(filename, index=False)
    print(f"File exported successfully as {filename}")

# 6. Model Assessment and Selection

## 6.1 Random Forest

In [8]:
# Define the RandomForestClassifier model
rf_model = RandomForestClassifier(
    n_estimators=300,  # Increase the number of trees
    max_depth=20,  # Limit the maximum depth of the trees
    min_samples_split=5,  # Set the minimum number of samples required to split a node
    min_samples_leaf=2,  # Set the minimum number of samples in a leaf
    max_features='sqrt',  # Limit the number of features considered at each split
    class_weight='balanced',  # Adjust weights to handle imbalanced classes
    random_state=42
)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val)

# Generate classification report
class_report = classification_report(y_val, y_pred, output_dict=True)

# Extract only precision, recall, and F1-score
print("Precision, Recall, and F1-Score for each class:")
for label, metrics in class_report.items():
    if label.isdigit():  # Ensure we only print class-related metrics
        print(f"Class {label}:")
        print(f"  Precision: {metrics['precision']:.2f}")
        print(f"  Recall: {metrics['recall']:.2f}")
        print(f"  F1-Score: {metrics['f1-score']:.2f}")
        
test_results = rf_model.predict(X_test)

Precision, Recall, and F1-Score for each class:
Class 1:
  Precision: 0.36
  Recall: 0.63
  F1-Score: 0.45
Class 2:
  Precision: 0.86
  Recall: 0.86
  F1-Score: 0.86
Class 3:
  Precision: 0.31
  Recall: 0.23
  F1-Score: 0.27
Class 4:
  Precision: 0.75
  Recall: 0.60
  F1-Score: 0.66
Class 5:
  Precision: 0.51
  Recall: 0.79
  F1-Score: 0.62
Class 6:
  Precision: 0.11
  Recall: 0.33
  F1-Score: 0.17
Class 7:
  Precision: 0.02
  Recall: 0.07
  F1-Score: 0.03
Class 8:
  Precision: 0.13
  Recall: 0.66
  F1-Score: 0.22


In [9]:
rf_precision = precision_score(y_val, y_pred, average='macro').round(4)
rf_precision

0.3804

In [10]:
rf_recall = recall_score(y_val, y_pred, average='macro').round(4)
rf_recall

0.5205

In [11]:
rf_f1 = f1_score(y_val, y_pred, average='macro').round(4)
rf_f1

0.4095

In [12]:
export_predictions(test_results, X_test_original, "rf_model.csv")

File exported successfully as rf_model.csv


## 6.2 Logistic Regression

In [13]:
log_model = LogisticRegression(
    solver='saga',         # Efficient solver for large datasets
    multi_class='multinomial',  # Better for multi-class problems
    penalty='l2',          # Use L2 regularization
    C=1.0,                 # Default regularization strength
    max_iter=500,          # Increase max iterations
    class_weight='balanced',  # Handle class imbalance
    n_jobs=-1,             # Use all CPU cores for computation
    random_state=42,       # Reproducibility
    verbose=1              # Monitor training progress
)

In [14]:
log_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


convergence after 64 epochs took 20 seconds


In [15]:
y_pred_train = log_model.predict(X_train)
y_pred = log_model.predict(X_val)

In [16]:
log_precision = precision_score(y_val, y_pred, average='macro').round(4)
log_precision

0.2877

In [17]:
log_recall = recall_score(y_val, y_pred, average='macro').round(4)
log_recall

0.5224

In [18]:
log_f1 = f1_score(y_val, y_pred, average='macro').round(4)
log_f1

0.275

In [21]:
test_results = log_model.predict(X_test)

In [22]:
export_predictions(test_results, X_test_original, "log_model.csv")

File exported successfully as log_model.csv


## 6.3 Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Define the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
dt_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = dt_model.predict(X_val)

# Generate classification report for Decision Tree
class_report_dt = classification_report(y_val, y_pred, output_dict=True)

# Initialize variables to calculate the mean of precision, recall, and F1-score
precision_sum, recall_sum, f1_score_sum, class_count = 0, 0, 0, 0

print("Precision, Recall, and F1-Score for each class (Decision Tree):")
for label, metrics in class_report_dt.items():
    if label.isdigit():  # Ensure we only process metrics for classes
        precision_sum += metrics['precision']
        recall_sum += metrics['recall']
        f1_score_sum += metrics['f1-score']
        class_count += 1
        print(f"Class {label}:")
        print(f"  Precision: {metrics['precision']:.2f}")
        print(f"  Recall: {metrics['recall']:.2f}")
        print(f"  F1-Score: {metrics['f1-score']:.2f}")
        
test_results = dt_model.predict(X_test)

Precision, Recall, and F1-Score for each class (Decision Tree):
Class 1:
  Precision: 0.26
  Recall: 0.49
  F1-Score: 0.34
Class 2:
  Precision: 0.85
  Recall: 0.74
  F1-Score: 0.79
Class 3:
  Precision: 0.18
  Recall: 0.31
  F1-Score: 0.23
Class 4:
  Precision: 0.69
  Recall: 0.51
  F1-Score: 0.59
Class 5:
  Precision: 0.47
  Recall: 0.57
  F1-Score: 0.52
Class 6:
  Precision: 0.07
  Recall: 0.19
  F1-Score: 0.11
Class 7:
  Precision: 0.00
  Recall: 0.00
  F1-Score: 0.00
Class 8:
  Precision: 0.14
  Recall: 0.37
  F1-Score: 0.21


In [24]:
dt_precision = round(precision_sum / class_count, 4)
dt_precision

0.3349

In [25]:
dt_recall = round(recall_sum / class_count, 4)
dt_recall

0.3986

In [26]:
dt_f1 = round(f1_score_sum / class_count, 4)
dt_f1

0.3484

In [27]:
export_predictions(test_results, X_test_original, "dt_model.csv")

File exported successfully as dt_model.csv


## 6.4 Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Define the Naive Bayes model (Gaussian Naive Bayes for continuous features)
nb_model = GaussianNB()

# Fit the model to the training data
nb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = nb_model.predict(X_val)

# Generate classification report for Naive Bayes
class_report_nb = classification_report(y_val, y_pred, output_dict=True)

# Initialize variables to calculate the mean of precision, recall, and F1-score
precision_sum, recall_sum, f1_score_sum, class_count = 0, 0, 0, 0

print("Precision, Recall, and F1-Score for each class (Naive Bayes):")
for label, metrics in class_report_nb.items():
    if label.isdigit():  # Ensure we only process metrics for classes
        precision_sum += metrics['precision']
        recall_sum += metrics['recall']
        f1_score_sum += metrics['f1-score']
        class_count += 1
        print(f"Class {label}:")
        print(f"  Precision: {metrics['precision']:.2f}")
        print(f"  Recall: {metrics['recall']:.2f}")
        print(f"  F1-Score: {metrics['f1-score']:.2f}")
        
test_results = nb_model.predict(X_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision, Recall, and F1-Score for each class (Naive Bayes):
Class 1:
  Precision: 0.14
  Recall: 0.52
  F1-Score: 0.22
Class 2:
  Precision: 0.75
  Recall: 0.81
  F1-Score: 0.78
Class 3:
  Precision: 0.26
  Recall: 0.08
  F1-Score: 0.13
Class 4:
  Precision: 0.46
  Recall: 0.10
  F1-Score: 0.16
Class 5:
  Precision: 0.47
  Recall: 0.27
  F1-Score: 0.35
Class 6:
  Precision: 0.00
  Recall: 0.00
  F1-Score: 0.00
Class 7:
  Precision: 0.00
  Recall: 0.97
  F1-Score: 0.00
Class 8:
  Precision: 0.01
  Recall: 0.44
  F1-Score: 0.03


In [29]:
nb_precision = round(precision_sum / class_count, 4)
nb_precision

0.2617

In [30]:
nb_recall = round(recall_sum / class_count, 4)
nb_recall

0.3976

In [31]:
nb_f1 = round(f1_score_sum / class_count, 4)
nb_f1

0.2069

In [32]:
export_predictions(test_results, X_test_original, "nb_model.csv")

File exported successfully as nb_model.csv


## 6.5 Neural Networks

In [33]:
# Define the model with your specifications
nn_model = MLPClassifier(
    hidden_layer_sizes=(int(X_train.shape[1] * 0.8), int(X_train.shape[1] * 0.5)),  # 2 hidden layers with fewer neurons than features
    activation='relu',  # ReLU activation function
    solver='adam',      # Adam optimizer
    learning_rate_init=0.01,  # Initial learning rate of 0.01
    max_iter=500,       # Maximum number of iterations
    random_state=42     # Ensures reproducibility
)

# Train the model with the scaled training data
nn_model.fit(X_train, y_train)

# Evaluate on the validation set
y_pred = nn_model.predict(X_val)  # Assuming X_val is already scaled

# Calculate F1-score and generate classification report
f1 = f1_score(y_val, y_pred, average='weighted')
class_report = classification_report(y_val, y_pred)

# Print results
print(f"F1 Score (Weighted): {f1:.2f}")
print("Classification Report for Validation Set:\n", class_report)

test_results = nn_model.predict(X_test)

F1 Score (Weighted): 0.61
Classification Report for Validation Set:
               precision    recall  f1-score   support

           1       0.24      0.49      0.32      3743
           2       0.79      0.85      0.82     87324
           3       0.29      0.13      0.18     20672
           4       0.64      0.38      0.48     44552
           5       0.46      0.57      0.51     14484
           6       0.05      0.53      0.10      1263
           7       0.00      0.34      0.01        29
           8       0.08      0.72      0.15       141

    accuracy                           0.61    172208
   macro avg       0.32      0.50      0.32    172208
weighted avg       0.65      0.61      0.61    172208



In [34]:
nn_precision = precision_score(y_val, y_pred, average='macro').round(4)
nn_precision

0.3197

In [35]:
nn_recall = recall_score(y_val, y_pred, average='macro').round(4)
nn_recall

0.5034

In [36]:
nn_f1 = f1_score(y_val, y_pred, average='macro').round(4)
nn_f1

0.3208

In [37]:
export_predictions(test_results, X_test_original, "nn_model.csv")

File exported successfully as nn_model.csv


## 6.6 Light GBM

In [91]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform y_train and y_val
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Create LightGBM Datasets with encoded classes
train_data = lgb.Dataset(X_train, label=y_train_encoded)
val_data = lgb.Dataset(X_val, label=y_val_encoded, reference=train_data)

# Model hyperparameters
params = {
    'objective': 'multiclass',
    'num_class': len(label_encoder.classes_),  # Use the actual number of classes
    'metric': 'multi_logloss',  # Evaluation metric
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Configure early stopping as a callback
callbacks = [lgb.early_stopping(100)]  # Stop after 100 rounds with no improvement

# Train the model with early stopping callback
light_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
    callbacks=callbacks  # Pass the early stopping callback
)

# Predict probabilities for the validation set
y_pred_prob = light_model.predict(X_val, num_iteration=light_model.best_iteration)

# Convert probabilities to predicted classes
y_pred_classes = [list(x).index(max(x)) for x in y_pred_prob]
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_val_labels = label_encoder.inverse_transform(y_val_encoded)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_val_labels, y_pred_labels))

test_results_prob = light_model.predict(X_test, num_iteration=light_model.best_iteration)

# Convert probabilities to predicted class indices
test_results_classes = [list(x).index(max(x)) for x in test_results_prob]

# Convert class indices back to original labels using the LabelEncoder
test_results = label_encoder.inverse_transform(test_results_classes)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[610]	training's multi_logloss: 0.478656	valid_1's multi_logloss: 0.627269
Classification Report:
              precision    recall  f1-score   support

           1       0.56      0.55      0.56      3743
           2       0.86      0.94      0.90     87324
           3       0.37      0.21      0.27     20672
           4       0.77      0.73      0.75     44552
           5       0.58      0.75      0.65     14484
           6       0.15      0.08      0.10      1263
           7       0.14      0.03      0.06        29
           8       0.38      0.41      0.39       141

    accuracy                           0.77    172208
   macro avg       0.48      0.46      0.46    172208
weighted avg       0.74      0.77      0.75    172208



In [92]:
light_precision = precision_score(y_val_encoded, y_pred_classes, average='macro').round(4)
light_precision

0.477

In [93]:
light_recall = recall_score(y_val_encoded, y_pred_classes, average='macro').round(4)
light_recall

0.4627

In [94]:
light_f1 = f1_score(y_val_encoded, y_pred_classes, average='macro').round(4)
light_f1

0.4599

In [95]:
test_results.shape

(387975,)

In [96]:
export_predictions(test_results, X_test_original, "light_model.csv")

File exported successfully as light_model.csv


## 6.7 XGBoost

In [71]:
# 1. Reindex classes to start from 0
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# 2. Create the XGBoost model
xg_model = xgb.XGBClassifier(
    objective='multi:softmax',  # For multiclass classification
    num_class=8,                # Number of classes
    eval_metric='mlogloss',     # Evaluation metric
    use_label_encoder=False     # Disable XGBoost's label encoder
)

# 3. Train the model
xg_model.fit(X_train, y_train_encoded)

# 4. Make predictions on the validation set
y_pred_encoded = xg_model.predict(X_val)

# 5. Decode predictions back to the original labels
y_pred = le.inverse_transform(y_pred_encoded)

# 6. Evaluate the model

# Classification report (precision, recall, f1-score for each class)
print("\nClassification Report:")
print(classification_report(y_val_encoded, y_pred_encoded))

# Confusion matrix
cm = confusion_matrix(y_val_encoded, y_pred_encoded)
print("\nConfusion Matrix:")
print(cm)

test_results = xg_model.predict(X_test)

replacement_map = {i: i + 1 for i in range(8)}

test_results = [replacement_map[val] for val in test_results]

Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.53      0.53      3743
           1       0.86      0.94      0.90     87324
           2       0.36      0.20      0.25     20672
           3       0.77      0.71      0.74     44552
           4       0.57      0.75      0.65     14484
           5       0.16      0.15      0.15      1263
           6       0.06      0.07      0.06        29
           7       0.27      0.52      0.36       141

    accuracy                           0.76    172208
   macro avg       0.45      0.48      0.46    172208
weighted avg       0.74      0.76      0.75    172208


Confusion Matrix:
[[ 1999  1423   247    43    21     1     0     9]
 [ 1427 82451  2732   452   202    13     0    47]
 [  170  9881  4048  4963  1494    80     5    31]
 [  152  2127  3669 31490  6245   750    20    99]
 [    9    92   454  2957 10827   136     2     7]
 [    0     0    30   770   272   184     4     3]


In [72]:
xg_precision = precision_score(y_val_encoded, y_pred_encoded, average='macro').round(4)
xg_precision

0.4479

In [73]:
xg_recall = recall_score(y_val_encoded, y_pred_encoded, average='macro').round(4)
xg_recall

0.4835

In [74]:
xg_f1 = f1_score(y_val_encoded, y_pred_encoded, average='macro').round(4)
xg_f1

0.4556

In [75]:
export_predictions(test_results, X_test_original, "xg_model.csv")

File exported successfully as xg_model.csv


## Voting Classifier

It was tested the Voting Classifier in an attempt to improve the model, but the score was very low.

In [50]:
from sklearn.preprocessing import LabelEncoder

# 1. Apply LabelEncoder to reindex labels to start from 0
le = LabelEncoder()

# Fit the encoder on y_train and transform y_train
y_train_encoded = le.fit_transform(y_train)

# Now check the unique labels in y_train_encoded (should start from 0)
print(f"Unique labels after encoding: {np.unique(y_train_encoded)}")

# 2. Fit the XGBoost model with the encoded labels
if not hasattr(xg_model, 'booster_'):
    xg_model.fit(X_train, y_train_encoded)  # Fit using the encoded labels

# 3. Now predict probabilities with the fitted model
proba_rf = rf_model.predict_proba(X_val)  # Random Forest probabilities
proba_lr = log_model.predict_proba(X_val)  # Logistic Regression probabilities
proba_xgb = xg_model.predict_proba(X_val)  # XGBoost probabilities

# 4. Combine the probabilities using Soft Voting (average of probabilities)
soft_voting_proba = (proba_rf + proba_lr + proba_xgb) / 3  # Average probabilities

# 5. Final prediction: class with highest summed probability
soft_voting_preds = np.argmax(soft_voting_proba, axis=1)

# 6. Decode the predictions back to the original labels
y_pred = le.inverse_transform(soft_voting_preds)

# 7. Evaluate the model (classification report without accuracy)
print("Classification Report (Soft Voting):")
print(classification_report(y_val, y_pred))

# Optional: F1-score (macro average)
f1 = f1_score(y_val, y_pred, average='macro')
print(f"F1-Score (Macro Average): {f1:.4f}")

Unique labels after encoding: [0 1 2 3 4 5 6 7]


Parameters: { "use_label_encoder" } are not used.



Classification Report (Soft Voting):
              precision    recall  f1-score   support

           1       0.36      0.61      0.46      3743
           2       0.87      0.90      0.88     87324
           3       0.34      0.21      0.26     20672
           4       0.77      0.62      0.69     44552
           5       0.51      0.78      0.62     14484
           6       0.12      0.30      0.17      1263
           7       0.01      0.14      0.02        29
           8       0.10      0.73      0.18       141

    accuracy                           0.72    172208
   macro avg       0.39      0.54      0.41    172208
weighted avg       0.73      0.72      0.72    172208

F1-Score (Macro Average): 0.4099


In [51]:
# Data for the table
data = {
    'Model': [
        'Random Forest', 'Logistic Regression', 'Decision Tree', 
        'Naive Bayes', 'Neural Network', 'LightGBM', 'XGBoost'
    ],
    'Precision': [
        rf_precision, log_precision, dt_precision, nb_precision, nn_precision, 
        light_precision, xg_precision
    ],
    'Recall': [
        rf_recall, log_recall, dt_recall, nb_recall, nn_recall, 
        light_recall, xg_recall
    ],
    'F1-Score': [
        rf_f1, log_f1, dt_f1, nb_f1, nn_f1, 
        light_f1, xg_f1
    ]
}

# Creating the DataFrame
df = pd.DataFrame(data)

# Displaying the DataFrame
df

Unnamed: 0,Model,Precision,Recall,F1-Score
0,Random Forest,0.3804,0.5205,0.4095
1,Logistic Regression,0.2877,0.5224,0.275
2,Decision Tree,0.3349,0.3986,0.3484
3,Naive Bayes,0.2617,0.3976,0.2069
4,Neural Network,0.3197,0.5034,0.3208
5,LightGBM,0.477,0.4627,0.4599
6,XGBoost,0.4479,0.4835,0.4556


# Model Optimization

In addition to a few other models that we thought would provide superior performance, we tested five baseline models. We chose the two models with the greatest scores for additional optimization after analyzing the outcomes. We used cross-validation and grid search to improve their performance, which helped us optimize the hyperparameters and find the ideal values. By using this method, we were able to maximize the accuracy and overall performance of the model and attain better outcomes.

## Random Forest

In [52]:
from scipy.stats import randint
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split

# Stratified sampling of 10% from resampled data
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train, test_size=0.9, stratify=y_train, random_state=42
)

# Define the RandomForestClassifier model
model = RandomForestClassifier(random_state=42)

# Set up the Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define a randomized hyperparameter grid
param_distributions = {
    'n_estimators': randint(100, 500),       # Test between 100-500 trees
    'max_depth': [10, 20, 30, None],         # Range for tree depth
    'min_samples_split': [2, 5, 10],         # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],           # Minimum samples in a leaf node
    'max_features': ['sqrt', 'log2'],        # Feature selection strategies
    'bootstrap': [True, False],              # Use bootstrap sampling
    'class_weight': ['balanced']             # Handle class imbalance
}

# Set up the RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=50,  # Test 50 random combinations
    scoring='f1_macro',  # Optimize for macro-averaged F1 score
    cv=skf,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Fit the RandomizedSearchCV to the sampled data (10%)
random_search.fit(X_train_sample, y_train_sample)

# Get the best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Print the best hyperparameters
print(f"Best Hyperparameters: {random_search.best_params_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'bootstrap': False, 'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 121}


In [53]:
# Best hyperparameters from RandomizedSearchCV
best_params = {
    'bootstrap': False,
    'class_weight': 'balanced',
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 121
}

# Create the model with the best hyperparameters
grid_rf_model = RandomForestClassifier(random_state=42, **best_params)

# Train the model on the full training data
grid_rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = grid_rf_model.predict(X_val)

# Evaluate the model
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

test_results = grid_rf_model.predict(X_test)

Confusion Matrix:
[[ 2126  1081   322   126    69     3     0    16]
 [ 2044 76661  6458  1578   495    16     0    72]
 [  238  8650  4933  4925  1752   126     2    46]
 [  213  2856  4367 28969  6941  1055     6   145]
 [   17    98   525  2757 10880   192     1    14]
 [    0     1    39   719   283   215     1     5]
 [    0     0     0    17     5     5     1     1]
 [    5     7    20    22     2     3     2    80]]

Classification Report:
              precision    recall  f1-score   support

           1       0.46      0.57      0.51      3743
           2       0.86      0.88      0.87     87324
           3       0.30      0.24      0.26     20672
           4       0.74      0.65      0.69     44552
           5       0.53      0.75      0.62     14484
           6       0.13      0.17      0.15      1263
           7       0.08      0.03      0.05        29
           8       0.21      0.57      0.31       141

    accuracy                           0.72    172208
   macr

In [54]:
grid_rf_precision = precision_score(y_val, y_pred, average='macro').round(4)
grid_rf_precision

0.4133

In [55]:
grid_rf_recall = recall_score(y_val, y_pred, average='macro').round(4)
grid_rf_recall

0.4823

In [56]:
grid_rf_f1 = f1_score(y_val, y_pred, average='macro').round(4)
grid_rf_f1

0.4325

In [57]:
export_predictions(test_results, X_test_original, "grid_rf_model.csv")

File exported successfully as grid_rf_model.csv


## LightGBM

In [58]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# 1. Reduce dataset size to 10% for faster experimentation
X_train_sampled, _, y_train_sampled, _ = train_test_split(
    X_train, y_train, train_size=0.1, random_state=42, stratify=y_train
)

# 2. Reindex classes to start from 0
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_sampled)
y_val_encoded = le.transform(y_val)

# 3. Define the LightGBM model
model = LGBMClassifier(
    objective='multiclass', 
    num_class=len(le.classes_), 
    metric='multi_logloss',  # Log-loss is often used in multiclass classification
    verbose=-1              # Reduce verbosity
)

# 4. Define the parameter grid for GridSearch
param_grid = {
    'learning_rate': [0.05, 0.1],  # Learning rate options
    'n_estimators': [100, 150],    # Number of estimators (trees)
    'max_depth': [6, 10],          # Max depth of the trees
    'subsample': [0.8, 1.0],       # Subsampling ratio
    'colsample_bytree': [0.8]      # Feature subsampling ratio
}

# 5. GridSearchCV with 3-fold cross-validation (for faster search)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid, 
    scoring='f1_macro',  # Maximize the macro average F1-Score
    cv=3,                # 3-fold cross-validation to save time
    n_jobs=-1,           # Use all available processors
    verbose=1
)

# 6. Train the model with GridSearchCV
grid_search.fit(X_train_sampled, y_train_encoded)

# 7. Best parameter set found
print(f"Best parameters found: {grid_search.best_params_}")

# 8. Evaluate the model with the best parameters found
best_model = grid_search.best_estimator_

# 9. Make predictions on the validation set (assuming X_val and y_val are available)
y_pred_encoded = best_model.predict(X_val)

# 10. Evaluate the model

# Classification report
print("\nClassification Report:")
print(classification_report(y_val_encoded, y_pred_encoded))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters found: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150, 'subsample': 0.8}

Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.50      0.49      3743
           1       0.85      0.93      0.89     87324
           2       0.31      0.18      0.23     20672
           3       0.75      0.68      0.72     44552
           4       0.55      0.71      0.62     14484
           5       0.13      0.16      0.15      1263
           6       0.01      0.03      0.02        29
           7       0.19      0.58      0.28       141

    accuracy                           0.74    172208
   macro avg       0.41      0.47      0.42    172208
weighted avg       0.72      0.74      0.73    172208



In [97]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, f1_score

# 1. Definir o modelo com os melhores parâmetros
best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 150,
    'subsample': 0.8
}

grid_light_model = LGBMClassifier(
    objective='multiclass',
    num_class=len(y_train.unique()),  # Garantir que todas as classes sejam consideradas
    metric='multi_logloss',
    verbose=-1,
    random_state=42,
    **best_params  # Usar os melhores parâmetros encontrados
)

# 2. Reindexar as classes no conjunto de treino completo
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# 3. Treinar o modelo no conjunto de treino completo
grid_light_model.fit(X_train, y_train_encoded)

# 4. Fazer predições no conjunto de validação
y_pred_encoded = grid_light_model.predict(X_val)

# 5. Decodificar as predições para as classes originais
y_pred = le.inverse_transform(y_pred_encoded)

# Relatório de classificação completo
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

test_results_prob = light_model.predict(X_test, num_iteration=light_model.best_iteration)

# Convert probabilities to predicted class indices
test_results_classes = [list(x).index(max(x)) for x in test_results_prob]

# Convert class indices back to original labels using the LabelEncoder
test_results = label_encoder.inverse_transform(test_results_classes)


Classification Report:
              precision    recall  f1-score   support

           1       0.55      0.53      0.54      3743
           2       0.86      0.95      0.90     87324
           3       0.38      0.18      0.25     20672
           4       0.77      0.72      0.75     44552
           5       0.57      0.75      0.65     14484
           6       0.17      0.12      0.14      1263
           7       0.03      0.03      0.03        29
           8       0.29      0.49      0.36       141

    accuracy                           0.77    172208
   macro avg       0.45      0.47      0.45    172208
weighted avg       0.74      0.77      0.75    172208



In [98]:
grid_light_precision = precision_score(y_val, y_pred, average='macro').round(4)
grid_light_precision

0.4521

In [99]:
grid_light_recall = recall_score(y_val, y_pred, average='macro').round(4)
grid_light_recall

0.4727

In [100]:
grid_light_f1 = f1_score(y_val, y_pred, average='macro').round(4)
grid_light_f1

0.4524

In [101]:
export_predictions(test_results, X_test_original, "grid_light_model.csv")

File exported successfully as grid_light_model.csv


## XGBoost

In [64]:
# 1. Reduce dataset size to 10% for faster experimentation
X_train_sampled, _, y_train_sampled, _ = train_test_split(
    X_train, y_train, train_size=0.1, random_state=42, stratify=y_train
)

# 2. Reindex classes to start from 0
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_sampled)

# 3. Define the XGBoost model
model = xgb.XGBClassifier(
    objective='multi:softmax', 
    num_class=len(le.classes_), 
    use_label_encoder=False,
    tree_method='hist',   # Optimized for large datasets
    verbosity=0           # Reduce output verbosity
)

# 4. Define the parameter grid for GridSearch
param_grid = {
    'max_depth': [6, 10],         # Add more depth options for optimization
    'learning_rate': [0.1, 0.2, 0.3],  # Include smaller and larger learning rates
    'n_estimators': [100, 150],     # Test with more estimators
    'subsample': [0.8, 0.9],        # Subsampling ratio
    'colsample_bytree': [0.7, 0.8], # Feature subsampling ratio
    'gamma': [0, 0.1],         # Minimum loss reduction for further partitioning
    'reg_lambda': [1, 1.5],         # L2 regularization
    'reg_alpha': [0, 0.5]           # L1 regularization
}

# 5. GridSearchCV with 3-fold cross-validation (to save time and resources)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid, 
    scoring='f1_macro',  # Maximize macro average F1-Score
    cv=3,                # 3-fold for faster grid search
    n_jobs=-1,           # Use all processors
    verbose=1
)

# 6. Train the model with GridSearchCV
grid_search.fit(X_train_sampled, y_train_encoded)

# 7. Best parameter set found
print(f"Best parameters found: {grid_search.best_params_}")

# 8. Evaluate the model with the best parameters found
best_model = grid_search.best_estimator_

# 9. Make predictions on the validation set (assuming X_val and y_val are available)
y_pred_encoded = best_model.predict(X_val)

# 10. Decode predictions back to the original labels
y_pred = le.inverse_transform(y_pred_encoded)

# 11. Evaluate the model

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 384 candidates, totalling 1152 fits
Best parameters found: {'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 150, 'reg_alpha': 0, 'reg_lambda': 1.5, 'subsample': 0.8}

Classification Report:
              precision    recall  f1-score   support

           1       0.49      0.50      0.50      3743
           2       0.86      0.91      0.88     87324
           3       0.29      0.22      0.25     20672
           4       0.75      0.65      0.69     44552
           5       0.55      0.69      0.61     14484
           6       0.13      0.18      0.15      1263
           7       0.02      0.07      0.04        29
           8       0.18      0.61      0.28       141

    accuracy                           0.73    172208
   macro avg       0.41      0.48      0.43    172208
weighted avg       0.72      0.73      0.72    172208



In [65]:
# 1. Reindex classes to start from 0
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# 2. Create the XGBoost model with predefined parameters
grid_xg_model = xgb.XGBClassifier(
    objective='multi:softmax',  # For multiclass classification
    num_class=8,                # Number of classes
    eval_metric='mlogloss',     # Evaluation metric
    use_label_encoder=False,    # Disable XGBoost's label encoder
    colsample_bytree=0.7,       # Feature subsampling ratio
    gamma=0.1,                  # Minimum loss reduction for further partitioning
    learning_rate=0.3,          # Learning rate
    max_depth=6,                # Maximum tree depth
    n_estimators=150,           # Number of trees
    reg_alpha=0,                # L1 regularization
    reg_lambda=1.5,             # L2 regularization
    subsample=0.8               # Subsampling ratio
)

# 3. Train the grid_xg_model
grid_xg_model.fit(X_train, y_train_encoded)

# 4. Make predictions on the validation set
y_pred_encoded = grid_xg_model.predict(X_val)

# 5. Decode predictions back to the original labels
y_pred = le.inverse_transform(y_pred_encoded)

# 6. Evaluate the model

# Classification report (precision, recall, f1-score for each class)
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix
cm = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:")
print(cm)

test_results = grid_xg_model.predict(X_test)

replacement_map = {i: i + 1 for i in range(8)}

test_results = [replacement_map[val] for val in test_results]

Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

           1       0.54      0.54      0.54      3743
           2       0.86      0.94      0.90     87324
           3       0.36      0.20      0.26     20672
           4       0.77      0.71      0.74     44552
           5       0.57      0.75      0.65     14484
           6       0.17      0.13      0.15      1263
           7       0.06      0.03      0.04        29
           8       0.32      0.50      0.39       141

    accuracy                           0.76    172208
   macro avg       0.46      0.48      0.46    172208
weighted avg       0.74      0.76      0.75    172208


Confusion Matrix:
[[ 2026  1408   235    49    18     1     0     6]
 [ 1388 82324  2876   501   191     9     0    35]
 [  156  9739  4221  5013  1449    66     2    26]
 [  141  2027  3856 31746  6081   612     8    81]
 [   11    90   429  3048 10792   111     1     2]
 [    0     0    32   783   282   163     2     1]


In [66]:
grid_xg_precision = precision_score(y_val, y_pred, average='macro').round(4)
grid_xg_precision

0.4577

In [67]:
grid_xg_recall = recall_score(y_val, y_pred, average='macro').round(4)
grid_xg_recall

0.4766

In [68]:
grid_xg_f1 = f1_score(y_val, y_pred, average='macro').round(4)
grid_xg_f1

0.4592

In [69]:
export_predictions(test_results, X_test_original, "grid_xg_model.csv")

File exported successfully as grid_xg_model.csv


In [103]:
rf_kaggle = 0.32603
log_kaggle = 0.19487
dt_kaggle = 0.24308
nb_kaggle = 0.16755
nn_kaggle = 0.24528
light_kaggle = 0.41025
xg_kaggle = 0.39258
grid_rf_kaggle = 0.34096
grid_light_kaggle = 0.41025
grid_xg_kaggle = 0.40752

In [104]:
# Data for the table
data = {
    'Model': [
        'Random Forest', 'Logistic Regression', 'Decision Tree', 
        'Naive Bayes', 'Neural Network', 'LightGBM', 'XGBoost',
        'Grid Search Random Forest', 'Grid Search LightGBM', 'Grid Search XGBoost'
    ],
    'Precision': [
        rf_precision, log_precision, dt_precision, nb_precision, nn_precision, 
        light_precision, xg_precision,
        grid_rf_precision, grid_light_precision, grid_xg_precision
    ],
    'Recall': [
        rf_recall, log_recall, dt_recall, nb_recall, nn_recall, 
        light_recall, xg_recall,
        grid_rf_recall, grid_light_recall, grid_xg_recall
    ],
    'F1-Score': [
        rf_f1, log_f1, dt_f1, nb_f1, nn_f1, 
        light_f1, xg_f1,
        grid_rf_f1, grid_light_f1, grid_xg_f1
    ],
    'Kaggle Score': [
        rf_kaggle, log_kaggle, dt_kaggle, nb_kaggle,
        nn_kaggle, light_kaggle, xg_kaggle, grid_rf_kaggle,
        grid_light_kaggle, grid_xg_kaggle
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Model,Precision,Recall,F1-Score,Kaggle Score
0,Random Forest,0.3804,0.5205,0.4095,0.32603
1,Logistic Regression,0.2877,0.5224,0.275,0.19487
2,Decision Tree,0.3349,0.3986,0.3484,0.24308
3,Naive Bayes,0.2617,0.3976,0.2069,0.16755
4,Neural Network,0.3197,0.5034,0.3208,0.24528
5,LightGBM,0.477,0.4627,0.4599,0.41025
6,XGBoost,0.4479,0.4835,0.4556,0.39258
7,Grid Search Random Forest,0.4133,0.4823,0.4325,0.34096
8,Grid Search LightGBM,0.4521,0.4727,0.4524,0.41025
9,Grid Search XGBoost,0.4577,0.4766,0.4592,0.40752


# 7.Final Prediction

Our best model is `LightGBM`, based on all metrics, having the best *Precision* and best *F1-Score Macro*

In [105]:
best_model = light_model

In [107]:
# Save the trained model to a file
joblib.dump(grid_rf_model, 'model.pkl')

print("Model successfully exported as model.pkl")

Model successfully exported as model.pkl
