In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

#### Logistic Regression Implementation¶

To get a baseline, we will use all of the features after encoding the categorical variables. We will preprocess the data by filling in the missing values (imputation) and normalizing the range of the features (feature scaling). The following code performs both of these preprocessing steps.

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# 1️⃣ Median imputation for all features
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
X_test_imputed = pd.DataFrame(imputer.transform(X_test),
                              columns=X_test.columns,
                              index=X_test.index)

# 2️⃣ MinMax scaling for all features
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed),
                              columns=X_train.columns,
                              index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_imputed),
                             columns=X_test.columns,
                             index=X_test.index)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (7694, 155)
X_test_scaled shape: (1924, 155)


#### Logistic Regression

In [7]:
y_train

Unnamed: 0,Target
0,Low
1,Low
2,Low
3,High
4,Low
...,...
7689,Low
7690,Low
7691,Low
7692,Medium


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Map target to numeric
y_train_numeric = y_train['Target'].map({'Low': 0, 'Medium': 1, 'High': 2})
y_test_numeric = y_test['Target'].map({'Low': 0, 'Medium': 1, 'High': 2})

# Create logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Fit model
model.fit(X_train_scaled, y_train_numeric)

# Predict on test set
y_pred = model.predict(X_test_scaled)

# Evaluate
print("Classification Report:")
print(classification_report(y_test_numeric, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test_numeric, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1256
           1       0.86      0.67      0.76       574
           2       0.82      0.56      0.67        94

    accuracy                           0.87      1924
   macro avg       0.85      0.74      0.78      1924
weighted avg       0.86      0.87      0.86      1924

Confusion Matrix:
[[1225   30    1]
 [ 176  387   11]
 [   7   34   53]]


In [12]:
from sklearn.metrics import log_loss

# Make probability predictions
y_pred_proba = model.predict_proba(X_test_scaled)

# Compute log loss
loss = log_loss(y_test_numeric, y_pred_proba)
print("Log Loss:", loss)

Log Loss: 0.3393970206157056


In [None]:
import re
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb



# Fixed parameters (no early stopping in RF mode)
fixed_params = {
    'eval_metric': 'multi_logloss',
    #'verbose': 100
}

num_random_iters = 20
num_cv_folds = 5

# Parameter distributions for LightGBM RF
rf_params = {
    'num_leaves': randint(5, 50),
    'subsample': uniform(0.4, 0.55),        # 0.4 to <1.0
    'subsample_freq': randint(1, 10),
    'colsample_bytree': uniform(0.4, 0.55), # 0.4 to <1.0 (essential for RF)
    'max_depth': randint(3, 20)
}

ens = lgb.LGBMClassifier(
    boosting_type='rf',
    n_estimators=2000,     # Large number since no early stopping
    max_depth=-1,
    random_state=42,
    n_jobs=-1
)

cv = RandomizedSearchCV(
    estimator=ens,
    param_distributions=rf_params,
    n_iter=num_random_iters,
    cv=num_cv_folds,
    scoring='neg_log_loss',
    refit=True,
    random_state=42,
    verbose=True
)

cv.fit(X_train, y_train, **fixed_params)

print("Best parameters:", cv.best_params_)
print("Best CV score:", cv.best_score_)

# Evaluate best model on test set
best_model = cv.best_estimator_
test_predictions = best_model.predict_proba(X_test)
# Or predict for classes: best_model.predict(X_test_encoded)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1133
[LightGBM] [Info] Number of data points in the train set: 6155, number of used features: 129
[LightGBM] [Info] Start training from score -3.017910
[LightGBM] [Info] Start training from score -0.426232
[LightGBM] [Info] Start training from score -1.210220
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1134
[LightGBM] [Info] Number of data points in the train set: 6155, number of used features: 129
[LightGBM] [Info] Start training from score -3.017910
[LightGBM] [Info