In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

In [16]:
# Load the dataset
VT = pd.read_csv("data/processed_VT_Data.csv")
pd.set_option('display.max_columns', None)

# Separate features and target variable
X = VT.drop(columns=["Training_Program", "surgeries", "chronic_conditions"])
y = VT["Training_Program"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12, stratify=y)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Generate synthetic data with SMOTE
smote = SMOTE(random_state=12)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Original Training Data Distribution:")
print(pd.Series(y_train).value_counts())
print("\nTraining Data Distribution After SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Define the Logistic Regression model
logreg_model = LogisticRegression(max_iter=500, random_state=12)

# Define hyperparameters for optimization
param_dist = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'multi_class': ['ovr', 'multinomial'],
    'class_weight': [None, 'balanced']
}

# Perform Randomized Search with Cross-Validation
random_search_logreg = RandomizedSearchCV(
    estimator=logreg_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=2,
    random_state=12,
    verbose=1,
    n_jobs=-1
)

# Fit the RandomizedSearchCV
random_search_logreg.fit(X_train_resampled, y_train_resampled)

# Best parameters and model performance
print("Best Parameters for Logistic Regression:", random_search_logreg.best_params_)

# Evaluate the best model
best_logreg = random_search_logreg.best_estimator_
y_pred_logreg = best_logreg.predict(X_test)
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_logreg))

Original Training Data Distribution:
Training_Program
0    26
2    24
3    22
1    21
4    18
5    18
6    12
7    11
Name: count, dtype: int64

Training Data Distribution After SMOTE:
Training_Program
7    26
1    26
2    26
4    26
6    26
3    26
5    26
0    26
Name: count, dtype: int64
Fitting 2 folds for each of 50 candidates, totalling 100 fits
Best Parameters for Logistic Regression: {'solver': 'saga', 'multi_class': 'multinomial', 'class_weight': 'balanced', 'C': 10}
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.75      0.60      0.67         5
           2       0.33      0.33      0.33         6
           3       0.29      0.33      0.31         6
           4       0.25      0.20      0.22         5
           5       0.29      0.50      0.36         4
           6       0.50      0.67      0.57         3
           7       0.50      0.33      

12 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\licop\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\licop\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\licop\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1267, in fit
    multi_class = _check_multi_class(multi_class, solver, len(self.classes_))
                  ^^^^^^^^^^^^^^^^^^^^^^^