In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, make_scorer
import numpy as np
import pandas as pd
import warnings

In [None]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
# plt.style.use('seaborn-v0_8-dark')

In [None]:
# Load data
df = pd.read_csv('../Data/train_data_scaled_imputed.csv', sep=',')
df_test = pd.read_csv('../Data/test_data.csv', sep=',')

In [4]:
df.set_index(df.columns[0], inplace=True)
df.index.name = None

In [5]:
df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,1197.0,0.0,1.0,49.0,0.0,7.0,,1.0,,44.0,0.0,27.0,10.0,62.0,3935.0,0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,2044.0,0.0,3.0,61.0,0.0,5.0,0.013836,0.0,0.041667,23.0,0.0,97.0,49.0,38.0,4606.0,1,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,894.0,0.0,3.0,35.0,0.0,0.0,,1.0,,56.0,1.0,79.0,7.0,10.0,3075.0,0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,0.0,0.970268,0.84842,,1710.0,1.0,1.0,13.0,0.0,0.0,,1.0,,62.0,1.0,16.0,43.0,36.0,3088.0,0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,893.0,0.0,2.0,51.0,0.0,3.0,,1.0,,44.0,3.0,31.0,10.0,38.0,2362.0,0,0.833333


In [6]:
# we do this only for testing purposes, this needs fixing
df.drop(columns=['C-2 Date', 'C-3 Date', 'First Hearing Date', 'IME-4 Count'], inplace=True)

In [None]:
# we do this for testing purposes, this needs to be ran on the whole dataset
df = df.sample(n=10000)

In [8]:
X, y = df.drop(columns='Claim Injury Type'), df['Claim Injury Type']

In [9]:
# Initial train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

In [10]:
# Define models to test
models = {
    # 'RandomForest': RandomForestClassifier(random_state=55),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=20)
}

# Define parameter grid for each model
param_grid = [
    # {
    #     'feature_selection__estimator': [RandomForestClassifier(random_state=55)],
    #     'feature_selection__n_features_to_select': [1, 2, 3, 4],
    #     'classifier': [RandomForestClassifier(random_state=55)],
    #     'classifier__n_estimators': [50, 100, 200],
    #     'classifier__max_depth': [3, 5, 10]
    # },
    {
        'feature_selection__estimator': [LogisticRegression(max_iter=1000, random_state=20)],
        'feature_selection__n_features_to_select': [5, 8, 12, 15, 18, 20],
        'classifier': [LogisticRegression(max_iter=1000, random_state=20)],
        'classifier__C': [0.1, 1],
        'classifier__solver': ['lbfgs', 'sag', 'saga']
    }
]

In [11]:
# Outer cross-validation on training data
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)

In [12]:
# Scoring function
f1_macro = make_scorer(f1_score, average='macro')

In [14]:
outer_scores = []

# Perform nested cross-validation on the training data
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Pipeline with RFE and model placeholder
    pipeline = Pipeline([
        ('feature_selection', RFE(estimator=LogisticRegression(max_iter=1000, random_state=20))),  # Placeholder
        ('classifier', LogisticRegression(max_iter=1000, random_state=20))   # Placeholder
    ])
    
    # Inner cross-validation and grid search
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=20)
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring=f1_macro)
    
    # Fit on inner cross-validation
    grid_search.fit(X_train_fold, y_train_fold)
    
    # Best model evaluation on the outer validation fold
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val_fold)
    outer_score = f1_score(y_val_fold, y_val_pred, average='macro')  # Use f1_score directly
    outer_scores.append(outer_score)
    
    # Print results for this outer fold
    print(f"Outer Fold Score: {outer_score:.4f}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print("-" * 30)

# Average score across outer folds (performance estimate)
print(f"Mean Macro F1 Score (Outer CV): {np.mean(outer_scores):.4f}")
print(f"Standard Deviation (Outer CV): {np.std(outer_scores):.4f}")

# Final model evaluation on the test set
final_model = grid_search.best_estimator_
final_model.fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)
test_score = f1_score(y_test, y_test_pred, average='macro')
print(f"Final Test Set Macro F1 Score: {test_score:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Outer Fold Score: 0.2452
Best Parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=20), 'classifier__C': 1, 'classifier__solver': 'sag', 'feature_selection__estimator': LogisticRegression(max_iter=1000, random_state=20), 'feature_selection__n_features_to_select': 18}
------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Outer Fold Score: 0.2497
Best Parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=20), 'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__estimator': LogisticRegression(max_iter=1000, random_state=20), 'feature_selection__n_features_to_select': 18}
------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Outer Fold Score: 0.2540
Best Parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=20), 'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__estimator': LogisticRegression(max_iter=1000, random_state=20), 'feature_selection__n_features_to_select': 18}
------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Outer Fold Score: 0.2566
Best Parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=20), 'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__estimator': LogisticRegression(max_iter=1000, random_state=20), 'feature_selection__n_features_to_select': 18}
------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Outer Fold Score: 0.2533
Best Parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=20), 'classifier__C': 1, 'classifier__solver': 'lbfgs', 'feature_selection__estimator': LogisticRegression(max_iter=1000, random_state=20), 'feature_selection__n_features_to_select': 18}
------------------------------
Mean Macro F1 Score (Outer CV): 0.2518
Standard Deviation (Outer CV): 0.0040


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Final Test Set Macro F1 Score: 0.2461


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
