In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from category_encoders import CountEncoder, TargetEncoder
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder
import warnings

from sklearn.compose import make_column_transformer

# Tag that ends up in the name of the submission file written in the last cell.
model_number = 'hyperparameter_tuning'

# Lift pandas' display limits so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [None]:
# Load the train and test CSV files (paths are relative to the notebook's working directory).
df_train = pd.read_csv('../input/playground-series-s3e22/train.csv')
df_test = pd.read_csv('../input/playground-series-s3e22/test.csv')

# Preview the first training rows.
df_train.head()

In [None]:
# Columns excluded from modelling (only the row identifier).
dropped_columns = ['id']

df_train_model = df_train.drop(columns=dropped_columns)
df_test_model = df_test.drop(columns=dropped_columns)

# Separate the 'outcome' target from the training features.
y_train = df_train_model['outcome']
X_train = df_train_model.drop(columns='outcome')

# Test features: an independent copy of the test frame without the dropped columns.
X_test = df_test_model.copy()

In [None]:
# Peek at the first five target values.
y_train[:5]

In [None]:
# Name of the target column.
TARGET = "outcome"

# Stack train and test features row-wise and show the three shapes side by side.
combined_df = pd.concat([X_train, X_test])
combined_df.shape, X_train.shape, X_test.shape

In [None]:
# Preview the first ten rows of the training features.
X_train.head(10)

In [None]:
# Split the training columns by dtype: object-typed vs. int/float-typed.
object_cat_features = X_train.select_dtypes(include='object').columns.tolist()
numerical_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

# Numeric-typed columns that are nevertheless treated as categorical.
num_cat_features = ['lesion_3', 'lesion_2', 'hospital_number']

# Final groups: all categorical columns, and the numeric columns that remain.
cat_features = [*object_cat_features, *num_cat_features]
num_features = [name for name in numerical_features if name not in num_cat_features]

In [None]:
# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back for old releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the object-typed columns; every other column is passed through unchanged.
transformer = make_column_transformer(
        (one_hot_encoder, object_cat_features),
        remainder='passthrough')

transformed = transformer.fit_transform(X_train)
# get_feature_names_out() returns names containing a '__' separator; keep the part after the last one.
feature_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]
transformed_df = pd.DataFrame(transformed, columns=feature_names)
transformed_df.sample(3)

In [None]:
# Apply the transformer fitted on the training data to the test features (transform only, no refit)
# and label the result with the same feature names as the training frame.
X_test_transformed = transformer.transform(X_test)
transformed_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)
transformed_test_df.head()

In [None]:
# (rows, columns) of the encoded training frame.
transformed_df.shape

In [None]:
# # Replace 'none' placeholder values with the mode (most frequent value) of the respective column
# for col in cat_features:
#     # Calculate the mode excluding 'none' values
#     mode_value = X_train[col][X_train[col] != 'none'].mode().iloc[0]
    
#     # Replace 'none' values with the mode
#     X_train[col] = X_train[col].replace('none', mode_value)

In [None]:
def generate_comprehensive_interactive_features(df, df_features, numerical_features):
    """
    Generate pairwise interaction features between the given columns of a DataFrame.

    For every unordered pair of selected columns (taken in ``df`` column order) a new
    column named ``"<first>_<second>"`` is created:

    * both columns listed in ``numerical_features`` -> element-wise product;
    * otherwise -> both values cast to ``str`` and joined with ``'_'``.

    If two pairs produce the same name, the later pair's values replace the earlier ones.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : iterable of column labels
        Names of the columns to combine; names that are not columns of ``df`` are ignored.
    numerical_features : iterable of column labels
        Names of the columns that are combined by multiplication.

    Returns:
    --------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(df, df_interactive, combined_df)``: the input frame (same object, not modified),
        a frame holding only the interaction columns (zero columns but the same rows as
        ``df`` when fewer than two features are selected), and the two concatenated
        column-wise.
    """
    selected = set(df_features)
    numeric = set(numerical_features)

    # Keep the column order of df so the generated names are deterministic.
    features = [col for col in df.columns if col in selected]

    # Collect the new columns first and assemble the frame once: inserting thousands of
    # columns one at a time into a DataFrame fragments it and is needlessly slow.
    interaction_columns = {}
    for i, first in enumerate(features):
        for second in features[i + 1:]:
            new_feature_name = f"{first}_{second}"

            if first in numeric and second in numeric:
                # Numeric x numeric: multiply the two original features.
                interaction_columns[new_feature_name] = df[first] * df[second]
            else:
                # At least one non-numeric feature: combine the values as text.
                interaction_columns[new_feature_name] = df[first].astype(str) + '_' + df[second].astype(str)

    if interaction_columns:
        df_interactive = pd.concat(
            list(interaction_columns.values()),
            axis=1,
            keys=list(interaction_columns.keys()),
        )
    else:
        # No pairs to build: return an empty frame that still lines up with df's rows.
        df_interactive = pd.DataFrame(index=df.index)

    combined_df = pd.concat([df, df_interactive], axis=1)

    return df, df_interactive, combined_df

In [None]:
# Build all pairwise interactions of the encoded training columns. Every column is passed as
# "numerical", so each interaction is the element-wise product of two columns.
X_trains, df_interactive, X_train_complete = generate_comprehensive_interactive_features(transformed_df, transformed_df.columns, transformed_df.columns)
X_train_complete.head()

In [None]:
# Same pairwise-product interactions for the encoded test columns.
X_tests, df_test_interactive, X_test_complete = generate_comprehensive_interactive_features(transformed_test_df, transformed_test_df.columns, transformed_test_df.columns)
X_test_complete.head()

In [None]:
# Shapes of the encoded, interaction-only and combined training frames.
X_trains.shape, df_interactive.shape, X_train_complete.shape

In [None]:
def catboost_kfold_feature_importance(X_train, y_train, cat_features=None, n_splits=5, random_state=5,
                                      output_path='catboost_feature_importance.csv'):
    """
    Perform stratified K-Fold cross-validation with CatBoost and collect feature importances.

    For each fold the model is fitted on the training part with the validation part as
    ``eval_set`` (early stopping after 100 rounds), the micro-averaged F1 score on that same
    validation part is computed, and the model's feature importances are recorded.

    Args:
    - X_train: DataFrame, training features.
    - y_train: Series, training target.
    - cat_features: List of categorical feature names (default is None).
    - n_splits: Number of K-Fold splits (default is 5).
    - random_state: Random seed for reproducibility (default is 5).
    - output_path: Where to save the importance table as CSV (default is
      'catboost_feature_importance.csv'); pass None to skip writing the file.

    Returns:
    - fi_df: DataFrame with a 'Feature' column, one 'Fold_<k>' column per fold and an
      'Average' column holding the mean importance over the folds.

    Side effects:
    - Prints the per-fold micro-F1 scores with their mean and standard deviation.
    - Writes fi_df to ``output_path`` unless it is None.
    """
    # Initialize DataFrame to store feature importances
    fi_df = pd.DataFrame({'Feature': X_train.columns})

    # Initialize stratified K-Fold cross-validator
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Micro-F1 score of every fold
    fold_scores = np.zeros(n_splits)

    # Initialize CatBoost model (the same estimator object is fitted once per fold)
    model = CatBoostClassifier(random_state=random_state, cat_features=cat_features, verbose=False)

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the CatBoost model; the validation fold drives early stopping
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val, y_val), verbose=100, early_stopping_rounds=100)

        # Calculate the fold's micro-averaged F1 score
        y_pred_val = model.predict(X_val)
        fold_scores[fold] = f1_score(y_val, y_pred_val, average='micro')

        # Record feature importances for this fold
        fi_df[f'Fold_{fold + 1}'] = model.get_feature_importance()

    # Average over the fold columns (everything after the 'Feature' column)
    fi_df['Average'] = fi_df.iloc[:, 1:].mean(axis=1)

    # Surface the scores instead of silently discarding them
    print(f"Micro-F1 per fold: {np.round(fold_scores, 4)} | "
          f"mean={fold_scores.mean():.4f}, std={fold_scores.std():.4f}")

    if output_path is not None:
        fi_df.to_csv(output_path, index=False)

    return fi_df

In [None]:
def plot_catboost_cat_feature_importance(X_train, y_train, cat_features, figsize=(16, 12)):
    """
    Plot the fold-averaged CatBoost feature importances as a horizontal bar chart.

    The importances come from ``catboost_kfold_feature_importance`` and are shown in
    descending order of their 'Average' value.

    Args:
    - X_train: DataFrame, training features.
    - y_train: Series, training target.
    - cat_features: List of categorical feature names, forwarded to the importance routine.
    - figsize: Figure size passed to matplotlib (default is (16, 12)).

    Returns:
    - None; the figure is displayed with ``plt.show()``.
    """
    importance_table = catboost_kfold_feature_importance(X_train, y_train, cat_features=cat_features)
    ranked = importance_table.sort_values(by='Average', ascending=False)

    _, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=ranked['Average'], y=ranked['Feature'], ax=ax)
    ax.set_title('Features Importance (avg over folds)')

    plt.show()

In [None]:
# Estimate and plot importances for the full interaction feature set (no categorical features declared).
# The underlying routine also writes catboost_feature_importance.csv, which is read back below.
plot_catboost_cat_feature_importance(X_train_complete, y_train, cat_features=None, figsize=(32, 32))

In [None]:
# Reload the saved importance table, ranked by fold-averaged importance, and keep the
# names of the features whose average importance is at least 0.5.
feats = pd.read_csv('catboost_feature_importance.csv').sort_values(by='Average', ascending=False)
feats_needed = feats.loc[feats['Average'] >= 0.5, 'Feature'].tolist()
feats_needed

In [None]:
# Number of features that passed the importance threshold.
len(feats_needed)

In [None]:
# Initialize CatBoost model
# model = CatBoostClassifier(random_state=5, cat_features=None, verbose=5)
model = CatBoostClassifier(random_state=5, verbose=False)

# 5 stratified folds repeated 3 times, seeded for reproducibility.
num_folds = 5
cv = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=3, random_state=5)

# Cross-validated micro-averaged F1 on the selected features only, using all available cores.
# scores = cross_val_score(model, X_train.drop(dropped_columns_low_fi, axis=1), y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
scores = cross_val_score(model, X_train_complete[feats_needed], y_train, cv=cv, n_jobs=-1, scoring='f1_micro')

In [None]:
# Individual cross-validation scores.
scores

In [None]:
# Print the scores average and std for each fold
print("Scores Mean:", np.mean(scores))
print("Scores Std:", np.std(scores))

In [None]:
# Fit the model on the entire training data
model.fit(X_train_complete[feats_needed], y_train)

# Predict on the X_test data
predictions = model.predict(X_test_complete[feats_needed])

In [None]:
# Peek at the first five predictions.
predictions[:5]

In [None]:
# squeeze() drops length-1 axes — presumably predict() returned an (n, 1) array; TODO confirm
predictions_class = predictions.squeeze()

# Create a DataFrame for submission: the test ids paired with the predicted outcome
submission_df = pd.DataFrame(
    {'id': df_test['id'],
    'outcome': predictions_class})

# Save to CSV for submission; the file name embeds model_number
submission_df.to_csv(f'submission_{model_number}_catboost.csv', index=False)