In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from itertools import combinations, product
import warnings

from sklearn.compose import make_column_transformer

import joblib

model_number = 'final_model'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings("ignore")

In [None]:
df_train = pd.read_csv('../input/playground-series-s3e22/train.csv')
df_test = pd.read_csv('../input/playground-series-s3e22/test.csv')

df_train.head()

In [None]:
# Identifier columns removed from both frames before modelling.
dropped_columns = ['id']
# Name of the label column; used below instead of repeating the literal.
TARGET = "outcome"

df_train_model = df_train.drop(columns=dropped_columns)
df_test_model = df_test.drop(columns=dropped_columns)

# Assign train and test for X and y
X_train = df_train_model.drop(columns=[TARGET])
y_train = df_train_model[TARGET]

# Copy so that later column additions do not touch df_test_model.
X_test = df_test_model.copy()

In [None]:
y_train[:5]

In [None]:
# Partition the raw columns by dtype: object columns are categorical,
# int/float columns are numeric.
object_cat_features = X_train.select_dtypes('object').columns.tolist()
numerical_features = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

# Numerically typed columns that are nevertheless handled as categories.
num_cat_features = ['lesion_3', 'lesion_2', 'hospital_number']

cat_features = [*object_cat_features, *num_cat_features]
num_features = [name for name in numerical_features if name not in num_cat_features]
main_features = [*cat_features, *num_features]
main_features

In [None]:
# For every categorical column, record where the value was missing in a
# companion `<col>_missing` 0/1 column, then replace the missing entries with
# an explicit 'Unknown' level.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on `frame[col]` operates on an intermediate object and
# is not guaranteed to update the frame under pandas copy-on-write.
for frame in (X_train, X_test):
    for col in cat_features:
        frame[f'{col}_missing'] = frame[col].isna().astype(int)
        frame[col] = frame[col].fillna('Unknown')

X_train.shape, X_test.shape

In [None]:
X_train['abdomen'].value_counts()

In [None]:
X_train['abdomen_missing'].value_counts()

In [None]:
# Build the one-hot encoder with dense output. The keyword is `sparse_output`
# in current scikit-learn; the older `sparse` spelling is used only as a
# fallback for releases that do not know the new name.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a ColumnTransformer: encode the object columns, pass the rest through
transformer = make_column_transformer(
    (encoder, object_cat_features),
    remainder='passthrough')

# Fit and transform the data
transformed = transformer.fit_transform(X_train)

# Get the transformed feature names, dropping the "<step>__" prefix that the
# ColumnTransformer puts in front of every output column
transformed_feature_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]

# Create a DataFrame for the transformed features
transformed_df = pd.DataFrame(transformed, columns=transformed_feature_names)

# Concatenate the raw categorical columns with the transformed features;
# both sides carry a fresh 0..n-1 index so rows line up positionally
combined_df_train = pd.concat([X_train[object_cat_features].reset_index(drop=True), transformed_df], axis=1)
combined_df_train.head()

In [None]:
combined_df_train['abdomen_missing'].value_counts()

In [None]:
X_test_transformed = transformer.transform(X_test)
transformed_test_df = pd.DataFrame(X_test_transformed, columns=transformed_feature_names)

# Concatenate the transformed features DataFrame with the original DataFrame
combined_df_test = pd.concat([X_test[object_cat_features].reset_index(drop=True), transformed_test_df], axis=1)
combined_df_test.head()

In [None]:
def generate_comprehensive_interactive_features(df, df_features, numerical_features):
    """
    Generate pairwise interaction features between the given columns in a DataFrame.

    Every unordered pair of selected columns, taken in the column order of
    ``df``, produces one new column named ``"<first>_<second>"``. When both
    columns are listed in ``numerical_features`` the new column is their
    product; otherwise it is the two values cast to ``str`` and joined by "_".

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating interaction features.
    numerical_features : list
        Feature names to treat as numeric (multiplied rather than concatenated).

    Returns:
    --------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The interaction features alone, and ``df`` with the interaction
        features appended as extra columns.
    """
    # Get the list of features to create interaction terms (df column order)
    features = [col for col in df.columns if col in df_features]

    # Collect the new columns in a dict and build the frame once at the end;
    # inserting columns one by one into an empty DataFrame fragments it.
    interactions = {}
    for position, first in enumerate(features):
        for second in features[position + 1:]:
            new_feature_name = f"{first}_{second}"

            if first in numerical_features and second in numerical_features:
                # Numeric x numeric: product of the two columns
                interactions[new_feature_name] = df[first] * df[second]
            else:
                # At least one categorical: string concatenation of the values
                interactions[new_feature_name] = df[first].astype(str) + '_' + df[second].astype(str)

    df_interactive = pd.DataFrame(interactions, index=df.index)
    combined_df = pd.concat([df, df_interactive], axis=1)

    return df_interactive, combined_df

In [None]:
df_interactive, X_train_complete_interactive = generate_comprehensive_interactive_features(X_train[main_features], main_features, num_features)
df_test_interactive, X_test_complete_interactive = generate_comprehensive_interactive_features(X_test[main_features], main_features, num_features)
df_interactive.head()

In [None]:
def generate_domain_features(df, df_features):
    """
    Generate ratio features between every ordered pair of the given columns.

    For each pair of distinct selected columns ``(a, b)`` a column named
    ``"<a>_<b>_ratio"`` holding ``a / b`` is created. Zeros in the denominator
    are replaced by 1e-6 before dividing.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating ratio features.

    Returns:
    --------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The ratio features alone, and ``df`` with the ratio features appended
        as extra columns.
    """
    # Get the list of features to create ratios from (df column order)
    features = [col for col in df.columns if col in df_features]

    # Zero-safe denominators depend only on the denominator column, so they
    # are computed once per column instead of once per pair.
    safe_denominators = {col: np.where(df[col] == 0, 1e-6, df[col]) for col in features}

    # Collect all ratio columns first and build the frame in one step to
    # avoid fragmenting a DataFrame with repeated single-column inserts.
    ratios = {}
    for i, numerator in enumerate(features):
        for j, denominator in enumerate(features):
            if i == j:
                continue
            ratios[f"{numerator}_{denominator}_ratio"] = df[numerator] / safe_denominators[denominator]

    df_new_features = pd.DataFrame(ratios, index=df.index)
    df_combined = pd.concat([df, df_new_features], axis=1)

    return df_new_features, df_combined

In [None]:
df_domain, X_train_complete_domain = generate_domain_features(X_train[main_features], numerical_features)
df_test_domain, X_test_complete_domain = generate_domain_features(X_test[main_features], numerical_features)
df_domain.head()

In [None]:
def generate_polynomial_features(df, degree, df_features):
    """
    Generate pure power features for the specified columns in a DataFrame.

    For every selected column ``col`` a column named ``"<col>^<degree>"``
    holding ``col ** degree`` (as float) is created. No cross terms are
    produced: only the pure powers are returned, so they are computed directly
    rather than by expanding a full polynomial basis and discarding the rest.
    The new columns keep the index of ``df`` so the concatenation below lines
    up row by row for any index.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    degree : int
        The power to raise each selected column to.
    df_features : list
        A list of feature names to be used for generating power features.

    Returns:
    --------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The power features alone, and ``df`` with the power features appended
        as extra columns.
    """
    # Get the list of features to raise to the given power (df column order)
    features = [col for col in df.columns if col in df_features]

    # Cast to float first so integer columns yield float output
    poly_df = df[features].astype(float).pow(degree)
    poly_df.columns = [f"{col}^{degree}" for col in features]

    # Concatenate the original DataFrame and the power features (same index)
    result_combined = pd.concat([df, poly_df], axis=1)

    return poly_df, result_combined

In [None]:
df_poly_2, X_train_complete_poly_2 = generate_polynomial_features(X_train[main_features], 2, num_features)
df_test_poly_2, X_test_complete_poly_2 = generate_polynomial_features(X_test[main_features], 2, num_features)
df_poly_2.head()

In [None]:
df_poly_3, X_train_complete_poly_3 = generate_polynomial_features(X_train[main_features], 3, num_features)
df_test_poly_3, X_test_complete_poly_3 = generate_polynomial_features(X_test[main_features], 3, num_features)
df_poly_3.head()

In [None]:
df_final = pd.concat([combined_df_train, df_interactive, df_domain, df_poly_2, df_poly_3], axis=1)
df_test_final = pd.concat([combined_df_test, df_test_interactive, df_test_domain, df_test_poly_2, df_test_poly_3], axis=1)

In [None]:
df_final.head()

In [None]:
df_final.shape, df_test_final.shape

In [None]:
final_object_cat_features = list(df_final.select_dtypes('object').columns)

In [None]:
def catboost_kfold_feature_importance(X_train, y_train, cat_features=None, n_splits=5, random_state=5,
                                      output_path='catboost_feature_importance.csv'):
    """
    Perform stratified K-Fold cross-validation with CatBoost and collect feature importances.

    In every fold the model is fitted on the training part with the held-out
    part as the early-stopping eval set; the micro-averaged F1 on the held-out
    part and the model's feature importances are recorded. The per-fold and
    mean F1 are printed, and the importance table is written to ``output_path``.

    Args:
    - X_train: DataFrame, training features.
    - y_train: Series, training target.
    - cat_features: List of categorical feature names (default is None).
    - n_splits: Number of K-Fold splits (default is 5).
    - random_state: Random seed for the splitter and the model (default is 5).
    - output_path: CSV file the importance table is saved to.

    Returns:
    - fi_df: DataFrame with a 'Feature' column, one 'Fold_<k>' column per
      fold, and an 'Average' column with the mean over folds.
    """
    # Initialize DataFrame to store feature importances
    fi_df = pd.DataFrame({'Feature': X_train.columns})

    # Initialize stratified K-Fold cross-validator
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Micro-F1 score of each fold on its validation part
    fold_scores = np.zeros(n_splits)

    # Initialize CatBoost model
    model = CatBoostClassifier(random_state=random_state, cat_features=cat_features, verbose=False)

    # Perform K-Fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the CatBoost model with early stopping on the validation part
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val, y_val), verbose=100, early_stopping_rounds=100)

        # Score this fold (micro-averaged F1)
        y_pred_val = model.predict(X_val)
        fold_scores[fold] = f1_score(y_val, y_pred_val, average='micro')

        # Record feature importances for this fold
        fi_df[f'Fold_{fold + 1}'] = model.get_feature_importance()

    # Report the scores that were previously computed but never shown
    print(f'Micro-F1 per fold: {np.round(fold_scores, 4)} | mean: {fold_scores.mean():.4f}')

    # Average importance over the fold columns (everything after 'Feature')
    fi_df['Average'] = fi_df.iloc[:, 1:].mean(axis=1)

    fi_df.to_csv(output_path, index=False)

    return fi_df

In [None]:
def plot_catboost_cat_feature_importance(X_train, y_train, cat_features, figsize=(16, 12)):
    """
    Compute K-fold CatBoost feature importances and plot their fold average.

    The importance table comes from catboost_kfold_feature_importance; rows
    are ordered by descending 'Average' and drawn as a bar plot with the
    average importance on x and the feature name on y. Nothing is returned.
    """
    importance = catboost_kfold_feature_importance(X_train, y_train, cat_features=cat_features)
    importance = importance.sort_values(by='Average', ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(x=importance['Average'], y=importance['Feature'])
    plt.title('Features Importance (avg over folds)')
    plt.show()

In [None]:
plot_catboost_cat_feature_importance(df_final, y_train, cat_features=final_object_cat_features, figsize=(32, 32))

In [None]:
feats = pd.read_csv('catboost_feature_importance.csv')
feats.sort_values(by='Average', ascending=False, inplace=True)
feats.head(10)

In [None]:
# Keep only the features that every fold found useful: the importance must
# exceed the threshold in each Fold_* column and the fold average must be
# positive. The fold columns are discovered from the table, so the filter
# keeps working when the importance run used a different number of splits.
MIN_FOLD_IMPORTANCE = 0.0001
fold_columns = [col for col in feats.columns if col.startswith('Fold_')]
used_in_every_fold = (feats[fold_columns] > MIN_FOLD_IMPORTANCE).all(axis=1)
feats_in_use = feats.loc[used_in_every_fold & (feats['Average'] > 0), 'Feature'].to_list()
len(feats_in_use)

In [None]:
# # feats_in_use = feats[feats['Average'] > 0]['Feature'].to_list()
# feats_in_use = ['lesion_3_lesion_1', 
#                 'age_pain', 
#                 'age_lesion_1', 
#                 'surgery_pain',
#                 'packed_cell_volume_total_protein',
#                 'packed_cell_volume_total_protein_ratio',
#                 'surgery_surgical_lesion',
#                 'pain_lesion_3',
#                 'nasogastric_reflux_ph_total_protein_ratio',
#                 'abdomo_protein_total_protein_ratio',
#                 'lesion_3_packed_cell_volume',
#                 'abdominal_distention_abdomo_appearance',
#                 'rectal_temp_nasogastric_reflux_ph',
#                 'rectal_exam_feces_surgical_lesion',
#                 'age_packed_cell_volume',
#                 'nasogastric_reflux_ph_total_protein',
#                 'nasogastric_tube_nasogastric_reflux',
#                 'surgery_peristalsis',
#                 'peripheral_pulse_pain',
#                ]

# len(feats_in_use)

In [None]:
feats_in_use_object = list(df_final[feats_in_use].select_dtypes('object').columns)
len(feats_in_use_object)

In [None]:
num_folds = 5
cv = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=3, random_state=5)

def calculate_cv_score(features):
    """
    Cross-validated micro-F1 of a default CatBoost model on a feature subset.

    Reads `df_final`, `y_train` and the splitter `cv` from the notebook
    namespace. Object-dtype columns among `features` are passed to CatBoost as
    categorical features. Prints the number of categorical columns and the
    mean score, and returns the mean score. An empty selection returns
    negative infinity without training anything.
    """
    if not features:
        return -np.inf  # Nothing selected: worst possible score

    input_df = df_final[features]

    object_columns = input_df.select_dtypes('object').columns.tolist()
    print("The number of categorical features: ", len(object_columns))
    print()

    # CatBoost receives the categorical names, or None when there are none
    model = CatBoostClassifier(random_state=5, cat_features=object_columns or None, verbose=500)

    scores = cross_val_score(model, input_df, y_train, cv=cv, n_jobs=-1, scoring='f1_micro')
    mean_score = np.mean(scores)
    print("Scores Mean:", mean_score)
    print()
    return mean_score

In [None]:
# Initialize the list of different features
main_features = [] # The final features list
unused_features = [] # The features not to be used in the final model

In [None]:
# Greedy forward selection: try each candidate in turn on top of the features
# kept so far, and keep it only if the cross-validated score strictly improves.
best_score = -np.inf  # Any real score beats this

for feature in feats_in_use:
    main_features.append(feature)
    print(main_features)
    print()

    cv_score = calculate_cv_score(main_features)
    improved = cv_score > best_score

    if improved:
        best_score = cv_score
    else:
        # No gain: undo the tentative addition and remember the reject
        main_features.remove(feature)
        unused_features.append(feature)

    print('Current Best CV Score is: ', best_score)
    print()

In [None]:
main_features = ['lesion_3_lesion_1', 'age_pain', 'surgery_pain', 'packed_cell_volume_total_protein', 'packed_cell_volume_total_protein_ratio', 'pain_lesion_3', 'hospital_number_total_protein_ratio', 'nasogastric_reflux_ph_total_protein_ratio', 'respiratory_rate_total_protein_ratio', 'lesion_3_packed_cell_volume', 'surgery_cp_data', 'pain_abdominal_distention', 'nasogastric_reflux_ph_abdomo_protein', 'surgery_respiratory_rate', 'rectal_exam_feces_surgical_lesion', 'total_protein_pulse_ratio', 'nasogastric_reflux_ph_rectal_temp_ratio', 'rectal_exam_feces_nasogastric_reflux_ph', 'pulse_hospital_number_ratio', 'packed_cell_volume_nasogastric_reflux_ph_ratio']

In [None]:
feats_in_use_object = df_final[main_features].select_dtypes('object').columns.tolist()
print("The number of categorical features: ", len(feats_in_use_object))
print()

# CatBoost receives the categorical column names when there are any,
# otherwise None
model = CatBoostClassifier(random_state=5, cat_features=feats_in_use_object or None, verbose=500)

In [None]:
# Fit the model on the entire training data
model.fit(df_final[main_features], y_train)

In [None]:
# Predictions on the test data
predictions = model.predict(df_test_final[main_features])

In [None]:
predictions[:5]

In [None]:
predictions_class = predictions.squeeze()

# Create a DataFrame for submission
submission_df = pd.DataFrame(
    {'id': df_test['id'],
    'outcome': predictions_class})

# Save to CSV for submission
submission_df.to_csv(f'submission_{model_number}_catboost.csv', index=False)

CV Score of 0.7306 and Public Score of 0.79878

## MODEL TUNING

In [None]:
# Baseline for tuning: default CatBoost parameters scored with a single
# 5-fold stratified split (micro-F1).
feats_in_use_object = df_final[main_features].select_dtypes('object').columns.tolist()
model = CatBoostClassifier(random_state=5, cat_features=feats_in_use_object or None, verbose=500)

num_folds = 5

cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=5)
tuning_scores = cross_val_score(model, df_final[main_features], y_train, cv=cv, n_jobs=-1, scoring='f1_micro')

In [None]:
np.mean(tuning_scores)

In [None]:
# Get the default parameters
default_params = model.get_params()

# Print the default parameters
print(default_params)

In [None]:
def get_score(param1, param2):
    """
    Mean micro-F1 over 5 stratified folds for one (learning_rate, depth) pair.

    `param1` is passed to CatBoost as `learning_rate` and `param2` as `depth`.
    The model is evaluated on `df_final[main_features]` against `y_train`,
    all read from the notebook namespace; object-dtype columns are treated as
    categorical features.
    """
    train_frame = df_final[main_features]
    object_columns = train_frame.select_dtypes('object').columns.tolist()

    model = CatBoostClassifier(
        random_state=5,
        learning_rate=param1,
        depth=param2,
        cat_features=object_columns or None,  # None when nothing is categorical
        verbose=500,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    fold_scores = cross_val_score(model, train_frame, y_train, cv=folds, n_jobs=-1, scoring='f1_micro')
    return np.mean(fold_scores)

In [None]:
def get_score_single(param1):
    """
    Mean micro-F1 over 5 stratified folds for one tree-count setting.

    `param1` is passed to CatBoost as `num_trees`. The model is evaluated on
    `df_final[main_features]` against `y_train`, all read from the notebook
    namespace; object-dtype columns are treated as categorical features.
    """
    train_frame = df_final[main_features]
    object_columns = train_frame.select_dtypes('object').columns.tolist()

    model = CatBoostClassifier(
        random_state=5,
        num_trees=param1,
        cat_features=object_columns or None,  # None when nothing is categorical
        verbose=500,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    fold_scores = cross_val_score(model, train_frame, y_train, cv=folds, n_jobs=-1, scoring='f1_micro')
    return np.mean(fold_scores)

In [None]:
main_features

In [None]:
# Define hyperparameters
learning_rate = [10, 100, 1000]
# depth = [6, 7, 8]

# Initialize an empty dictionary to store the results
results = {}

In [None]:
results = {rate:get_score_single(rate) for rate in learning_rate}

In [None]:
results

In [None]:
plt.plot(list(results.keys()), list(results.values()))
plt.show()

In [None]:
# Grid search over learning rate and tree depth with get_score.
# The grid is declared here because no live code above defines `depth` (its
# assignment is commented out) and the `learning_rate` list above holds the
# tree counts used for the get_score_single sweep, not learning rates.
# NOTE(review): 0.01 and depth 6 are taken from the `results[(0.01, 6)]`
# lookup in the last cell and the commented-out depth list; the remaining
# learning-rate values are placeholders -- confirm the intended grid.
learning_rate_grid = [0.01, 0.03, 0.1]
depth_grid = [6, 7, 8]

# Start from an empty dict so the earlier single-parameter sweep results are
# not mixed into the grid-search plot below.
results = {}

for param1 in learning_rate_grid:
    for param2 in depth_grid:
        score = get_score(param1, param2)
        print(f'Hyperparameter 1: {param1}, Hyperparameter 2: {param2}, Score: {score}')
        results[(param1, param2)] = score

In [None]:
# Plot the results
plt.figure(figsize=(12, 12))
param_combinations = list(results.keys())
scores = list(results.values())
plt.plot([str(combination) for combination in param_combinations], scores)
# plt.plot(param_combinations, scores)
plt.xticks(rotation=45)
plt.xlabel('Hyperparameter Combinations')
plt.ylabel('F1 Micro Score')
plt.show()

In [None]:
results[(0.01, 6)]