In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from catboost import CatBoostClassifier
# from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import f1_score
from category_encoders import CountEncoder, TargetEncoder
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
import warnings

from sklearn.compose import make_column_transformer

# Label embedded in the file names of the submission CSVs written later on.
model_number = 'interactive_numerical_only'

# Lift pandas' display truncation for both columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test CSVs from the relative input directory.
df_train = pd.read_csv('../input/playground-series-s3e22/train.csv')
df_test = pd.read_csv('../input/playground-series-s3e22/test.csv')

# Preview the first training rows.
df_train.head()

In [None]:
dropped_columns = ['id']

df_train_model = df_train.drop(dropped_columns, axis=1)
df_test_model = df_test.drop(dropped_columns, axis=1)

# Assign train and test for X and y
X_train = df_train_model.drop('outcome', axis=1)
y_train = df_train_model['outcome']

X_test = df_test_model.copy()

In [None]:
y_train[:5]

In [None]:
TARGET = "outcome"

combined_df = pd.concat([X_train, X_test])
combined_df.shape, X_train.shape, X_test.shape

In [None]:
X_train.head(10)

In [None]:
# Split the predictors by dtype: object (string) columns are categorical, the
# numeric ones are collected separately. 'number' is used instead of
# ['int', 'float'] because the bare 'int' alias is platform dependent and can
# silently skip int64 columns.
object_cat_features = list(X_train.select_dtypes('object').columns)
numerical_features = list(X_train.select_dtypes(include='number').columns)

# Numeric columns that are treated as categories rather than quantities
num_cat_features = ['lesion_3', 'lesion_2', 'hospital_number']

# Final categorical / numerical feature name lists
cat_features = object_cat_features + num_cat_features
num_features = [feat for feat in numerical_features if feat not in num_cat_features]

In [None]:
# for col in cat_features:
#     X_train[col] = X_train[col].astype(str)

In [None]:
# One-hot encode the object columns and pass every other column through.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so prefer the new keyword and fall back only on older releases.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

transformer = make_column_transformer(
        (one_hot_encoder, object_cat_features),
        remainder='passthrough')

transformed = transformer.fit_transform(X_train)
# Drop the "<transformer>__" prefix that the column transformer puts on each name
feature_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]
# Keep the training index so the encoded frame stays aligned with y_train
transformed_df = pd.DataFrame(transformed, columns=feature_names, index=X_train.index)
transformed_df.sample(3)

In [None]:
X_test_transformed = transformer.transform(X_test)
transformed_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)
transformed_test_df.head()

In [None]:
transformed_df.shape

In [None]:
# # Replace 'none' values with the mode (most frequent value) of the respective columns
# for col in cat_features:
#     # Calculate the mode, excluding 'none' values
#     mode_value = X_train[col][X_train[col] != 'none'].mode().iloc[0]
    
#     # Replace 'none' values with the mode
#     X_train[col] = X_train[col].replace('none', mode_value)

In [None]:
def generate_polynomial_features(df, degree, df_features):
    """
    Generate pure power features (``<col>^<degree>``) for the specified columns.

    Only the per-column powers are produced; cross terms between different
    columns are not part of the output.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    degree : int
        The power each selected column is raised to. Must be at least 1.
    df_features : list
        A list of feature names to be used for generating polynomial features.
        Names that are not columns of ``df`` are ignored.

    Returns:
    --------
    tuple of pandas.DataFrame
        ``(df, poly_df, result_combined)``: the input frame unchanged, a frame
        holding only the new ``"<col>^<degree>"`` columns (float dtype, same
        index as ``df``), and ``df`` with those columns appended.

    Raises:
    -------
    ValueError
        If ``degree`` is smaller than 1.
    """
    if degree < 1:
        raise ValueError("degree must be at least 1")

    # Keep the order in which the selected features appear in the DataFrame
    features = [col for col in df.columns if col in df_features]

    # Raise the columns to the requested power directly. This avoids expanding
    # every cross term only to throw it away, and the result inherits df's
    # index, so the concat below aligns row by row even when df does not have
    # a default RangeIndex.
    poly_df = df[features].astype(float) ** degree
    poly_df.columns = [f"{col}^{degree}" for col in features]

    # Concatenate the original DataFrame and the power features column-wise
    result_combined = pd.concat([df, poly_df], axis=1)

    return df, poly_df, result_combined

In [None]:
def generate_domain_features(df, df_features):
    """
    Generate pairwise ratio features between the given columns of a DataFrame.

    For every ordered pair of distinct selected columns ``(a, b)`` a column
    named ``"a_b_ratio"`` is created holding ``a / b``. Zeros in the
    denominator are replaced by 1e-6 before dividing.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating the ratio features.
        Names that are not columns of ``df`` are ignored.

    Returns:
    --------
    tuple of pandas.DataFrame
        ``(df, df_new_features, df_combined)``: the input frame unchanged, a
        frame holding only the ratio columns (same index as ``df``), and
        ``df`` with the ratio columns appended.
    """
    # Keep the order in which the selected features appear in the DataFrame
    features = [col for col in df.columns if col in df_features]

    # The zero-guarded denominator depends only on the column, so compute it
    # once per feature instead of once per pair.
    safe_denominators = {
        col: np.where(df[col] == 0, 1e-6, df[col]) for col in features
    }

    # Collect all ratio columns first and build the frame in one go; inserting
    # columns one at a time into an empty DataFrame fragments it.
    ratio_columns = {}
    for i, numerator in enumerate(features):
        for j, denominator in enumerate(features):
            # A column is never divided by itself
            if i == j:
                continue
            ratio_columns[f"{numerator}_{denominator}_ratio"] = (
                df[numerator] / safe_denominators[denominator]
            )

    df_new_features = pd.DataFrame(ratio_columns, index=df.index)

    df_combined = pd.concat([df, df_new_features], axis=1)

    return df, df_new_features, df_combined


In [None]:
# Build the squared numeric features. `df_interactive` and `X_train_complete`
# are read by later cells and are assigned nowhere else, so this call has to
# run for the notebook to work from a fresh kernel.
X_trains, df_interactive, X_train_complete = generate_polynomial_features(X_train, 2, num_features)
X_train_complete.head()

In [None]:
# X_trains.shape, df_interactive.shape, X_train_complete.shape

In [None]:
# Training table for AutoGluon: the engineered predictors with the label appended as the last column
autogluon_data = pd.concat((X_train_complete, y_train), axis='columns')
autogluon_data.iloc[:5]

In [None]:
# Names of the object-typed columns of the engineered frame, followed by those columns themselves
interactive_cat_features = X_train_complete.select_dtypes(include='object').columns.tolist()
X_train_complete.loc[:, interactive_cat_features]

In [None]:
def catboost_kfold_feature_importance(X_train, y_train, cat_features=None, n_splits=5, random_state=5,
                                      output_path='catboost_feature_importance.csv'):
    """
    Run stratified K-Fold cross-validation with CatBoost and collect feature importances.

    Args:
    - X_train: DataFrame, training features.
    - y_train: Series, training target (indexed positionally via .iloc).
    - cat_features: List of categorical feature names (default is None).
    - n_splits: Number of K-Fold splits (default is 5).
    - random_state: Random seed for reproducibility (default is 5).
    - output_path: Path the importance table is written to as CSV (default is
      'catboost_feature_importance.csv'); pass None to skip writing.

    Returns:
    - fi_df: DataFrame with a 'Feature' column, one 'Fold_<k>' column per fold
      and an 'Average' column holding the mean importance across folds.

    Side effects:
    - Prints the mean and standard deviation of the per-fold micro-averaged F1.
    - Writes fi_df to `output_path` unless it is None.
    """
    # One row per feature; a column is added for every fold
    fi_df = pd.DataFrame({'Feature': X_train.columns})

    # Shuffled, stratified splitter
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Per-fold micro-averaged F1 scores
    fold_scores = np.zeros(n_splits)

    # The same estimator object is refitted on every fold
    model = CatBoostClassifier(random_state=random_state, cat_features=cat_features, verbose=False)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # The validation fold doubles as the early-stopping eval set, so the
        # fold score below is measured on data that also picked the stopping point.
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val, y_val), verbose=100, early_stopping_rounds=100)

        # Micro-averaged F1 on the validation fold
        fold_scores[fold] = f1_score(y_val, model.predict(X_val), average='micro')

        # Record feature importances for this fold
        fi_df[f'Fold_{fold + 1}'] = model.get_feature_importance()

    # Mean importance over all fold columns (everything after 'Feature')
    fi_df['Average'] = fi_df.iloc[:, 1:].mean(axis=1)

    # Surface the cross-validation quality instead of silently discarding it
    print(f"CV micro-F1: mean={fold_scores.mean():.4f}, std={fold_scores.std():.4f}")

    if output_path is not None:
        fi_df.to_csv(output_path, index=False)

    return fi_df

In [None]:
def plot_catboost_cat_feature_importance(X_train, y_train, cat_features, figsize=(16, 12)):
    """
    Compute fold-averaged CatBoost feature importances and draw them as a bar plot.

    The importances come from `catboost_kfold_feature_importance`; features are
    ordered by descending 'Average' importance, which is plotted on the x axis
    with the feature names on the y axis. Nothing is returned.
    """
    importance_table = catboost_kfold_feature_importance(X_train, y_train, cat_features=cat_features)
    ranked = importance_table.sort_values(by='Average', ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(x=ranked['Average'], y=ranked['Feature'])
    plt.title('Features Importance (avg over folds)')
    plt.show()

In [None]:
# Plot importances for the one-hot encoded training frame; no categorical
# columns are declared to CatBoost here (cat_features=None).
plot_catboost_cat_feature_importance(transformed_df, y_train, cat_features=None, figsize=(32, 32))

In [None]:
# Reload the importance table saved by the K-Fold helper, most important first
feats = pd.read_csv('catboost_feature_importance.csv').sort_values(by='Average', ascending=False)

# Names of the features whose average importance is at least 20
feats_needed = feats.loc[feats['Average'] >= 20, 'Feature'].to_list()
feats_needed

In [None]:
# NOTE(review): `feats` is the importance DataFrame loaded from CSV, not a list
# of column names, so indexing `df_interactive` with it is unlikely to select
# columns as intended (possibly `feats_needed` was meant) — confirm.
# `df_interactive` comes from the `generate_polynomial_features` call, which
# has to be executed before this cell.
interactive_cat_features = list(df_interactive[feats].select_dtypes('object').columns)
interactive_cat_features

In [None]:
# NOTE(review): `viz` is neither imported nor defined in the visible notebook —
# presumably a project-local plotting helper; confirm it is available before running.
viz.plot_autogluon_feature_importance(autogluon_data, TARGET, 60, figsize=(32, 32))

In [None]:
# catboost_result = pd.read_csv('catboost_feature_importance.csv')
# catboost_result = catboost_result.sort_values(by='Average', ascending=False)

# catboost_features = X_train.columns

# Recursive feature elimination with 3-fold cross-validation, scored by
# micro-averaged F1, using a CatBoost classifier as the estimator.
model = CatBoostClassifier(verbose=False, random_state=5)

rfecv = RFECV(
    estimator=model,
    cv=3,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=1,
)

rfecv.fit(df_interactive, y_train)

In [None]:
# Pair every column with its RFECV rank so the table can guide feature choice.
# Rank 1 marks the features RFECV kept; rank 2 features are the next best,
# and so on.
rfecv_features = pd.DataFrame(
    {'Feature': df_interactive.columns, 'Ranking': rfecv.ranking_}
).sort_values(by='Ranking')
rfecv_features

In [None]:
selected_features = [i for i, selected in enumerate(rfecv.support_) if selected]
selected_features

In [None]:
# Get a list of the rank 1 features
rfecv_rank_1 = list(df_interactive.columns[selected_features])
rfecv_rank_1

In [None]:
# Initialize CatBoost model
# model = CatBoostClassifier(random_state=5, cat_features=None, verbose=5)
model = CatBoostClassifier(random_state=5, verbose=False)

num_folds = 5
cv = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=3, random_state=5)

# scores = cross_val_score(model, X_train.drop(dropped_columns_low_fi, axis=1), y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
scores = cross_val_score(model, transformed_df, y_train, cv=cv, n_jobs=-1, scoring='f1_micro')

In [None]:
scores

In [None]:
# Print the scores average and std for each fold
print("Scores Mean:", np.mean(scores))
print("Scores Std:", np.std(scores))

In [None]:
# Fit the model on the entire (one-hot encoded) training data
model.fit(transformed_df, y_train)

# Predict on the X_test data, encoded with the same transformer
predictions = model.predict(transformed_test_df)

In [None]:
# Display the raw prediction array
predictions

In [None]:
# Remove the singleton axes from the prediction array
predictions_class = np.squeeze(predictions)

# Submission table: test ids next to the predicted outcome
submission_df = pd.DataFrame(dict(id=df_test['id'], outcome=predictions_class))

# Write the CatBoost submission file
submission_df.to_csv(f'submission_{model_number}_catboost.csv', index=False)

In [None]:
# Cross-validate AutoGluon: 5 stratified folds repeated 3 times, collecting the
# micro-averaged F1 reported for each held-out split.
# NOTE(review): `TabularPredictor` is only referenced by a commented-out import
# at the top of the notebook — confirm it is imported before running this cell.
# kfold_splits = md.generate_kfold(autogluon_data, y='outcome')
kfold_splits = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=5)

f1_scores = []

for fold, (train_index, test_index) in enumerate(kfold_splits.split(autogluon_data, autogluon_data['outcome'])):
    # Split the dataset into train and test sets
    train_data = autogluon_data.iloc[train_index]
    test_data = autogluon_data.iloc[test_index]

    # Print the shapes of train and test data for debugging
    print(f"Fold {fold + 1} - Train data shape: {train_data.shape}, Test data shape: {test_data.shape}")

    # A new predictor is created for every split
    predictor = TabularPredictor(problem_type="multiclass", label="outcome", eval_metric='f1_micro')

    # Each fit is capped at 60 seconds
    predictor.fit(train_data=train_data,
                presets="medium_quality",
                time_limit=60,
    )

    # Score the held-out part of this split
    performance = predictor.evaluate(test_data)

    print(f"Fold {fold + 1} - F1 Score: {performance['f1_micro']}")

    f1_scores.append(performance['f1_micro'])

# Print the mean and standard deviation of the F1 scores across all splits
print("f1 Scores Mean:", np.mean(f1_scores))
print("f1 Scores STD:", np.std(f1_scores))

In [None]:
# Initialize AutoGluon for multiclass classification, optimising micro-averaged F1
# NOTE(review): `TabularPredictor` is only referenced by a commented-out import
# at the top of the notebook — confirm it is imported before running this cell.
autogluon_model = TabularPredictor(problem_type="multiclass", label="outcome", eval_metric='f1_micro')

# Fit AutoGluon on the full training table, capped at 300 seconds
autogluon_model.fit(train_data=autogluon_data, time_limit=300)

In [None]:
# AutoGluon was fitted on `autogluon_data`, which is built from
# `X_train_complete` (assumed to be the output of
# `generate_polynomial_features(X_train, 2, num_features)` — confirm). The test
# rows therefore need the same columns, so they are rebuilt with the same
# helper instead of passing the one-hot encoded frame, whose column set
# differs from the training table.
X_test_complete = generate_polynomial_features(X_test, 2, num_features)[2]
pred_autogluon = autogluon_model.predict(X_test_complete)
pred_autogluon

In [None]:
# Submission table: test ids next to the AutoGluon predictions
testing_df = pd.DataFrame(dict(id=df_test['id'], outcome=pred_autogluon))

# testing_df.head(50)
# Write the AutoGluon submission file
testing_df.to_csv(f'submission_{model_number}_autogluon.csv', index=False)