In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from itertools import combinations, permutations
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the train and test CSV files (paths are relative to the working directory)

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Display the training column names as the cell output
df_train.columns

Index(['id', 'gravity', 'ph', 'osmo', 'cond', 'urea', 'calc', 'target'], dtype='object')

In [3]:
# Define train and test columns for feature generation.
# These are DataFrames (not lists): the generate_* helpers only test
# `col in df_features`, which for a DataFrame checks its column labels.

train_features = df_train.drop(["id", "target"], axis=1)
test_features = df_test.drop("id", axis=1)

In [4]:
def generate_interactive_features(df, df_features):
    """
    Add pairwise product (interaction) features to a DataFrame.

    For every unordered pair of selected columns ``(a, b)``, taken in the
    order the columns appear in ``df``, a column named ``"a_b"`` holding
    ``df[a] * df[b]`` is added.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features. It is modified
        in place (the new columns are written onto it).
    df_features : container of column names
        Anything supporting ``name in df_features`` — e.g. a list of names,
        or a DataFrame (membership is then tested against its columns).
        Only columns of ``df`` found in it are combined.

    Returns:
    --------
    pandas.DataFrame
        The same ``df`` object, with the interaction columns added.
    """
    # Selected columns, kept in df's own column order
    features = [col for col in df.columns if col in df_features]

    # combinations() yields (i, j) pairs with i < j in the same order as the
    # equivalent nested index loops, so column order is stable.
    for left, right in combinations(features, 2):
        df[f"{left}_{right}"] = df[left] * df[right]

    return df

In [5]:
def generate_domain_features(df, df_features):
    """
    Add pairwise ratio features to a DataFrame.

    For every ordered pair of distinct selected columns ``(a, b)`` a column
    named ``"a_b_ratio"`` holding ``df[a] / df[b]`` is added. Wherever the
    denominator is exactly 0 it is replaced by ``1e-6`` to avoid division
    by zero.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features. It is modified
        in place (the new columns are written onto it).
    df_features : container of column names
        Anything supporting ``name in df_features`` — e.g. a list of names,
        or a DataFrame (membership is then tested against its columns).
        Only columns of ``df`` found in it are combined.

    Returns:
    --------
    pandas.DataFrame
        The same ``df`` object, with the ratio columns added.
    """
    # Selected columns, kept in df's own column order
    features = [col for col in df.columns if col in df_features]

    # The zero-guarded denominator depends only on the column, so build it
    # once per column instead of once per (numerator, denominator) pair.
    safe_denominators = {
        col: np.where(df[col] == 0, 1e-6, df[col]) for col in features
    }

    # permutations() yields ordered pairs of distinct columns in the same
    # order as nested index loops that skip i == j.
    for numerator, denominator in permutations(features, 2):
        df[f"{numerator}_{denominator}_ratio"] = (
            df[numerator] / safe_denominators[denominator]
        )

    return df

In [6]:
def generate_polynomial_features(df, degree, df_features):
    """
    Append the pure ``degree``-th power of each selected column to a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features. It is not modified.
    degree : int
        The power to raise each selected column to. The new columns are
        looked up by the name ``"<col>^<degree>"``; scikit-learn names
        first-degree terms without a ``^1`` suffix, so ``degree`` is expected
        to be 2 or greater.
    df_features : container of column names
        Anything supporting ``name in df_features`` — e.g. a list of names,
        or a DataFrame (membership is then tested against its columns).
        Only columns of ``df`` found in it are expanded.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame: ``df`` plus one ``"<col>^<degree>"`` column per
        selected feature.
    """
    # Selected columns, kept in df's own column order
    features = [col for col in df.columns if col in df_features]

    # interaction_only=False makes PolynomialFeatures emit every monomial up to
    # `degree` (cross terms included); only the pure powers are kept below.
    poly = PolynomialFeatures(degree, interaction_only=False, include_bias=False)

    # Fit and transform the selected features in the DataFrame
    poly_features = poly.fit_transform(df[features])

    # Get the feature names for the generated polynomial features
    poly_features_names = poly.get_feature_names_out(features)

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df is
    # indexed by anything other than 0..n-1.
    poly_df = pd.DataFrame(
        poly_features, columns=poly_features_names, index=df.index
    )

    # Keep only the pure powers of the requested degree
    poly_df = poly_df[[f"{col}^{degree}" for col in features]]

    # Concatenate the original DataFrame and the polynomial features DataFrame
    result = pd.concat([df, poly_df], axis=1)

    return result

In [7]:
# Generate additional features for the training dataset.
# Each step receives the previous step's frame as its first argument:
#   1. pairwise interaction products
#   2. squared terms (degree 2)
#   3. cubed terms (degree 3)
#   4. pairwise ratios
# The original columns named in 'train_features' drive every step.

df_train = (
    df_train
    .pipe(generate_interactive_features, train_features)
    .pipe(generate_polynomial_features, 2, train_features)
    .pipe(generate_polynomial_features, 3, train_features)
    .pipe(generate_domain_features, train_features)
)

In [8]:
# Generate additional features for the test dataset.
# Each step receives the previous step's frame as its first argument:
#   1. pairwise interaction products
#   2. squared terms (degree 2)
#   3. cubed terms (degree 3)
#   4. pairwise ratios
# The original columns named in 'test_features' drive every step.

df_test = (
    df_test
    .pipe(generate_interactive_features, test_features)
    .pipe(generate_polynomial_features, 2, test_features)
    .pipe(generate_polynomial_features, 3, test_features)
    .pipe(generate_domain_features, test_features)
)

In [9]:
# One-hot encode "gravity" once on the stacked train + test rows, so both
# splits end up with the same dummy columns, then split the rows back apart.
# NOTE(review): "gravity" is encoded here after the interaction, polynomial and
# ratio features were already built from it — confirm that treating it as
# categorical at this point is intended.

# Stack both frames, tagging every row with the split it came from
df_combined = pd.concat(
    [df_train.assign(dataset="train"), df_test.assign(dataset="test")],
    axis=0,
    ignore_index=True,
)

# One-hot encode the combined data
df_encoded = pd.get_dummies(df_combined, columns=["gravity"])

# Split the rows back out; test rows also lose the (all-missing) target column
is_train = df_encoded["dataset"] == "train"
df_train = df_encoded[is_train].drop(columns=["dataset"])
df_test = df_encoded[~is_train].drop(columns=["dataset", "target"])

In [10]:
# Split the training frame into model inputs and label, and strip the id
# column from the test frame so it has the same input columns.

y_train = df_train["target"]
X_train = df_train.drop(columns=["id", "target"])

X_test = df_test.drop(columns=["id"])

In [11]:
# Perform cross-validation and calculate the AUC ROC

def evaluate_model(model, X, y, n_splits=5):
    """
    Cross-validate a classifier and report its ROC AUC on every fold.

    Parameters:
    -----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. The same
        instance is refitted on every fold, so on return it holds the fit
        from the last fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    n_splits : int, default 5
        Number of folds for the shuffled ``KFold`` (``random_state=5``).

    Returns:
    --------
    list of float
        One ROC AUC per fold, each rounded to 5 decimal places.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=5)

    def score_fold(fit_rows, holdout_rows):
        # Fit on the training rows of this fold
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Column 1 of predict_proba is the probability of the positive class
        positive_proba = model.predict_proba(X.iloc[holdout_rows])[:, 1]

        return roc_auc_score(y.iloc[holdout_rows], positive_proba)

    return [
        round(score_fold(fit_rows, holdout_rows), 5)
        for fit_rows, holdout_rows in splitter.split(X)
    ]

In [12]:
# Initialize the baseline models with their default hyperparameters.
# Every model that is given a seed uses random_state=5; KNN is created
# without one. CatBoost gets silent=True (presumably to suppress its
# training log).

models = {
    "LightGBM": lgb.LGBMClassifier(random_state=5),
    "xGBoost": xgb.XGBClassifier(random_state=5),
    "CatBoost": CatBoostClassifier(silent=True, random_state=5),
    "RandomForest": RandomForestClassifier(random_state=5),
    "KNN": KNeighborsClassifier(),
}

In [13]:
# 5-fold cross-validation report for every baseline model
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train, n_splits=5)

    # Mean and spread are taken over the per-fold scores as returned
    # (already rounded to 5 decimals by evaluate_model)
    fold_scores = np.asarray(auc_roc_scores)
    mean_roc_auc = fold_scores.mean()
    std = fold_scores.std()

    report = (
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",
    )
    for line in report:
        print(line)

Model: LightGBM
AUC ROC Scores: [0.7695, 0.80566, 0.74882, 0.82726, 0.70588]
Average AUC ROC: 0.77142
Std Deviation: 0.04266

Model: xGBoost
AUC ROC Scores: [0.74586, 0.78302, 0.74054, 0.80024, 0.65257]
Average AUC ROC: 0.74445
Std Deviation: 0.05111

Model: CatBoost
AUC ROC Scores: [0.77128, 0.78994, 0.72813, 0.82902, 0.74449]
Average AUC ROC: 0.77257
Std Deviation: 0.03536

Model: RandomForest
AUC ROC Scores: [0.78783, 0.81635, 0.71661, 0.82286, 0.75]
Average AUC ROC: 0.77873
Std Deviation: 0.04032

Model: KNN
AUC ROC Scores: [0.61082, 0.69245, 0.57979, 0.66246, 0.53523]
Average AUC ROC: 0.61615
Std Deviation: 0.05633



### Observations for 5 splits (mean AUC ROC across folds)
1. Baseline LightGBM: mean AUC ROC 0.77142, std dev 0.04266
2. Baseline xGBoost: mean AUC ROC 0.74445, std dev 0.05111
3. Baseline CatBoost: mean AUC ROC 0.77257, std dev 0.03536
4. Baseline RandomForest: mean AUC ROC 0.77873, std dev 0.04032
5. Baseline KNN: mean AUC ROC 0.61615, std dev 0.05633

Run time ~ 46 seconds

In [14]:
# 10-fold cross-validation report for every baseline model
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train, n_splits=10)

    # Mean and spread are taken over the per-fold scores as returned
    # (already rounded to 5 decimals by evaluate_model)
    fold_scores = np.asarray(auc_roc_scores)
    mean_roc_auc = fold_scores.mean()
    std = fold_scores.std()

    report = (
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",
    )
    for line in report:
        print(line)

Model: LightGBM
AUC ROC Scores: [0.64583, 0.75231, 0.77222, 0.83765, 0.72381, 0.7275, 0.76961, 0.85, 0.68873, 0.73775]
Average AUC ROC: 0.75054
Std Deviation: 0.05875

Model: xGBoost
AUC ROC Scores: [0.59954, 0.78009, 0.77222, 0.82118, 0.75476, 0.7275, 0.7549, 0.85238, 0.71324, 0.73039]
Average AUC ROC: 0.75062
Std Deviation: 0.06471

Model: CatBoost
AUC ROC Scores: [0.65046, 0.83565, 0.76944, 0.85647, 0.7, 0.685, 0.80392, 0.87143, 0.75735, 0.79902]
Average AUC ROC: 0.77287
Std Deviation: 0.07111

Model: RandomForest
AUC ROC Scores: [0.61343, 0.875, 0.78889, 0.88235, 0.69762, 0.66, 0.84804, 0.86786, 0.7598, 0.76961]
Average AUC ROC: 0.77626
Std Deviation: 0.09013

Model: KNN
AUC ROC Scores: [0.51042, 0.6794, 0.63333, 0.79882, 0.64286, 0.54125, 0.7598, 0.7119, 0.48775, 0.61275]
Average AUC ROC: 0.63783
Std Deviation: 0.09837



### Observations for 10 splits (mean AUC ROC across folds)
1. Baseline LightGBM: mean AUC ROC 0.75054, std dev 0.05875
2. Baseline xGBoost: mean AUC ROC 0.75062, std dev 0.06471
3. Baseline CatBoost: mean AUC ROC 0.77287, std dev 0.07111
4. Baseline RandomForest: mean AUC ROC 0.77626, std dev 0.09013
5. Baseline KNN: mean AUC ROC 0.63783, std dev 0.09837

Run time ~ 2 minutes

In [15]:
# Fit a fresh Random Forest (same seed as the baseline) on the full training set
rf = RandomForestClassifier(random_state=5)
rf.fit(X_train, y_train)

In [16]:
# Predict the final submission with Random Forest

# Column 1 of predict_proba is the probability of the positive class.
# The probabilities are written at full precision: the models above are
# compared by ROC AUC, which depends only on how predictions rank, and
# rounding to one decimal would collapse them into a handful of tied values.
y_final = rf.predict_proba(X_test)[:, 1]
df_test["target"] = y_final

df_test[["id", "target"]].to_csv("submissionrf.csv", index=False)

In [17]:
# Fit a fresh CatBoost model (same settings as the baseline) on the full training set
cat = CatBoostClassifier(silent=True, random_state=5)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x2afe289c700>

In [18]:
# Predict the final submission with CatBoost

# Column 1 of predict_proba is the probability of the positive class.
# The probabilities are written at full precision: the models above are
# compared by ROC AUC, which depends only on how predictions rank, and
# rounding to one decimal would collapse them into a handful of tied values.
y_final = cat.predict_proba(X_test)[:, 1]
df_test["target"] = y_final

df_test[["id", "target"]].to_csv("submissioncat.csv", index=False)