In [7]:
from preprocessing import get_all_configurations, PREPROCESSING_CONFIGURATIONS
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from config import KAGGLE_DATA_PATH, KAGGLE_IGNORED_LABELS, KAGGLE_TARGET
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from tqdm import tqdm
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
import warnings

warnings.simplefilter(action="ignore", category=Warning)


def _generate_config_key(config):
    return " + ".join(func.__name__ for func in config)


def _prepare_data(
    datapath,
    target_label,
    columns_to_ignore=None,
    labels_to_ignore=None,
    test_size=0.2,
    random_state=42,
):
    # Read the dataset
    df = pd.read_csv(datapath)

    # Drop specified columns
    if columns_to_ignore:
        df.drop(columns=columns_to_ignore, inplace=True)

    # Drop rows with invalid categories in the target label
    if labels_to_ignore:
        df = df[~df[target_label].isin(labels_to_ignore)]

    # Check if target variable is categorical and convert to numerical if true
    if df[target_label].dtype == "object":
        le = LabelEncoder()
        df[target_label] = le.fit_transform(df[target_label])

    # Split features and target variable
    X = df.drop(columns=target_label)
    y = df[target_label]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, y_train, X_test, y_test


def _apply_techniques(config, X_train_, y_train, X_test, y_test):
    outliers_detection_technique = config[0]
    features_selection_technique = config[1]
    oversampling_technique = config[2]

    # Apply outlier removal
    X_train_, y_train = outliers_detection_technique(X_train_, y_train)

    # Apply features selection
    X_train_, y_train, X_test, y_test = features_selection_technique(
        X_train_, y_train, X_test, y_test
    )

    # Apply oversampling
    X_train_, y_train = oversampling_technique(X_train_, y_train)

    return X_train_, y_train, X_test, y_test


def get_data_for_config(config):
    X_train, y_train, X_test, y_test = _prepare_data(
        KAGGLE_DATA_PATH,
        KAGGLE_TARGET,
        labels_to_ignore=KAGGLE_IGNORED_LABELS,
        test_size=0.8,
    )
    X_train, y_train, X_test, y_test = _apply_techniques(
        config, X_train, y_train, X_test, y_test
    )
    # print(f"Size of X_train: {X_train.shape}")
    # print(f"Size of y_train: {y_train.shape}")
    # print(f"Size of X_test: {X_test.shape}")
    # print(f"Size of y_test: {y_test.shape}")
    # # Printing proportions
    # train_counts = y_train.value_counts(normalize=True)
    # test_counts = y_test.value_counts(normalize=True)

    # print(
    #     f"Proportion in y_train (positive:negative): {train_counts.get(1, 0):.2f} : {train_counts.get(0, 0):.2f}"
    # )
    # print(
    #     f"Proportion in y_test (positive:negative): {test_counts.get(1, 0):.2f} : {test_counts.get(0, 0):.2f}"
    # )
    return X_train, y_train, X_test, y_test

In [8]:
# Define the metrics
def compute_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }


# Training function
def train_classifiers(X_train, y_train, X_test, y_test, config_key):
    results = {}
    for classifier_name, classifier_info in classifiers.items():
        try:
            clf = classifier_info["model"]
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
            metrics = compute_metrics(y_test, predictions)

            results[classifier_name] = {
                "metrics": metrics,
                "data": {
                    "X_train": X_train,
                    "y_train": y_train,
                    "X_test": X_test,
                    "y_test": y_test,
                    "y_pred": predictions,
                },
            }
        except Exception as e:
            print(
                f"{classifier_name} failed to train with configuration {config_key} because: {e}"
            )
    return results


classifiers = {
    "RandomForest": {
        "model": RandomForestClassifier(),
    },
    "K-nearest-neighbor": {
        "model": KNeighborsClassifier(),
    },
    "Artificial Neural Network": {
        "model": MLPClassifier(),
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
    },
    "Logistic Regression": {
        "model": LogisticRegression(),
    },
    "Support Vector Machine": {
        "model": SVC(),  # https: //www.kaggle.com/code/sunayanagawde/ml-algorithms-usage-and-prediction?scriptVersionId=120249289&cellId=62
    },
    "Naive Bayes": {
        "model": GaussianNB(),
    },
    "XG-boost": {
        "model": XGBClassifier(),
    },
}

# Main dictionary to store results
all_results = {}

for config in tqdm(PREPROCESSING_CONFIGURATIONS):
    try:
        config_key = _generate_config_key(config)
        X_train, y_train, X_test, y_test = get_data_for_config(config)
        all_results[config_key] = train_classifiers(
            X_train, y_train, X_test, y_test, config_key
        )
    except Exception as e:
        print(f"{config_key} is invalid for this dataset because: {e}")

# Serialize results
with open(f"all_results.pkl", "wb") as f:
    pickle.dump(all_results, f)

  0%|          | 0/180 [00:00<?, ?it/s]

K-nearest-neighbor failed to train with configuration remove_outliers_none + features_selection_none + oversampling_none because: 'Flags' object has no attribute 'c_contiguous'


  1%|          | 1/180 [00:01<03:04,  1.03s/it]

K-nearest-neighbor failed to train with configuration remove_outliers_none + features_selection_none + oversampling_smote because: 'Flags' object has no attribute 'c_contiguous'


  1%|          | 2/180 [00:02<03:25,  1.15s/it]

K-nearest-neighbor failed to train with configuration remove_outliers_none + features_selection_none + oversampling_svm_smote because: 'Flags' object has no attribute 'c_contiguous'


  2%|▏         | 3/180 [00:03<03:27,  1.17s/it]

K-nearest-neighbor failed to train with configuration remove_outliers_none + features_selection_none + oversampling_adasyn because: 'Flags' object has no attribute 'c_contiguous'


  2%|▏         | 4/180 [00:04<03:28,  1.19s/it]

K-nearest-neighbor failed to train with configuration remove_outliers_none + features_selection_none + oversampling_smote_borderline because: 'Flags' object has no attribute 'c_contiguous'


  3%|▎         | 5/180 [00:05<03:29,  1.20s/it]

remove_outliers_none + features_selection_none + oversampling_nc_smote is invalid for this dataset because: SMOTE-NC is not designed to work only with numerical features. It requires some categorical features.


  6%|▌         | 10/180 [00:10<03:06,  1.10s/it]