In [None]:
import itertools

import numpy as np
import pandas as pd

In [None]:
# Load the feature table from disk and preview its first rows.
# Column layout matters downstream: k_fold_cv reads column index 1 as the
# target and every column from index 2 onward as a feature.
df = pd.read_csv(filepath_or_buffer="final_features.csv")
df.head(n=5)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

def avg_and_std(scores):
    """Summarize a collection of scores as a ``(mean, standard deviation)`` pair.

    Args:
        scores: Sequence or array of numeric scores.

    Returns:
        A 2-tuple ``(np.mean(scores), np.std(scores))``, both computed with
        NumPy's default arguments.
    """
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    return mean_score, std_score

def _metric_name(metric_func):
    """Return the dictionary key under which a metric's scores are stored.

    Plain functions are keyed by their ``__name__``. Callables that lack that
    attribute (e.g. ``functools.partial`` objects) fall back to the name of the
    wrapped ``.func`` callable, and finally to the callable's type name.
    """
    name = getattr(metric_func, "__name__", None)
    if name is None:
        wrapped = getattr(metric_func, "func", None)
        name = getattr(wrapped, "__name__", None) or type(metric_func).__name__
    return name


# Function for k-fold cross-validation of a probabilistic classifier
def k_fold_cv(model, df, metric_funcs, n_splits=5):
    """Evaluate ``model`` with shuffled k-fold cross-validation.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
            refit in place on every fold, so after the call it holds the fit
            from the last fold.
        df: DataFrame whose column at index 1 is the target and whose columns
            from index 2 onward are the features. Column 0 is not used
            (presumably an identifier -- confirm against the CSV).
        metric_funcs: Iterable of callables ``metric(y_true, y_score)``. Each is
            given the values from column 1 of ``predict_proba`` (assumed to be
            the positive-class probability), so score-based metrics such as
            ``roc_auc_score`` are the intended use.
        n_splits: Number of folds.

    Returns:
        Dict mapping each metric's name to the list of its per-fold scores, in
        fold order. Metrics that resolve to the same name share one list.
    """
    X = df.iloc[:, 2:].values  # Features
    y = df.iloc[:, 1].values   # Target

    # Fixed seed: every call sees the same folds, so different models are
    # compared on identical splits.
    # NOTE(review): folds are not stratified; consider StratifiedKFold if the
    # target is imbalanced.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Materialize once so a generator argument survives the per-fold loop.
    metric_funcs = list(metric_funcs)
    metric_names = [_metric_name(metric_func) for metric_func in metric_funcs]
    scores = {name: [] for name in metric_names}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train model
        model.fit(X_train, y_train)

        # Get predictions (second predict_proba column)
        y_pred = model.predict_proba(X_test)[:, 1]

        # Calculate and store each metric
        for name, metric_func in zip(metric_names, metric_funcs):
            scores[name].append(metric_func(y_test, y_pred))

    return scores

In [None]:
# Exhaustive grid search over random-forest hyper-parameters.
# Each combination is scored by its mean ROC AUC over the folds produced by
# k_fold_cv; the combination with the highest mean is kept in best_params.
best_auc = 0
best_params = {}

# Seed for the forest itself. Without it every fit draws different bootstrap
# samples / feature subsets, so AUC differences between combinations would be
# partly noise and the search would not be reproducible.
RF_RANDOM_STATE = 42

# "log_loss" is intentionally not searched: scikit-learn documents it as the
# same Shannon information gain criterion as "entropy", so it would only
# repeat the "entropy" results (and it is rejected by scikit-learn < 1.1).
criteria = ["gini", "entropy"]
n_estimators_grid = range(25, 201, 25)      # 25, 50, ..., 200
max_depth_grid = range(5, 106, 10)          # 5, 15, ..., 105
min_samples_leaf_grid = range(5, 26, 5)     # 5, 10, ..., 25

# Parameter combination test (last grid varies fastest, as with nested loops)
for crit, n_est, m_depth, m_samples_leaf in itertools.product(
    criteria, n_estimators_grid, max_depth_grid, min_samples_leaf_grid
):
    params = {
        'n_estimators': n_est,
        'max_depth': m_depth,
        'min_samples_leaf': m_samples_leaf,
        'criterion': crit
    }

    # Initialize model with current parameters. The seed and n_jobs are kept
    # out of `params` so best_params lists only the tuned values; n_jobs=-1
    # builds trees on all cores and does not change the fitted model.
    rf_model = RandomForestClassifier(
        **params, random_state=RF_RANDOM_STATE, n_jobs=-1
    )

    # Perform cross-validation and get AUC scores
    score = k_fold_cv(model=rf_model, df=df, metric_funcs=[roc_auc_score])
    avg_auc, std_auc = avg_and_std(np.array(score["roc_auc_score"]))

    # Update best parameters if new best AUC is found (strict ">" keeps the
    # earliest combination on ties)
    if avg_auc > best_auc:
        best_auc = avg_auc
        best_params = params
        print("New best parameter combination found!")
        for parameter, value in best_params.items():
            print(f"\t{parameter}: {value}")
        print(f"Best Average AUC: {best_auc * 100:.6f}%\n")
    else:
        print("Parameter combination tested:")
        for parameter, value in params.items():
            print(f"\t{parameter}: {value}")
        print(f"Average AUC: {avg_auc * 100:.6f}%\n")

In [None]:
# Report the winning hyper-parameters and their mean cross-validated AUC.
# The report is assembled line by line and emitted with a single print call.
summary_lines = ["Best parameter combination:"]
summary_lines.extend(
    f"\t{parameter}: {value}" for parameter, value in best_params.items()
)
summary_lines.append(f"Best Average AUC: {best_auc * 100:.2f}%")
print("\n".join(summary_lines))