### Load data

In [6]:
import pandas as pd

# 1. Load dataset: read the CSV once, then separate the target column from the features
df = pd.read_csv('/teamspace/studios/this_studio/checkpoints/embedding_df/id44-100_img-normalized=False.csv')

# Target is the 'label' column; every remaining column is used as a feature
y = df['label']
X = df.drop(columns='label')

### Metric evaluation

In [7]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

def evaluate_model(
    model,
    features=None,
    labels=None,
    n_splits: int = 5,
) -> None:
    """Cross-validate ``model`` with stratified k-fold and print the mean metrics.

    For every fold the model is re-fitted on the training part and scored on
    the held-out part with macro/micro precision, recall and F1 plus balanced
    accuracy. The per-fold scores are averaged and printed; nothing is returned.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so after the call it holds the fit from the
            last fold.
        features: Feature table supporting ``.iloc`` row indexing. Defaults
            to the notebook-level ``X``.
        labels: Target values supporting ``.iloc`` indexing. Defaults to the
            notebook-level ``y``.
        n_splits: Number of stratified folds (default 5).
    """
    # Fall back to the notebook globals so existing `evaluate_model(model)` calls keep working.
    features = X if features is None else features
    labels = y if labels is None else labels

    # Set up Stratified K-Fold cross-validation (no shuffling, so folds are deterministic)
    kf = StratifiedKFold(n_splits=n_splits)

    # Metric name -> (scoring function, keyword arguments).
    # zero_division=0 yields the same value as the default "warn" mode but
    # without emitting an UndefinedMetricWarning for every fold in which some
    # class is never predicted.
    metric_specs = {
        "precision_macro": (precision_score, {"average": "macro", "zero_division": 0}),
        "precision_micro": (precision_score, {"average": "micro", "zero_division": 0}),
        "recall_macro": (recall_score, {"average": "macro", "zero_division": 0}),
        "recall_micro": (recall_score, {"average": "micro", "zero_division": 0}),
        "f1_macro": (f1_score, {"average": "macro", "zero_division": 0}),
        "f1_micro": (f1_score, {"average": "micro", "zero_division": 0}),
        "balanced_accuracy": (balanced_accuracy_score, {}),
    }
    fold_scores = {name: [] for name in metric_specs}

    # Cross-validation loop; `total` lets tqdm render a real progress bar
    # because kf.split() is a generator without a length.
    for train_index, test_index in tqdm(kf.split(features, labels), total=kf.get_n_splits()):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

        # NOTE: if preprocessing (e.g. StandardScaler) is added, fit it on
        # X_train only and apply it to X_test, so no test-fold statistics
        # leak into training.

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test fold
        y_pred = model.predict(X_test)

        # Calculate and store every metric for this fold
        for name, (score_fn, score_kwargs) in metric_specs.items():
            fold_scores[name].append(score_fn(y_test, y_pred, **score_kwargs))

    # Calculate and print the average metrics across all folds
    print("Average Metrics Across All Folds:")
    print('\n', "="*50, '\n')
    print("Precision (Macro):", np.mean(fold_scores["precision_macro"]))
    print("Recall (Macro):", np.mean(fold_scores["recall_macro"]))
    print("F1-Score (Macro):", np.mean(fold_scores["f1_macro"]))
    print('\n', "="*50, '\n')
    print("Precision (Micro):", np.mean(fold_scores["precision_micro"]))
    print("Recall (Micro):", np.mean(fold_scores["recall_micro"]))
    print("F1-Score (Micro):", np.mean(fold_scores["f1_micro"]))
    print('\n', "="*50, '\n')
    print("Balanced Accuracy:", np.mean(fold_scores["balanced_accuracy"]))


### Model development

In [8]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ------ hyperparams -----
# classifier: gradient-boosted trees, one keyword per line so individual
# settings are easy to tweak and diff
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=9,
    min_child_weight=6,
    subsample=0.45,
    n_jobs=1,
    verbosity=0,
)
# ------ hyperparams -----

In [9]:
# Cross-validate the classifier defined in the previous cell; evaluate_model
# prints the fold-averaged metrics and returns nothing.
evaluate_model(model)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
5it [00:55, 11.18s/it]

Average Metrics Across All Folds:


Precision (Macro): 0.4009651441911177
Recall (Macro): 0.37757839390185083
F1-Score (Macro): 0.3814255389978015


Precision (Micro): 0.6482832976658232
Recall (Micro): 0.6482832976658232
F1-Score (Micro): 0.6482832976658232


Balanced Accuracy: 0.37757839390185083





# AutoML

## TPOT

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, make_scorer, f1_score

# 1. Load dataset
df = pd.read_csv('/teamspace/studios/this_studio/checkpoints/embedding_df/id44-100_img-normalized=False.csv')

X = df.drop('label', axis=1)
y = df['label']

# Split the data: stratified 80/20 hold-out with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define a custom scoring function (e.g., F1 macro)
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Set up TPOT with parallelism and scoring for F1 macro
tpot = TPOTClassifier(
    generations=5,             # Number of generations to run
    population_size=20,         # Number of pipelines to try per generation
    scoring=f1_macro_scorer,    # Optimize for F1 macro score
    cv=5,                       # 5-fold cross-validation
    n_jobs=-1,                  # Use all available CPU cores
    random_state=42,
    verbosity=2                 # Verbosity level for tracking progress
)

# Fit TPOT to the training data. The search is run exactly once: fitting a
# second time would repeat the whole search, and the pipeline exported
# afterwards would no longer be the one whose test score is printed below.
tpot.fit(X_train, y_train)

# Make predictions on the test set and evaluate performance
y_pred = tpot.predict(X_test)
print("Test F1 Macro Score:", f1_score(y_test, y_pred, average='macro'))

# Export the best pipeline (the same one that was just evaluated)
tpot.export('best_model_pipeline.py')

## PyCaret

In [21]:
from pycaret.classification import *

ImportError: cannot import name 'threadpool_info' from 'sklearn.utils.fixes' (/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/utils/fixes.py)

In [17]:
from pycaret.classification import *

# Initialize the PyCaret classification experiment on the full `df` table,
# using its 'label' column as the target.
# NOTE(review): as recorded in this notebook, the pycaret import above raises
# an ImportError (`threadpool_info` missing from sklearn.utils.fixes), so
# `setup` is never reached — presumably a pycaret/scikit-learn version
# mismatch; confirm and pin compatible versions before relying on this cell.
clf = setup(
    data=df,
    target='label',  # Name of the column that holds the class labels
    session_id=123,               # For reproducibility
    normalize=True,               # Optional: normalize features
    feature_selection=True        # Optional: perform feature selection
)


ImportError: cannot import name 'threadpool_info' from 'sklearn.utils.fixes' (/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/utils/fixes.py)