In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import joblib

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, 
                             recall_score, 
                             precision_score, 
                             f1_score, 
                             roc_auc_score, 
                             average_precision_score,
                             get_scorer)
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature
import mlflow.sklearn
from sklearn.metrics import get_scorer
from typing import Optional

### Setup logging

In [2]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
logger.info("Starting the model training process...")

2025-06-20 15:24:49,454 [INFO] Starting the model training process...


### Setup MLflow server

*Run the following command using the CLI*
Example:  
```bash
uv run --prerelease=allow mlflow server \
    --host 127.0.0.1 \
    --port 9999 \
    --backend-store-uri src/experiments/db/sqlite:///mlflow.db \
    --default-artifact-root src/experiments/artifacts
```

In [3]:
mlflow.set_tracking_uri("http://localhost:9999")

### Custom functions

In [4]:
# Functions
# Create a python function to gather model metrics and save them to a file
def save_model_metrics(y_true, y_pred, y_proba, model_name):
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_proba[:, 1]),
        'average_precision': average_precision_score(y_true, y_proba[:, 1])
    }
    logger.info(f"Model metrics for {model_name}: {metrics}")
    return metrics

# Save the model metrics to a file and log to mlflow
def save_metrics_to_file(metrics, model_name):
    with open(f"{model_name}_metrics.txt", "w") as f:
        for key, value in metrics.items():
            f.write(f"{key}: {value}\n")
            # Only log numeric values to mlflow
            if isinstance(value, (int, float, np.floating)):
                mlflow.log_metric(f"{model_name}_{key}", float(value))
            elif isinstance(value, tuple) and len(value) == 3 and isinstance(value[0], (int, float)) and isinstance(value[2], (int, float)):
                mlflow.log_metric(f"{model_name}_{key}_mean", float(value[0]))
                mlflow.log_metric(f"{model_name}_{key}_std", float(value[2]))
    logger.info(f"Model metrics saved to {model_name}_metrics.txt and logged to mlflow")

# From MLflow documentation
def get_or_create_experiment(experiment_name):
  """
  Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

  This function checks if an experiment with the given name exists within MLflow.
  If it does, the function returns its ID. If not, it creates a new experiment
  with the provided name and returns its ID.

  Parameters:
  - experiment_name (str): Name of the MLflow experiment.

  Returns:
  - str: ID of the existing or newly created MLflow experiment.
  """

  if experiment := mlflow.get_experiment_by_name(experiment_name):
      return experiment.experiment_id
  else:
      return mlflow.create_experiment(experiment_name)

def evaluate_model(model, out_prefix, X_train, y_train, X_test, y_test, X_val = None, y_val = None, folds = None):
    # Initialize the StratifiedKFold object
    if folds:
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=99)

    # Perform cross-validation on the training set
    y_pred = cross_val_predict(model, X_train, y_train, cv=skf if folds else None)
    y_proba = model.predict_proba(X_train)
    # Calculate the metrics
    metrics = {
        'accuracy': accuracy_score(y_train, y_pred),
        'recall': recall_score(y_train, y_pred),
        'precision': precision_score(y_train, y_pred),
        'f1': f1_score(y_train, y_pred),
        'roc_auc': roc_auc_score(y_train, y_proba[:, 1]),
        'average_precision': average_precision_score(y_train, y_proba[:, 1])
    }
    # Save the metrics to a file and log to mlflow
    save_metrics_to_file(metrics, f"{out_prefix}_cv")

    # Evaluate the model on the test set
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)
    # Calculate the metrics
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred),
        'f1': f1_score(y_test, y_test_pred),
        'roc_auc': roc_auc_score(y_test, y_test_proba[:, 1]),
        'average_precision': average_precision_score(y_test, y_test_proba[:, 1])
    }
    # Save the metrics to a file and log to mlflow
    save_metrics_to_file(test_metrics, f"{out_prefix}_test")

    # Evaluate the model on the validation set
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)
    # Calculate the metrics
    val_metrics = {
        'accuracy': accuracy_score(y_val, y_val_pred),
        'recall': recall_score(y_val, y_val_pred),
        'precision': precision_score(y_val, y_val_pred),
        'f1': f1_score(y_val, y_val_pred),
        'roc_auc': roc_auc_score(y_val, y_val_proba[:, 1]),
        'average_precision': average_precision_score(y_val, y_val_proba[:, 1])
    }
    # Save the metrics to a file and log to mlflow
    save_metrics_to_file(val_metrics, f"{out_prefix}_val")

    # Calculate the average metrics and standard deviation
    avg_metrics = {
        'accuracy': np.mean([metrics['accuracy'], test_metrics['accuracy'], val_metrics['accuracy']]).astype(float),
        'recall': np.mean([metrics['recall'], test_metrics['recall'], val_metrics['recall']]).astype(float),
        'precision': np.mean([metrics['precision'], test_metrics['precision'], val_metrics['precision']]).astype(float),
        'f1': np.mean([metrics['f1'], test_metrics['f1'], val_metrics['f1']]).astype(float),
        'roc_auc': np.mean([metrics['roc_auc'], test_metrics['roc_auc'], val_metrics['roc_auc']]).astype(float),
        'average_precision': np.mean([metrics['average_precision'], test_metrics['average_precision'], val_metrics['average_precision']]).astype(float)
    }
    std_metrics = {
        'accuracy': np.std([metrics['accuracy'], test_metrics['accuracy'], val_metrics['accuracy']]).astype(float),
        'recall': np.std([metrics['recall'], test_metrics['recall'], val_metrics['recall']]).astype(float),
        'precision': np.std([metrics['precision'], test_metrics['precision'], val_metrics['precision']]).astype(float),
        'f1': np.std([metrics['f1'], test_metrics['f1'], val_metrics['f1']]).astype(float),
        'roc_auc': np.std([metrics['roc_auc'], test_metrics['roc_auc'], val_metrics['roc_auc']]).astype(float),
        'average_precision': np.std([metrics['average_precision'], test_metrics['average_precision'], val_metrics['average_precision']]).astype(float)
    }

    final_metrics = {
        'accuracy': (avg_metrics['accuracy'], '±', std_metrics['accuracy']),
        'recall': (avg_metrics['recall'], '±', std_metrics['recall']),
        'precision': (avg_metrics['precision'], '±', std_metrics['precision']),
        'f1': (avg_metrics['f1'], '±', std_metrics['f1']),
        'roc_auc': (avg_metrics['roc_auc'], '±', std_metrics['roc_auc']),
        'average_precision': (avg_metrics['average_precision'], '±', std_metrics['average_precision'])
    }
    # Save the final metrics to a file and log to mlflow
    save_metrics_to_file(final_metrics, f"{out_prefix}_final")

    return metrics, test_metrics, val_metrics, final_metrics

def train_rf_classifier(
    X_train, y_train,
    X_test, y_test,
    hyperparams: dict,
    model_name: str,
    X_val: Optional = None,
    y_val: Optional = None
):
    """
    Train a RandomForestClassifier and log everything to MLflow.

    Parameters:
    - X_train, y_train: Training data
    - X_test, y_test: Test data
    - X_val, y_val: Optional validation data
    - hyperparams: Dictionary of RandomForest hyperparameters
    - experiment_description: Description to log with the experiment
    - scorer: Metric name to optimize (e.g., 'precision', 'recall', 'f1', 'accuracy')
    """

    # Initialize model
    model = RandomForestClassifier(**hyperparams)

    # Fit model
    model.fit(X_train, y_train)

    # Log hyperparameters
    mlflow.log_params(hyperparams)

    cv_metrics, test_metrics, val_metrics, final_metrics = evaluate_model(model, model_name, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

    # Log the model signature
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        model,
        artifact_path=model_name,
        signature=signature
    )

    return model, cv_metrics, test_metrics, val_metrics, final_metrics

def train_xgb_classifier(
    X_train, y_train,
    X_test, y_test,
    hyperparams: dict,
    model_name: str,
    X_val: Optional = None,
    y_val: Optional = None
):
    """
    Train an XGBoost classifier and log everything to MLflow.

    Parameters:
    - X_train, y_train: Training data
    - X_test, y_test: Test data
    - X_val, y_val: Optional validation data
    - hyperparams: Dictionary of XGBoost hyperparameters
    - experiment_description: Description to log with the experiment
    - scorer: Metric name to optimize (e.g., 'precision', 'recall', 'f1', 'accuracy')
    """

    # Create a DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Initialize model
    model = XGBClassifier(**hyperparams)

    # Fit model
    model.fit(dtrain)

    # Log hyperparameters
    mlflow.log_params(hyperparams)

    cv_metrics, test_metrics, val_metrics, final_metrics = evaluate_model(model, model_name, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

    # Log the model signature
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        model,
        artifact_path=model_name,
        signature=signature
    )

    return model, cv_metrics, test_metrics, val_metrics, final_metrics

In [5]:
# Load positive samples -- lncRNAs
df_pos = pd.read_parquet('/home/chlab/flync/src/data/training_data_ncr_dense.parquet')


# Load negative samples -- protein coding genes
df_neg = pd.read_parquet('/home/chlab/flync/src/data/training_data_pcg_dense.parquet')

# Target column (Binary classification)
label = 'y'

In [6]:
# Drop columns that are not features
cols_not_features = ['chromosome', 'start', 'end', 'transcript_id', 'exon_number', 'gene_name', 'sequence', 'ss_structure', 'cpat_orf_len']

logging.info(f"Columns not used as features: {cols_not_features}")
df_pos = df_pos.drop(columns=cols_not_features)
df_neg = df_neg.drop(columns=cols_not_features)
init_pos_records = df_pos.shape[0]
init_neg_records = df_neg.shape[0]

2025-06-20 15:24:52,427 [INFO] Columns not used as features: ['chromosome', 'start', 'end', 'transcript_id', 'exon_number', 'gene_name', 'sequence', 'ss_structure', 'cpat_orf_len']


In [7]:
# Check if dfs can be merged
if df_pos.shape[1] != df_neg.shape[1]:
    logger.error("DataFrames have different number of columns after dropping non-feature columns.")
    raise ValueError("DataFrames have different number of columns after dropping non-feature columns.")
# Check if the columns are the same
if not all(df_pos.columns == df_neg.columns):
    logger.error("DataFrames have different columns after dropping non-feature columns.")
    raise ValueError("DataFrames have different columns after dropping non-feature columns.")

df = pd.concat([df_pos, df_neg], ignore_index=True)
del df_pos, df_neg
logger.info(f"DataFrames merged. Shape: {df.shape}")
logger.info(f"DataFrame columns: {df.columns.tolist()}")
logger.info(f"DataFrame dtypes: {df.dtypes}")

2025-06-20 15:24:55,918 [INFO] DataFrames merged. Shape: (103902, 8215)
2025-06-20 15:24:55,920 [INFO] DataFrame columns: ['length', 'mean_gc', 'cov_tfbs', 'max_tfbs', 'cov_remap', 'max_remap', 'cov_tss_plus', 'max_tss_plus', 'sum_tss_plus', 'cov_tss_minus', 'min_tss_minus', 'sum_tss_minus', 'cov_s2_pol2', 'sum_s2_pol2', 'max_s2_pol2', 'cov_h3k4me3', 'mean_h3k4me3', 'sum_h3k4me3', 'cov_epdnew', 'max_epdnew', 'mean_pcons27', 'std_pcons27', 'sum_pcons27', 'mean_phylocons124', 'std__phylocons124', 'sum__phylocons124', 'cpat_cod_prob', 'cpat_fickett_score', 'cpat_hexamer_score', 'ss_mfe', '000', '0000', '00000', '000000', '0000000', '00000000', '000000000', '0000000000', '00000000000', '000000000000', '000000000001', '00000000001', '000000000010', '000000000011', '0000000001', '00000000010', '000000000100', '000000000101', '00000000011', '000000000110', '000000000111', '000000001', '0000000010', '00000000100', '000000001000', '000000001001', '00000000101', '000000001010', '000000001011', '

In [8]:
# Remove duplicates
df = df.drop_duplicates()
logger.info(f"Duplicates removed. Shape: {df.shape}")

# Percentage of records dropped due to duplicates
duplicates_percentage = (1 - df.shape[0] / (init_pos_records + init_neg_records)) * 100
logger.info(f"Percentage of records dropped due to duplicates: {duplicates_percentage:.2f}%")

# Count NAs per each column
na_counts = df.isna().sum()
na_counts = na_counts[na_counts > 0]
if not na_counts.empty:
    logger.info(f"Columns with NAs: \n{na_counts}")
else:
    logger.info("No NAs found in the DataFrame.")

2025-06-20 15:25:03,669 [INFO] Duplicates removed. Shape: (103902, 8215)
2025-06-20 15:25:03,670 [INFO] Percentage of records dropped due to duplicates: 0.00%
2025-06-20 15:25:04,019 [INFO] Columns with NAs: 
cov_tss_plus           1498
max_tss_plus           1498
sum_tss_plus           1802
cov_tss_minus          1487
min_tss_minus          1487
sum_tss_minus          1744
cov_s2_pol2            8012
sum_s2_pol2            8120
max_s2_pol2            8012
cov_h3k4me3            9026
mean_h3k4me3           9026
sum_h3k4me3           11899
cov_epdnew              151
max_epdnew              151
mean_pcons27            181
std_pcons27             181
sum_pcons27             229
mean_phylocons124        20
std__phylocons124        22
sum__phylocons124        39
cpat_cod_prob          7250
cpat_fickett_score     7250
cpat_hexamer_score     7250
ss_mfe              20554
dtype: int64


In [9]:
# Logic for data cleaning:
# 1. If columns are UCSC (BigWig or BigBed) statistics features or CPAT scores (no ORFs found), fill NAs with 0
logic_op1 = (df.columns.str.startswith(tuple(['min_', 'max_', 'mean_', 'std_', 'sum_', 'cov_'])) | df.columns.str.startswith('cpat_'))
for col in df.columns:
    if col in df.columns[logic_op1]:
        if df[col].isna().sum() > 0:
            logger.info(f"Filling NAs in column {col} with 0")
            df[col] = df[col].fillna(0)

# 2. If columns are ss_* features, drop rows with NAs. This seems to be the best approach as no structure was calculated for the sequence
logic_op2 = df.columns.str.startswith('ss_')
for col in df.columns:
    if col in df.columns[logic_op2]:
        if df[col].isna().sum() > 0:
            logger.info(f"Dropping rows with NAs in column {col}")
            df = df.dropna(subset=[col])
            logger.info(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")
        # Drop values where `ss_mfe` is not < 0
        if col == 'ss_mfe':
            df = df[df[col] < 0]
            logger.info(f"Rows with ss_mfe >= 0 dropped. Shape: {df.shape}")

# 3. If columns start with '0' or '1', drop rows with NAs. This seems to be the best approach as we have no counts for all required k-mers
logic_op3 = df.columns.str.startswith(('0', '1'))
for col in df.columns:
    if col in df.columns[logic_op3]:
        if df[col].isna().sum() > 0:
            logger.info(f"Dropping rows with NAs in column {col}")
            df = df.dropna(subset=[col])
            logger.info(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")

2025-06-20 15:25:04,029 [INFO] Filling NAs in column cov_tss_plus with 0
2025-06-20 15:25:04,031 [INFO] Filling NAs in column max_tss_plus with 0
2025-06-20 15:25:04,032 [INFO] Filling NAs in column sum_tss_plus with 0
2025-06-20 15:25:04,034 [INFO] Filling NAs in column cov_tss_minus with 0
2025-06-20 15:25:04,035 [INFO] Filling NAs in column min_tss_minus with 0
2025-06-20 15:25:04,036 [INFO] Filling NAs in column sum_tss_minus with 0
2025-06-20 15:25:04,037 [INFO] Filling NAs in column cov_s2_pol2 with 0
2025-06-20 15:25:04,039 [INFO] Filling NAs in column sum_s2_pol2 with 0
2025-06-20 15:25:04,040 [INFO] Filling NAs in column max_s2_pol2 with 0
2025-06-20 15:25:04,041 [INFO] Filling NAs in column cov_h3k4me3 with 0
2025-06-20 15:25:04,042 [INFO] Filling NAs in column mean_h3k4me3 with 0
2025-06-20 15:25:04,043 [INFO] Filling NAs in column sum_h3k4me3 with 0
2025-06-20 15:25:04,044 [INFO] Filling NAs in column cov_epdnew with 0
2025-06-20 15:25:04,046 [INFO] Filling NAs in column ma

In [10]:
# Compare the current postive and negative instances with the ones before data cleaning
final_pos_records = df[df['y'] == 1].shape[0]
final_neg_records = df[df['y'] == 0].shape[0]
logger.info(f"Positive records before data cleaning: {init_pos_records}, after: {final_pos_records}")
logger.info(f"Negative records before data cleaning: {init_neg_records}, after: {final_neg_records}")
# Log percentage of positive and negative records lost
pos_records_lost = init_pos_records - final_pos_records
neg_records_lost = init_neg_records - final_neg_records
if init_pos_records > 0:
    logger.info(f"Percentage of positive records lost: {pos_records_lost / init_pos_records * 100:.2f}%")
if init_neg_records > 0:
    logger.info(f"Percentage of negative records lost: {neg_records_lost / init_neg_records * 100:.2f}%")

2025-06-20 15:25:13,621 [INFO] Positive records before data cleaning: 5455, after: 5388
2025-06-20 15:25:13,622 [INFO] Negative records before data cleaning: 98447, after: 77631
2025-06-20 15:25:13,623 [INFO] Percentage of positive records lost: 1.23%
2025-06-20 15:25:13,623 [INFO] Percentage of negative records lost: 21.14%


In [11]:
X = df.drop(columns=[label])
y = df[label]

In [12]:
# Train-test-validation split
# 70% train, 15% validation, 15% test

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=99)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=99)

logger.info(f"Train set shape: {X_train.shape}, {y_train.shape}")
logger.info(f"Validation set shape: {X_val.shape}, {y_val.shape}")
logger.info(f"Test set shape: {X_test.shape}, {y_test.shape}")
# Check if the target variable is balanced
logger.info(f"Train set target variable distribution: {y_train.value_counts(normalize=True)}")
logger.info(f"Validation set target variable distribution: {y_val.value_counts(normalize=True)}")
logger.info(f"Test set target variable distribution: {y_test.value_counts(normalize=True)}")

2025-06-20 15:25:19,060 [INFO] Train set shape: (58113, 8214), (58113,)
2025-06-20 15:25:19,061 [INFO] Validation set shape: (12453, 8214), (12453,)
2025-06-20 15:25:19,062 [INFO] Test set shape: (12453, 8214), (12453,)
2025-06-20 15:25:19,063 [INFO] Train set target variable distribution: y
False    0.935092
True     0.064908
Name: proportion, dtype: float64
2025-06-20 15:25:19,064 [INFO] Validation set target variable distribution: y
False    0.935116
True     0.064884
Name: proportion, dtype: float64
2025-06-20 15:25:19,065 [INFO] Test set target variable distribution: y
False    0.935116
True     0.064884
Name: proportion, dtype: float64


### Hyperparameter tunning

In [None]:
with mlflow.start_run(experiment_id=get_or_create_experiment("rf_model")):

    # Preform a Grid Search for hyperparameters of Random Forest
    from sklearn.model_selection import GridSearchCV
    param_grid = {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [2, 5, 10, 20, 50],
        'min_samples_split': [2, 3, 5, 10],
        'max_leaf_nodes': [None, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', None],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        'oob_score': [True, False],
        'ccp_alpha': [0.0, 0.01, 0.1],
    }
    # Initialize the model
    rf = RandomForestClassifier(random_state=99, n_jobs=16)
    # Initialize the GridSearchCV object
    grid_search = GridSearchCV(
        rf,
        param_grid,
        scoring='precision',
        cv=5,
        n_jobs=16,
        verbose=2
    )
    # Fit the model
    grid_search.fit(X_train, y_train)
    # Get the best parameters
    best_params = grid_search.best_params_
    logger.info(f"Best parameters: {best_params}")
    # Get the best model
    best_model = grid_search.best_estimator_

    # Log to mlflow
    mlflow.log_params(best_params)
    mlflow.sklearn.log_model(
        best_model,
        artifact_path="rf_model",
        signature=infer_signature(X_train, best_model.predict(X_train))
    )
    # Evaluate the model
    cv_metrics, test_metrics, val_metrics, final_metrics = evaluate_model(
        best_model,
        "rf_model",
        X_train,
        y_train,
        X_test,
        y_test,
        X_val,
        y_val
    )

Fitting 5 folds for each of 122880 candidates, totalling 614400 fits
🏃 View run puzzled-stork-984 at: http://localhost:9999/#/experiments/1/runs/b2ed6623a6464c529057fc627891b627
🧪 View experiment at: http://localhost:9999/#/experiments/1


# Evaluate which model algorithms should be considered for further testing

In [None]:
# from lazypredict.Supervised import LazyClassifier

# # Split the data into Train, Test and Validation sets
# # 70% for training, 15% for testing, 15% for validation
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
# logger.info(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}, Validation set shape: {X_val.shape}")

In [None]:
# df_lz = LazyClassifier(predictions=True)
# # Fit the model
# models = df_lz.fit(X_train, X_test, y_train, y_test)
# # Get the best model
# best_model = models[0].iloc[0]
# logger.info(f"Best model: {best_model['Model']}")

# Imbalanced training set
Since there are a much less positive instances (lncRNAs) than negative instances (protein coding) we can
1. Oversample the minority class -- lncRNAs
2. Undersample the majority class -- protein coding
3. Use imbalanced-resistant machine-learning algorithms (Random Forests)

## 1. Use SMOTE to oversample lncRNA instances

In [None]:
# Perform oversampling of the minority class
smote = SMOTE(random_state=99)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
logger.info(f"Resampled dataset shape: {X_smote.shape}, {y_smote.shape}")
# Split the data into training and testing sets

In [None]:
# Train a Random Forest classifier
output_prefix = 'rf_model_smote'

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    n_jobs=16
)
rf.fit(X_smote, y_smote)
logger.info("Random Forest classifier trained.")
# Save the model
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

evaluate_model(rf, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

In [None]:
output_prefix = 'xgb_model_smote'

# train an XGBoost classifier
from xgboost import XGBClassifier
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    n_jobs=16
)

xgb.fit(X_smote, y_smote)
logger.info("XGBoost classifier trained.")
# Save the model
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

In [None]:
evaluate_model(xgb, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

## 2. Oversampling and Undersample

In [None]:
# Use SMOTETomek to balance the dataset

smote_tomek = SMOTETomek(random_state=99)
X_smote_tomek, y_smote_tomek = smote_tomek.fit_resample(X_train, y_train)
logger.info(f"Resampled dataset shape: {X_smote_tomek.shape}, {y_smote_tomek.shape}")


In [None]:
output_prefix = 'rf_model_smote_tomek'
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    n_jobs=16
)
rf.fit(X_smote_tomek, y_smote_tomek)
logger.info("Random Forest classifier trained.")
# Save the model
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

In [None]:
evaluate_model(rf, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

In [None]:
output_prefix = 'xgb_model_smote_tomek'
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    n_jobs=16
)
xgb.fit(X_smote_tomek, y_smote_tomek)
logger.info("XGBoost classifier trained.")
# Save the model
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

In [None]:
evaluate_model(rf, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

## 3. Do not change the data and use class wieghts to train the model

In [None]:
output_prefix = 'rf_model'

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    class_weight='balanced',
    n_jobs=16
)
rf.fit(X_train, y_train)
logger.info("Random Forest classifier trained.")
# Save the model
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

In [None]:
evaluate_model(rf, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)

In [None]:
output_prefix = 'xgb_model'

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
logger.info(f"Scale pos weight: {scale_pos_weight}")

xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    scale_pos_weight=scale_pos_weight,
    n_jobs=16
)
xgb.fit(X_train, y_train)
logger.info("XGBoost classifier trained.")
# Save the model
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

In [None]:
evaluate_model(xgb, output_prefix, X_train, y_train, X_test, y_test, X_val, y_val, folds=5)