In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import joblib

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

In [2]:
# Notebook-wide logging: timestamped records at INFO level and above.
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO)

# Logger shared by every cell and helper function below.
logger = logging.getLogger(__name__)
logger.info("Starting the model training process...")

2025-05-21 17:00:33,716 [INFO] Starting the model training process...


In [3]:
# Functions
# Create a python function to gather model metrics and save them to a file
def save_model_metrics(y_true, y_pred, y_proba, model_name):
    """Compute classification metrics for one set of predictions and log them.

    Note: despite the name, this function does not write anything to disk; it
    only logs the metrics and returns them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : 2-D array
        Per-class scores; column 1 is used for the ranking metrics
        (presumably the positive class -- confirm against the estimator).
    model_name : str
        Name used in the log message.

    Returns
    -------
    dict
        Keys: accuracy, recall, precision, f1, roc_auc, average_precision.
    """
    column_one_scores = y_proba[:, 1]

    # Metrics computed from hard labels, in the order they are reported.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1', f1_score),
    )
    metrics = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}

    # Metrics computed from the continuous scores.
    metrics['roc_auc'] = roc_auc_score(y_true, column_one_scores)
    metrics['average_precision'] = average_precision_score(y_true, column_one_scores)

    logger.info(f"Model metrics for {model_name}: {metrics}")
    return metrics
# Save the model metrics to a file
def save_metrics_to_file(metrics, model_name):
    """Write metrics to ``<model_name>_metrics.txt`` as one ``key: value`` line each.

    The file is created in the current working directory and overwritten if it
    already exists.

    Parameters
    ----------
    metrics : dict
        Mapping of metric name to value; each value is rendered with ``str()``
        formatting via an f-string.
    model_name : str
        Prefix of the output file name.
    """
    # Explicit UTF-8: values may contain non-ASCII text (evaluate_model stores
    # '±' in its summary tuples), and the platform default encoding is not
    # guaranteed to be able to encode it.
    with open(f"{model_name}_metrics.txt", "w", encoding="utf-8") as f:
        for key, value in metrics.items():
            f.write(f"{key}: {value}\n")
    logger.info(f"Model metrics saved to {model_name}_metrics.txt")

def evaluate_model(model, out_prefix, X_train, y_train, X_test, y_test, X_val = None, y_val = None, folds = None):
    """Score a classifier with cross-validation, on the test set and optionally on a validation set.

    Metric files are written through ``save_metrics_to_file``:
    ``<out_prefix>_cv``, ``<out_prefix>_test``, ``<out_prefix>_val`` (only when
    a validation set is supplied) and ``<out_prefix>_final`` (mean ± std of each
    metric across the evaluated splits).

    Parameters
    ----------
    model : estimator
        Classifier exposing ``predict`` and ``predict_proba``. It must already
        be fitted, because it is used as-is on the test/validation sets;
        ``cross_val_predict`` fits its own per-fold copies for the CV metrics.
    out_prefix : str
        Prefix for the metric file names.
    X_train, y_train
        Data used for cross-validation.
    X_test, y_test
        Held-out test data.
    X_val, y_val : optional
        Held-out validation data. When both are ``None`` the validation step is
        skipped and the final summary covers CV and test only.
    folds : int, optional
        Number of stratified, shuffled folds (``random_state=99``). When falsy,
        ``cross_val_predict`` uses its default ``cv``.

    Raises
    ------
    ValueError
        If only one of ``X_val`` / ``y_val`` is provided.

    Notes
    -----
    NOTE(review): when ``X_train`` has already been resampled (e.g. SMOTE),
    synthetic rows in one fold are presumably derived from rows in other folds,
    so the CV metrics are likely optimistic -- consider resampling inside each
    fold instead.
    """
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together.")

    def _binary_metrics(y_true, y_pred, y_proba):
        # Single definition of the metric set shared by every split.
        column_one_scores = y_proba[:, 1]
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, column_one_scores),
            'average_precision': average_precision_score(y_true, column_one_scores)
        }

    def _holdout_metrics(X, y):
        # Score the already-fitted model on data it was not trained on.
        return _binary_metrics(y, model.predict(X), model.predict_proba(X))

    # Initialize the StratifiedKFold object (None -> cross_val_predict default)
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=99) if folds else None

    # Out-of-fold probabilities. Previously the probabilities came from
    # model.predict_proba(X_train), i.e. the fitted model scoring its own
    # training data, which inflated the CV roc_auc / average_precision.
    cv_proba = cross_val_predict(model, X_train, y_train, cv=skf, method='predict_proba')
    # cross_val_predict orders probability columns by sorted class label, so the
    # arg-max column maps back to a label through np.unique. Labels and
    # probabilities therefore come from the same per-fold models.
    class_labels = np.unique(y_train)
    cv_pred = class_labels[np.argmax(cv_proba, axis=1)]

    metrics = _binary_metrics(y_train, cv_pred, cv_proba)
    save_metrics_to_file(metrics, f"{out_prefix}_cv")

    # Evaluate the model on the test set
    test_metrics = _holdout_metrics(X_test, y_test)
    save_metrics_to_file(test_metrics, f"{out_prefix}_test")

    per_split_metrics = [metrics, test_metrics]

    # Evaluate the model on the validation set (optional)
    if X_val is not None:
        val_metrics = _holdout_metrics(X_val, y_val)
        save_metrics_to_file(val_metrics, f"{out_prefix}_val")
        per_split_metrics.append(val_metrics)

    # Mean and standard deviation of every metric across the evaluated splits
    final_metrics = {}
    for metric_name in metrics:
        values = [split[metric_name] for split in per_split_metrics]
        final_metrics[metric_name] = (np.mean(values), '±', np.std(values))
    # Save the final metrics to a file
    save_metrics_to_file(final_metrics, f"{out_prefix}_final")

In [4]:
# Feature tables, read in this order:
#   positive samples -- lncRNAs
#   negative samples -- protein coding genes
df_pos, df_neg = map(
    pd.read_parquet,
    (
        '/home/chlab/flync/src/data/ncr_dim_redux.parquet',
        '/home/chlab/flync/src/data/pcg_dim_redux.parquet',
    ),
)

# Name of the binary-classification target column
label = 'y'

In [5]:
# Drop columns that are not features
cols_not_features = ['chromosome', 'start', 'end', 'transcript_id', 'exon_number', 'gene_name', 'sequence', 'vrna_structure', 'cpat_orf_len']

# Use the notebook's `logger` (not the root `logging` module) like every other cell.
logger.info(f"Columns not used as features: {cols_not_features}")
df_pos = df_pos.drop(columns=cols_not_features)
df_neg = df_neg.drop(columns=cols_not_features)
# Class sizes before any cleaning; later cells use them to report how many records were lost.
init_pos_records = df_pos.shape[0]
init_neg_records = df_neg.shape[0]

2025-05-21 17:00:36,399 [INFO] Columns not used as features: ['chromosome', 'start', 'end', 'transcript_id', 'exon_number', 'gene_name', 'sequence', 'vrna_structure', 'cpat_orf_len']


In [6]:
# Sanity checks before stacking the two classes into a single frame.
# 1) Same number of columns (checked first: the element-wise comparison below
#    is only meaningful for equal-length column indexes).
if len(df_pos.columns) != len(df_neg.columns):
    logger.error("DataFrames have different number of columns after dropping non-feature columns.")
    raise ValueError("DataFrames have different number of columns after dropping non-feature columns.")
# 2) Same column names in the same positions.
if not (df_pos.columns == df_neg.columns).all():
    logger.error("DataFrames have different columns after dropping non-feature columns.")
    raise ValueError("DataFrames have different columns after dropping non-feature columns.")

# Stack positives on top of negatives with a fresh 0..n-1 index.
df = pd.concat((df_pos, df_neg), ignore_index=True)
# The per-class frames are no longer needed.
del df_pos
del df_neg
logger.info(f"DataFrames merged. Shape: {df.shape}")
logger.info(f"DataFrame columns: {df.columns.tolist()}")
logger.info(f"DataFrame dtypes: {df.dtypes}")

2025-05-21 17:00:36,543 [INFO] DataFrames merged. Shape: (103902, 41)
2025-05-21 17:00:36,544 [INFO] DataFrame columns: ['length', 'mean_gc', 'cov_tfbs', 'max_tfbs', 'cov_remap', 'max_remap', 'cov_tss_plus', 'max_tss_plus', 'sum_tss_plus', 'cov_tss_minus', 'min_tss_minus', 'sum_tss_minus', 'cov_s2_pol2', 'sum_s2_pol2', 'max_s2_pol2', 'cov_h3k4me3', 'mean_h3k4me3', 'sum_h3k4me3', 'cov_epdnew', 'max_epdnew', 'mean_pcons27', 'std_pcons27', 'sum_pcons27', 'mean_phylocons124', 'std__phylocons124', 'sum__phylocons124', 'cpat_cod_prob', 'cpat_fickett_score', 'cpat_hexamer_score', 'vrna_mfe', '3mer_SVD1', '4mer_SVD1', '5mer_SVD1', '6mer_SVD1', '7mer_SVD1', '8mer_SVD1', '9mer_SVD1', '10mer_SVD1', '11mer_SVD1', '12mer_SVD1', 'y']
2025-05-21 17:00:36,547 [INFO] DataFrame dtypes: length                  int64
mean_gc               float64
cov_tfbs              float64
max_tfbs              float64
cov_remap             float64
max_remap             float64
cov_tss_plus          float64
max_tss_plu

In [7]:
# Drop exact duplicate rows
df = df.drop_duplicates()
logger.info(f"Duplicates removed. Shape: {df.shape}")

# Share of the original records (both classes) removed as duplicates
duplicates_percentage = 100 * (1 - len(df) / (init_pos_records + init_neg_records))
logger.info(f"Percentage of records dropped due to duplicates: {duplicates_percentage:.2f}%")

# Missing values per column, keeping only the columns that have at least one
na_counts = df.isna().sum().loc[lambda counts: counts > 0]
if na_counts.empty:
    logger.info("No NAs found in the DataFrame.")
else:
    logger.info(f"Columns with NAs: \n{na_counts}")

2025-05-21 17:00:36,959 [INFO] Duplicates removed. Shape: (103902, 41)
2025-05-21 17:00:36,961 [INFO] Percentage of records dropped due to duplicates: 0.00%
2025-05-21 17:00:36,980 [INFO] Columns with NAs: 
cov_tss_plus           1498
max_tss_plus           1498
sum_tss_plus           1802
cov_tss_minus          1487
min_tss_minus          1487
sum_tss_minus          1744
cov_s2_pol2            8012
sum_s2_pol2            8120
max_s2_pol2            8012
cov_h3k4me3            9026
mean_h3k4me3           9026
sum_h3k4me3           11899
cov_epdnew              151
max_epdnew              151
mean_pcons27            181
std_pcons27             181
sum_pcons27             229
mean_phylocons124        20
std__phylocons124        22
sum__phylocons124        39
cpat_cod_prob          7250
cpat_fickett_score     7250
cpat_hexamer_score     7250
vrna_mfe              19447
dtype: int64


In [8]:
# Missing-value policy, applied per feature family.

# 1. UCSC (BigWig or BigBed) statistics features and CPAT scores (no ORFs found):
#    fill NAs with 0.
logic_op1 = df.columns.str.startswith(('min_', 'max_', 'mean_', 'std_', 'sum_', 'cov_', 'cpat_'))
for col in df.columns[logic_op1]:
    if df[col].isna().any():
        logger.info(f"Filling NAs in column {col} with 0")
        df[col] = df[col].fillna(0)

# 2. vrna_* features: drop rows with NAs, since no structure was calculated for
#    the sequence.
logic_op2 = df.columns.str.startswith('vrna_')
for col in df.columns[logic_op2]:
    if df[col].isna().any():
        logger.info(f"Dropping rows with NAs in column {col}")
        df = df.dropna(subset=[col])
        logger.info(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")
    # Keep only rows whose `vrna_mfe` is strictly negative
    if col == 'vrna_mfe':
        df = df[df[col] < 0]
        logger.info(f"Rows with vrna_mfe >= 0 dropped. Shape: {df.shape}")

# 3. k-mer SVD components (column names containing 'SVD'): drop rows with NAs,
#    since counts are not available for all required k-mers.
logic_op3 = df.columns.str.contains('SVD')
for col in df.columns[logic_op3]:
    if df[col].isna().any():
        logger.info(f"Dropping rows with NAs in column {col}")
        df = df.dropna(subset=[col])
        logger.info(f"Rows with NAs in column {col} dropped. Shape: {df.shape}")

2025-05-21 17:00:37,007 [INFO] Filling NAs in column cov_tss_plus with 0
2025-05-21 17:00:37,011 [INFO] Filling NAs in column max_tss_plus with 0
2025-05-21 17:00:37,017 [INFO] Filling NAs in column sum_tss_plus with 0
2025-05-21 17:00:37,022 [INFO] Filling NAs in column cov_tss_minus with 0
2025-05-21 17:00:37,027 [INFO] Filling NAs in column min_tss_minus with 0
2025-05-21 17:00:37,033 [INFO] Filling NAs in column sum_tss_minus with 0
2025-05-21 17:00:37,040 [INFO] Filling NAs in column cov_s2_pol2 with 0
2025-05-21 17:00:37,045 [INFO] Filling NAs in column sum_s2_pol2 with 0
2025-05-21 17:00:37,049 [INFO] Filling NAs in column max_s2_pol2 with 0
2025-05-21 17:00:37,052 [INFO] Filling NAs in column cov_h3k4me3 with 0
2025-05-21 17:00:37,056 [INFO] Filling NAs in column mean_h3k4me3 with 0
2025-05-21 17:00:37,061 [INFO] Filling NAs in column sum_h3k4me3 with 0
2025-05-21 17:00:37,067 [INFO] Filling NAs in column cov_epdnew with 0
2025-05-21 17:00:37,073 [INFO] Filling NAs in column ma

In [9]:
# Compare the current positive and negative counts with the ones before data cleaning
final_pos_records = int((df['y'] == 1).sum())
final_neg_records = int((df['y'] == 0).sum())
logger.info(f"Positive records before data cleaning: {init_pos_records}, after: {final_pos_records}")
logger.info(f"Negative records before data cleaning: {init_neg_records}, after: {final_neg_records}")

# Absolute losses per class
pos_records_lost = init_pos_records - final_pos_records
neg_records_lost = init_neg_records - final_neg_records

# Relative losses; skipped for a class that started empty to avoid dividing by zero
if init_pos_records > 0:
    logger.info(f"Percentage of positive records lost: {pos_records_lost / init_pos_records * 100:.2f}%")
if init_neg_records > 0:
    logger.info(f"Percentage of negative records lost: {neg_records_lost / init_neg_records * 100:.2f}%")

2025-05-21 17:00:37,256 [INFO] Positive records before data cleaning: 5455, after: 5388
2025-05-21 17:00:37,258 [INFO] Negative records before data cleaning: 98447, after: 77631
2025-05-21 17:00:37,259 [INFO] Percentage of positive records lost: 1.23%
2025-05-21 17:00:37,259 [INFO] Percentage of negative records lost: 21.14%


In [10]:
# Target vector, and the feature matrix (every column except the target)
y = df[label]
X = df.drop(columns=label)

In [11]:
# Stratified train / validation / test split: 70% / 15% / 15%.
# Step 1: hold out 30% of the data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=99,
    stratify=y,
)
# Step 2: cut the held-out part in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=99,
    stratify=y_temp,
)

# Sizes of each split
logger.info(f"Train set shape: {X_train.shape}, {y_train.shape}")
logger.info(f"Validation set shape: {X_val.shape}, {y_val.shape}")
logger.info(f"Test set shape: {X_test.shape}, {y_test.shape}")

# Class proportions per split (shows how imbalanced the target is)
logger.info(f"Train set target variable distribution: {y_train.value_counts(normalize=True)}")
logger.info(f"Validation set target variable distribution: {y_val.value_counts(normalize=True)}")
logger.info(f"Test set target variable distribution: {y_test.value_counts(normalize=True)}")

2025-05-21 17:00:37,433 [INFO] Train set shape: (58113, 40), (58113,)
2025-05-21 17:00:37,436 [INFO] Validation set shape: (12453, 40), (12453,)
2025-05-21 17:00:37,438 [INFO] Test set shape: (12453, 40), (12453,)
2025-05-21 17:00:37,443 [INFO] Train set target variable distribution: y
False    0.935092
True     0.064908
Name: proportion, dtype: float64
2025-05-21 17:00:37,447 [INFO] Validation set target variable distribution: y
False    0.935116
True     0.064884
Name: proportion, dtype: float64
2025-05-21 17:00:37,452 [INFO] Test set target variable distribution: y
False    0.935116
True     0.064884
Name: proportion, dtype: float64


# Evaluate which model algorithms should be considered for further testing

In [12]:
# from lazypredict.Supervised import LazyClassifier

# # Split the data into Train, Test and Validation sets
# # 70% for training, 15% for testing, 15% for validation
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
# logger.info(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}, Validation set shape: {X_val.shape}")

In [13]:
# df_lz = LazyClassifier(predictions=True)
# # Fit the model
# models = df_lz.fit(X_train, X_test, y_train, y_test)
# # Get the best model
# best_model = models[0].iloc[0]
# logger.info(f"Best model: {best_model['Model']}")

# Imbalanced training set
Since there are far fewer positive instances (lncRNAs) than negative instances (protein coding), we can:
1. Oversample the minority class -- lncRNAs
2. Undersample the majority class -- protein coding
3. Use imbalance-resistant machine-learning algorithms (Random Forests)

## 1. Use SMOTE to oversample lncRNA instances

In [14]:
# Oversample the minority class with SMOTE, using the training split only
smote = SMOTE(random_state=99)
X_smote, y_smote = smote.fit_resample(
    X_train,
    y_train,
)
logger.info(f"Resampled dataset shape: {X_smote.shape}, {y_smote.shape}")

2025-05-21 17:00:37,758 [INFO] Resampled dataset shape: (108682, 40), (108682,)


In [15]:
# Random Forest trained on the SMOTE-resampled training data
output_prefix = 'rf_model_smote_redux'

rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=16, random_state=99)
rf.fit(X_smote, y_smote)
logger.info("Random Forest classifier trained.")

# Persist the fitted model (the cell output shows the path that was written)
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')


2025-05-21 17:00:44,916 [INFO] Random Forest classifier trained.


['/home/chlab/flync/src/train/rf_model_smote_redux.joblib']

In [16]:
# 5-fold CV on the SMOTE data, plus the untouched test and validation splits
evaluate_model(
    rf, output_prefix,
    X_smote, y_smote,
    X_test, y_test,
    X_val=X_val, y_val=y_val,
    folds=5,
)

2025-05-21 17:01:09,867 [INFO] Model metrics saved to rf_model_smote_redux_cv_metrics.txt
2025-05-21 17:01:10,005 [INFO] Model metrics saved to rf_model_smote_redux_test_metrics.txt
2025-05-21 17:01:10,162 [INFO] Model metrics saved to rf_model_smote_redux_val_metrics.txt
2025-05-21 17:01:10,164 [INFO] Model metrics saved to rf_model_smote_redux_final_metrics.txt


In [17]:
output_prefix = 'xgb_model_smote_redux'

# Train an XGBoost classifier on the SMOTE-resampled training data.
# XGBClassifier is already imported in the first cell, so the duplicate
# mid-notebook import was removed.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=99,
    n_jobs=16
)

xgb.fit(X_smote, y_smote)
logger.info("XGBoost classifier trained.")
# Save the model
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

2025-05-21 17:01:14,373 [INFO] XGBoost classifier trained.


['/home/chlab/flync/src/train/xgb_model_smote_redux.joblib']

In [18]:
# 5-fold CV on the SMOTE data, plus the untouched test and validation splits
evaluate_model(
    xgb, output_prefix,
    X_smote, y_smote,
    X_test, y_test,
    X_val=X_val, y_val=y_val,
    folds=5,
)

2025-05-21 17:01:27,837 [INFO] Model metrics saved to xgb_model_smote_redux_cv_metrics.txt
2025-05-21 17:01:27,881 [INFO] Model metrics saved to xgb_model_smote_redux_test_metrics.txt
2025-05-21 17:01:27,923 [INFO] Model metrics saved to xgb_model_smote_redux_val_metrics.txt
2025-05-21 17:01:27,925 [INFO] Model metrics saved to xgb_model_smote_redux_final_metrics.txt


## 2. Oversampling and Undersampling

In [19]:
# Balance the training split with SMOTETomek
smote_tomek = SMOTETomek(random_state=99)
X_smote_tomek, y_smote_tomek = smote_tomek.fit_resample(
    X_train,
    y_train,
)
logger.info(f"Resampled dataset shape: {X_smote_tomek.shape}, {y_smote_tomek.shape}")

2025-05-21 17:01:34,734 [INFO] Resampled dataset shape: (107446, 40), (107446,)


In [20]:
# Random Forest trained on the SMOTETomek-resampled training data
output_prefix = 'rf_model_smote_tomek_redux'

rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=16, random_state=99)
rf.fit(X_smote_tomek, y_smote_tomek)
logger.info("Random Forest classifier trained.")

# Persist the fitted model (the cell output shows the path that was written)
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

2025-05-21 17:01:41,304 [INFO] Random Forest classifier trained.


['/home/chlab/flync/src/train/rf_model_smote_tomek_redux.joblib']

In [21]:
# 5-fold CV on the SMOTETomek data, plus the untouched test and validation splits
evaluate_model(
    rf, output_prefix,
    X_smote_tomek, y_smote_tomek,
    X_test, y_test,
    X_val=X_val, y_val=y_val,
    folds=5,
)

2025-05-21 17:02:03,048 [INFO] Model metrics saved to rf_model_smote_tomek_redux_cv_metrics.txt
2025-05-21 17:02:03,196 [INFO] Model metrics saved to rf_model_smote_tomek_redux_test_metrics.txt
2025-05-21 17:02:03,342 [INFO] Model metrics saved to rf_model_smote_tomek_redux_val_metrics.txt
2025-05-21 17:02:03,344 [INFO] Model metrics saved to rf_model_smote_tomek_redux_final_metrics.txt


In [22]:
# XGBoost trained on the SMOTETomek-resampled training data
output_prefix = 'xgb_model_smote_tomek_redux'

xgb = XGBClassifier(n_estimators=100, max_depth=10, n_jobs=16, random_state=99)
xgb.fit(X_smote_tomek, y_smote_tomek)
logger.info("XGBoost classifier trained.")

# Persist the fitted model (the cell output shows the path that was written)
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

2025-05-21 17:02:08,058 [INFO] XGBoost classifier trained.


['/home/chlab/flync/src/train/xgb_model_smote_tomek_redux.joblib']

In [23]:
# Evaluate the XGBoost model trained in the previous cell. `rf` used to be
# passed here, so the Random Forest was scored again and its metrics were
# written to the xgb_model_smote_tomek_redux_* files.
evaluate_model(xgb, output_prefix, X_smote_tomek, y_smote_tomek, X_test, y_test, X_val, y_val, folds=5)

2025-05-21 17:02:31,879 [INFO] Model metrics saved to xgb_model_smote_tomek_redux_cv_metrics.txt
2025-05-21 17:02:32,026 [INFO] Model metrics saved to xgb_model_smote_tomek_redux_test_metrics.txt
2025-05-21 17:02:32,175 [INFO] Model metrics saved to xgb_model_smote_tomek_redux_val_metrics.txt
2025-05-21 17:02:32,178 [INFO] Model metrics saved to xgb_model_smote_tomek_redux_final_metrics.txt


## 3. Do not change the data and use class weights to train the model

In [24]:
# Random Forest on the original (non-resampled) training data, using
# class_weight='balanced' instead of resampling
output_prefix = 'rf_model_redux'

rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    max_depth=10,
    n_jobs=16,
    random_state=99,
)
rf.fit(X_train, y_train)
logger.info("Random Forest classifier trained.")

# Persist the fitted model (the cell output shows the path that was written)
joblib.dump(rf, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

2025-05-21 17:02:34,710 [INFO] Random Forest classifier trained.


['/home/chlab/flync/src/train/rf_model_redux.joblib']

In [25]:
# 5-fold CV on the original training data, plus the test and validation splits
evaluate_model(
    rf, output_prefix,
    X_train, y_train,
    X_test, y_test,
    X_val=X_val, y_val=y_val,
    folds=5,
)

2025-05-21 17:02:44,510 [INFO] Model metrics saved to rf_model_redux_cv_metrics.txt
2025-05-21 17:02:44,681 [INFO] Model metrics saved to rf_model_redux_test_metrics.txt
2025-05-21 17:02:44,834 [INFO] Model metrics saved to rf_model_redux_val_metrics.txt
2025-05-21 17:02:44,837 [INFO] Model metrics saved to rf_model_redux_final_metrics.txt


In [26]:
# XGBoost on the original (non-resampled) training data, weighting the
# positive class instead of resampling
output_prefix = 'xgb_model_redux'

# Negative-to-positive ratio in the training split
scale_pos_weight = y_train.eq(0).sum() / y_train.eq(1).sum()
logger.info(f"Scale pos weight: {scale_pos_weight}")

xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    n_estimators=100,
    max_depth=10,
    n_jobs=16,
    random_state=99,
)
xgb.fit(X_train, y_train)
logger.info("XGBoost classifier trained.")

# Persist the fitted model (the cell output shows the path that was written)
joblib.dump(xgb, f'/home/chlab/flync/src/train/{output_prefix}.joblib')

2025-05-21 17:02:44,849 [INFO] Scale pos weight: 14.406415694591729
2025-05-21 17:02:47,673 [INFO] XGBoost classifier trained.


['/home/chlab/flync/src/train/xgb_model_redux.joblib']

In [27]:
# 5-fold CV on the original training data, plus the test and validation splits
evaluate_model(
    xgb, output_prefix,
    X_train, y_train,
    X_test, y_test,
    X_val=X_val, y_val=y_val,
    folds=5,
)

2025-05-21 17:03:02,271 [INFO] Model metrics saved to xgb_model_redux_cv_metrics.txt
2025-05-21 17:03:02,340 [INFO] Model metrics saved to xgb_model_redux_test_metrics.txt
2025-05-21 17:03:02,425 [INFO] Model metrics saved to xgb_model_redux_val_metrics.txt
2025-05-21 17:03:02,427 [INFO] Model metrics saved to xgb_model_redux_final_metrics.txt
