# Comparing model against baseline

In [5]:
# load data
import numpy as np
import pandas as pd
# Directory holding the model inputs, relative to this notebook's folder.
# Defined once so the location only has to be changed in a single place.
INIT_MODEL_DIR = "../../../../../modelling/init_model"

# Stacked feature array; later cells index it as [:, :, channel].
gene_matrix_array = np.load(f"{INIT_MODEL_DIR}/gene_matrix_list.npy")
# Expression table; later cells read its "expression" column.
rna_expression_df = pd.read_csv(f"{INIT_MODEL_DIR}/rna_expression_list.csv")

# Notebook location (from the project root): Results/slurm_output/init_class_results/mlp/dnam_target/model_vs_baseline.ipynb
# Data location (from the project root): modelling/init_model/gene_matrix_list.npy

In [6]:
# Split the stacked array into one slice per signal, taken along the third
# axis in the order: 0 = DNA methylation, 1 = H3K9me3, 2 = H3K27me3.
dnam_features, h3k9me3_features, h3k27me3_features = (
    gene_matrix_array[:, :, channel] for channel in range(3)
)

# Binarise expression: 1 where the value is strictly positive, otherwise 0.
rna_expression = (rna_expression_df["expression"].values > 0).astype(int)

# Same labels as a single column (the original note gives shape (58780, 1)).
expression_values = rna_expression.reshape(-1, 1)

In [7]:
# get y
# Set DNA methylation as target - altering y to become a count of 1s in each 4000 length array

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Binary expression flag (1 if expression > 0), reshaped into one column.
rna_expression = (rna_expression_df["expression"].values > 0).astype(int)
X_expression = rna_expression.reshape(-1, 1)  # expected shape (58780, 1)

# Both histone-mark feature sets placed side by side along axis 1.
X_histone = np.concatenate(
    (h3k9me3_features, h3k27me3_features),
    axis=1,
)  # expected shape (58780, 8000)

# Design matrix: histone features followed by the expression column.
X = np.concatenate((X_histone, X_expression), axis=1)

# Target: row-wise sum of the DNA methylation features.
y = dnam_features.sum(axis=1)  # expected shape (58780,)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Baseline predictor: the training-set mean of y, repeated for every test row.
y_train_mean = np.mean(y_train)

# np.full infers the array dtype from the fill value, so the mean keeps its
# floating-point precision. np.full_like(y_test, ...) would instead inherit
# y_test's dtype; because y is a row-wise sum, an integer-valued feature array
# would make y integer and silently truncate the mean to a whole number.
y_baseline_pred = np.full(np.shape(y_test), y_train_mean)

# Reference error that each model's test MSE is compared against.
baseline_mse = mean_squared_error(y_test, y_baseline_pred)



In [14]:
def compare_mse(model_mse, baseline_mse):
    """Print both MSE values and report the model's gain over the baseline.

    Args:
        model_mse: Mean squared error achieved by the model.
        baseline_mse: Mean squared error of the baseline predictor. It is the
            denominator of the percentage, so it must be non-zero.

    Returns:
        The percentage reduction in MSE relative to the baseline. Positive
        when the model's error is lower than the baseline's, negative when
        it is higher.
    """
    print(f"Baseline MSE: {baseline_mse}")
    print(f"Model MSE: {model_mse}")

    # Share of the baseline error removed by the model, scaled to percent.
    error_reduction = baseline_mse - model_mse
    gain_pct = (error_reduction / baseline_mse) * 100

    print("Improvement on baseline: ", gain_pct)
    return gain_pct

In [15]:
# MLP vs. baseline
# NOTE(review): the test MSE is hard-coded here, presumably copied from the
# MLP run's results — confirm against the source logs.
mlp_mse = 7425.3827
compare_mse(model_mse=mlp_mse, baseline_mse=baseline_mse)

Baseline MSE: 10319.948376161663
Model MSE: 7425.3827
Improvement on baseline:  28.04825732314612


28.04825732314612

In [16]:
# LightGBM vs. baseline
# NOTE(review): the test MSE is hard-coded here, presumably copied from the
# LGBM run's results — confirm against the source logs.
lgbm_mse = 6581.1190
compare_mse(model_mse=lgbm_mse, baseline_mse=baseline_mse)

Baseline MSE: 10319.948376161663
Model MSE: 6581.119
Improvement on baseline:  36.2291480526985


36.2291480526985

In [19]:
# XGBoost vs. baseline
# NOTE(review): the test MSE is hard-coded here, presumably copied from the
# XGBoost run's results — confirm against the source logs.
xgb_mse = 6689.7580
compare_mse(model_mse=xgb_mse, baseline_mse=baseline_mse)

Baseline MSE: 10319.948376161663
Model MSE: 6689.758
Improvement on baseline:  35.176439298350964


35.176439298350964

In [18]:
# Random forest vs. baseline
# NOTE(review): the test MSE is hard-coded here, presumably copied from the
# RF run's results — confirm against the source logs.
rf_mse = 7097.761
compare_mse(model_mse=rf_mse, baseline_mse=baseline_mse)

Baseline MSE: 10319.948376161663
Model MSE: 7097.761
Improvement on baseline:  31.222902079672053


31.222902079672053