In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ML Pipeline
## Set up training and test sets

In [3]:
# Load embedded data from dataset_embedded.parquet
df_embedded = pd.read_parquet('Embeddings/dataset_embedded.parquet')
print(df_embedded.shape)
# Print the number of NaN values in the dataset
print("Checking for NaN values in the dataset, if any:")
print(df_embedded.isnull().sum().sum())
df_embedded.head()

(1095027, 2052)
Checking for NaN values in the dataset, if any:
0


Unnamed: 0_level_0,UniProt_ID,pubchem_cid,kiba_score,kiba_score_estimated,prot_0,prot_1,prot_2,prot_3,prot_4,prot_5,...,mol_1014,mol_1015,mol_1016,mol_1017,mol_1018,mol_1019,mol_1020,mol_1021,mol_1022,mol_1023
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A0A0B4J268,7428.0,21400.0,1,-0.931152,-0.529297,-0.456055,1.015625,-0.727051,-0.509277,...,0,0,0,0,0,0,0,0,0,0
1,A0A0B4J268,65303.0,21300.0,1,-0.931152,-0.529297,-0.456055,1.015625,-0.727051,-0.509277,...,0,0,0,0,0,0,0,0,0,0
2,A0A0B4J268,96506.0,21400.0,1,-0.931152,-0.529297,-0.456055,1.015625,-0.727051,-0.509277,...,0,0,0,0,0,0,0,0,0,0
3,A0A0B4J268,174326.0,21300.0,1,-0.931152,-0.529297,-0.456055,1.015625,-0.727051,-0.509277,...,0,0,0,0,0,0,0,0,0,0
4,A0A0B4J268,225906.0,7910.0,1,-0.931152,-0.529297,-0.456055,1.015625,-0.727051,-0.509277,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Define the target variable and the features
X = df_embedded.drop(columns=['UniProt_ID', 'pubchem_cid', 'kiba_score'])  # The embeddings (protein and molecule features)
y = df_embedded['kiba_score']  # Target variable (KIBA score)

# Clean up memory
import gc
del df_embedded
gc.collect()

# Print the size of X and y
print("Size of X:", X.shape)    
print("Size of y:", y.shape)

Size of X: (1095027, 2049)
Size of y: (1095027,)


In [5]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Take a subset of the data for faster training (if needed)
# batch_size = 100000
# X_subset = X.iloc[:batch_size]
# y_subset = y.iloc[:batch_size]

# X_subset = X.sample(frac=0.75, random_state=42)
# y_subset = y.loc[X_subset.index]

X_subset = X
y_subset = y

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

# Convert features and target to float32
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Prepare the data for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set the parameters for the model
params = {
    'objective': 'regression',
    'metric': 'l2'} # You can use 'l2' for mean squared error

# Train the model with early stopping
model = lgb.train(params,
                  train_data,
                  valid_sets=[test_data],
                  num_boost_round=1000,
                  callbacks=[
                    lgb.early_stopping(stopping_rounds=50),
                    lgb.log_evaluation(1)
                    ]
                )

# Get the best number of estimators (iterations) based on early stopping
print(f"Best number of estimators: {model.best_iteration}")

# Save the best model to a file
model.save_model('Models/best_lgbm_model_full.txt')

# Predict on the test set using the best model
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluate the performance using Mean Squared Error and R^2 Score
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 22.584805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263041
[LightGBM] [Info] Number of data points in the train set: 876021, number of used features: 2049
[LightGBM] [Info] Start training from score 25475.382141
[1]	valid_0's l2: 4.70008e+11
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l2: 4.67764e+11
[3]	valid_0's l2: 4.70589e+11
[4]	valid_0's l2: 4.75537e+11
[5]	valid_0's l2: 4.81011e+11
[6]	valid_0's l2: 4.88611e+11
[7]	valid_0's l2: 4.95686e+11
[8]	valid_0's l2: 5.04022e+11
[9]	valid_0's l2: 4.90263e+11
[10]	valid_0's l2: 4.85651e+11
[11]	valid_0's l2: 4.93585e+11
[12]	valid_0's l2: 4.85776e+11
[13]	valid_0's l2: 4.76867e+11
[14]	valid_0's l2: 4.84296e+11
[15]	valid_0's l2: 4.75506e+11
[16]	valid_0's l2: 4.7124e+11
[17]	valid_0's l2: 4.7796e+11
[18]	valid_0's l2: 4.7536e+11
[19]	valid_0's l2: 4.71235e+11
[20]	valid