In [4]:
import pandas as pd

# Read the three dataset splits in a fixed order: train, validation, test.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train_dataset.csv",
        "../data/validation_dataset.csv",
        "../data/test_dataset.csv",
    )
)
print("Datasets reloaded ✅")


Datasets reloaded ✅


In [5]:
# Amino acid composition
AMINO_ACIDS = list("ACDEFGHIKLMNPQRSTVWY")

def aa_composition(seq):
    """Return the fraction of each amino acid in ``AMINO_ACIDS`` within ``seq``.

    Parameters
    ----------
    seq : str
        Sequence of one-letter residue codes. Characters that are not in
        ``AMINO_ACIDS`` are not counted, but they still contribute to the
        length used as the denominator, so the fractions may sum to less
        than 1.

    Returns
    -------
    dict
        Maps every entry of ``AMINO_ACIDS`` (in that order) to a float
        fraction. An empty sequence yields 0.0 for every amino acid.
    """
    counts = dict.fromkeys(AMINO_ACIDS, 0)
    for aa in seq:
        if aa in counts:
            counts[aa] += 1

    length = len(seq)
    if length == 0:
        # No residues at all: return zeros rather than dividing by zero.
        return {aa: 0.0 for aa in AMINO_ACIDS}
    return {aa: counts[aa] / length for aa in AMINO_ACIDS}

# Hydrophobicity scale
# NOTE(review): the values appear to match the Kyte-Doolittle hydropathy
# index -- confirm the intended source of the scale.
hydropathy = {
    'A': 1.8,  'C': 2.5,  'D': -3.5, 'E': -3.5, 'F': 2.8,
    'G': -0.4, 'H': -3.2, 'I': 4.5,  'K': -3.9, 'L': 3.8,
    'M': 1.9,  'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
    'S': -0.8, 'T': -0.7, 'V': 4.2,  'W': -0.9, 'Y': -1.3
}

def add_physchem_features(df):
    """Return a copy of ``df`` with three sequence-derived columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'sequence'`` column of strings. The input frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns:

        * ``seq_length`` -- number of characters in the sequence.
        * ``hydrophobicity`` -- mean ``hydropathy`` value over the sequence;
          characters missing from the table contribute 0.
        * ``aromatic_fraction`` -- share of F, W and Y characters.

        Empty sequences get 0.0 for both averaged features instead of
        raising ``ZeroDivisionError``.
    """
    def _mean_hydropathy(seq):
        # Guard the division: an empty sequence has no residues to average.
        if not seq:
            return 0.0
        return sum(hydropathy.get(aa, 0) for aa in seq) / len(seq)

    def _aromatic_fraction(seq):
        # Same guard as above for the F/W/Y share.
        if not seq:
            return 0.0
        return sum(seq.count(aa) for aa in "FWY") / len(seq)

    df = df.copy()
    df['seq_length'] = df['sequence'].str.len()
    df['hydrophobicity'] = df['sequence'].apply(_mean_hydropathy)
    df['aromatic_fraction'] = df['sequence'].apply(_aromatic_fraction)
    return df

def build_full_features(seq_df, feats_df):
    """Assemble the model-ready feature table for one dataset split.

    Parameters
    ----------
    seq_df : pandas.DataFrame
        Must contain ``'sequence'`` and ``'label'`` columns.
    feats_df : pandas.DataFrame
        Must contain ``'seq_length'``, ``'hydrophobicity'`` and
        ``'aromatic_fraction'`` columns. Column assignment aligns on the
        index, so it should share ``seq_df``'s index.

    Returns
    -------
    pandas.DataFrame
        One column per key returned by ``aa_composition``, followed by
        ``seq_length``, ``hydrophobicity``, ``aromatic_fraction`` and
        ``label``, indexed like ``seq_df``.
    """
    # Build the composition table with a single DataFrame construction.
    # `.apply(pd.Series)` creates one Series object per row, which gets very
    # slow on large frames; a list of dicts gives the same columns and values.
    composition_rows = [aa_composition(seq) for seq in seq_df['sequence']]
    aa_feats = pd.DataFrame(composition_rows, index=seq_df.index)

    aa_feats['seq_length'] = feats_df['seq_length']
    aa_feats['hydrophobicity'] = feats_df['hydrophobicity']
    aa_feats['aromatic_fraction'] = feats_df['aromatic_fraction']
    aa_feats['label'] = seq_df['label']
    return aa_feats

# Add the physicochemical columns to each split (train, validation, test).
train_extra, val_extra, test_extra = (
    add_physchem_features(split) for split in (train, val, test)
)

# Combine amino-acid composition, physicochemical columns and label per split.
train_full, val_full, test_full = (
    build_full_features(raw_split, extra_split)
    for raw_split, extra_split in (
        (train, train_extra),
        (val, val_extra),
        (test, test_extra),
    )
)

print("Feature sets rebuilt ✅")


Feature sets rebuilt ✅


In [6]:
from xgboost import XGBClassifier

# Prepare data again
# Single source of truth for the label encoding used by every split.
LABEL_MAP = {'Soluble': 1, 'Insoluble': 0}

X_train = train_full.drop(columns=['label'])
y_train = train_full['label'].map(LABEL_MAP)

X_test = test_full.drop(columns=['label'])
y_test = test_full['label'].map(LABEL_MAP)

# Series.map turns any label missing from LABEL_MAP into NaN without warning;
# fail here with a clear message instead of training or scoring on NaN targets.
for _split_name, _encoded in (("train", y_train), ("test", y_test)):
    if _encoded.isna().any():
        raise ValueError(
            f"{_split_name} split contains labels outside {sorted(LABEL_MAP)}"
        )

# Class balance for scale_pos_weight (negatives / positives), counted with the
# vectorised Series.sum rather than iterating the Series with builtin sum().
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos
if n_pos == 0:
    raise ValueError("Training set has no 'Soluble' examples; cannot compute scale_pos_weight")

# Retrain the final model
xgb2 = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_neg / n_pos,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss"
)

xgb2.fit(X_train, y_train)
print("Model retrained successfully ✅")


Model retrained successfully ✅


In [7]:
import joblib
import json
from pathlib import Path

# Create the output directory (and any missing parents) if it is not there yet.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Serialise the fitted classifier with joblib.
joblib.dump(xgb2, "../models/xgb2_solubility.joblib")
print("Saved model -> ../models/xgb2_solubility.joblib")

# Write the training column names, in order, as a JSON list next to the model.
feature_cols = X_train.columns.tolist()
with open("../models/feature_cols.json", "w") as cols_file:
    cols_file.write(json.dumps(feature_cols))
print("Saved feature columns -> ../models/feature_cols.json")


Saved model -> ../models/xgb2_solubility.joblib
Saved feature columns -> ../models/feature_cols.json
