# 1. Loading the PETase dataset 

In [None]:
# Load the PETase master table from "data/masterdb.tsv" (tab-separated)
import pandas as pd 
import os 
# Read the PETase master table; the file is tab-separated, hence sep="\t".
path = "data/masterdb.tsv"
df = pd.read_csv(path, sep="\t")

# Normalise the headers into plain snake_case identifiers: trim surrounding
# whitespace, lowercase, turn spaces into underscores and drop "(", ")", "?".
# regex=False is passed explicitly because those three characters are regex
# metacharacters and the default of `regex` differs between pandas versions;
# a literal replacement is what is intended here.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("?", "", regex=False)
)

# Two-column view (id, sequence) in the layout expected by dftofasta().
# NOTE(review): astype(str) may turn missing sequences into the literal string
# "nan" depending on the pandas version - confirm the column has no gaps.
df2 = pd.DataFrame(
    {
        "id": df["name"],
        "sequence": df["protein_sequence"].astype(str),
    }
)
print(df2)

def dftofasta(df,outfile):
    """Write the ``id`` and ``sequence`` columns of *df* to a FASTA file.

    Each row becomes two lines: a ``>`` header carrying the row's ``id``
    followed by a line carrying its ``sequence``. Sequences are written on a
    single line (no wrapping).

    Args:
        df: DataFrame with ``id`` and ``sequence`` columns.
        outfile: Destination path; an existing file is overwritten.

    Returns:
        ``outfile``, unchanged, so the call can be chained.

    Raises:
        KeyError: if either required column is missing.
    """
    # Iterate the two columns directly instead of DataFrame.iterrows():
    # iterrows() builds a Series per row (slow on large tables) and may
    # upcast values to a common dtype.
    records = zip(df["id"], df["sequence"])
    with open(outfile, "w") as f:
        f.writelines(f">{seq_id}\n{seq}\n" for seq_id, seq in records)
    return outfile

#dftofasta(df2,"data/masterdb.fasta")

# 2. Loading external stability and expression datasets

**NESG Solubility** 
(https://loschmidt.chemi.muni.cz/soluprot/?page=download)
* 10k proteins
* Labels: exp, sol, uniprot id or local ID 
* Units: integer 

**Soluprot Solubility**
(https://loschmidt.chemi.muni.cz/soluprot/?page=download)
* 11k training, 3k test
* Label: solubility; numeric IDs with no conversion map (sequences included)
* Unit: 0/1

**Price Solubility**
(https://pmc.ncbi.nlm.nih.gov/articles/PMC3372292/)
* 7k proteins 
* Label: usability. uniprot id
* Unit: 0/1

**PSI Solubility** 
(https://academic.oup.com/bioinformatics/article/36/18/4691/5860015?login=false)
* 11k proteins
* Label: solubility, Aa0000 ID scheme (has seq)
* Unit: 0/1

**Meltome Stability** 
(https://meltomeatlas.proteomics.wzw.tum.de/master_meltomeatlasapp/)
* 1M variants 
* Label: temperature, meltpoint, fold_change, uniprot id 

**FireprotDB Stability** 
(https://loschmidt.chemi.muni.cz/fireprotdb/)
* 53k variants
* Label: ddG, dTm, pH, Tm, mutation_effect, uniprot id 

**ThermomutDB Stability**
(https://biosig.lab.uq.edu.au/thermomutdb/downloads)
* 12k variants
* Label: pH, ddG, temperature, dTm, uniprot/pdb id 

**CAFA** 
(https://www.kaggle.com/competitions/cafa-5-protein-function-prediction/code)
* 142k variants

**Novozyme**
(https://www.kaggle.com/code/jinyuansun/eda-and-finetune-esm)
* 31k variants

**Protsol Solubility**
(https://huggingface.co/datasets/AI4Protein/ProtSolM)
* 71k proteins
* Label: solubility, no ID but has sequence
* Unit: 0/1 

In [None]:
#LOAD ALL BENCHMARK DATASETS AND MERGE THEM INTO ONE
import pandas as pd
import json
import numpy as np
from pathlib import Path
from Bio.SeqIO.FastaIO import SimpleFastaParser

# --- Solubility benchmarks -------------------------------------------------
PSI_PATH      = "data/temp/sol_benchmark/PSI_Biology_solubility_trainset.csv"
psi_detail_path = "data/temp/sol_benchmark/PSI_all_data_esol.tab"
# NOTE: only 1988/11000 PSI sequences matched a proteome gene, with most being
# lower than 50% similarity, so we drop PSI.
NESG_PATH     = "data/temp/sol_benchmark/nesg/nesg.csv"
nesg_fasta_path = "data/temp/sol_benchmark/nesg/nesg.fasta"
PRICE_PATH    = "data/temp/sol_benchmark/Price_usability_trainset.csv"
# This name was previously left without a value (`price_fasta=`), which is a
# syntax error and prevented the whole cell from running. load_price() takes
# sequences from the CSV's own "fasta" column, so no separate FASTA path is
# needed; set a real path here if that ever changes.
price_fasta = None
soluprot_train_path = "data/temp/sol_benchmark/soluprot_data/training_set.csv"
soluprot_test_path = "data/temp/sol_benchmark/soluprot_data/test_set.csv"

# --- Stability benchmarks --------------------------------------------------
meltome_path = "data/temp/stab_benchmark/meltome_cross-species.csv"
fireprot_path = "data/temp/stab_benchmark/fireprotdb_results_stability.csv"
thermomutdb_path = "data/temp/stab_benchmark/thermomutdb.json"

# --- ProtSolM --------------------------------------------------------------
protsol_train_path = "data/temp/protsolm_data/protsolm_train.csv"
protsol_test_path = "data/temp/protsolm_data/protsolm_test.csv"


def read_fasta_dict(path: str):
    """Parse the FASTA file at *path* into a ``{record_id: sequence}`` dict.

    The record id is the first whitespace-delimited token of each header
    line; the sequence is stored with surrounding whitespace stripped. When
    an id occurs more than once, the last record in the file wins.
    """
    with open(path) as handle:
        return {
            title.split()[0].strip(): residues.strip()
            for title, residues in SimpleFastaParser(handle)
        }

def load_nesg(csv_path: str, fasta_path: str) -> pd.DataFrame:
    """Load the NESG table and attach sequences looked up from a FASTA file.

    The CSV is read with its own header row (expected: id, exp, sol). A
    ``sequence`` column is added by mapping each ``id`` through the FASTA
    records; ids without a FASTA entry get a missing value.
    """
    nesg = pd.read_csv(csv_path)
    id_to_sequence = read_fasta_dict(fasta_path)
    nesg["sequence"] = nesg["id"].map(id_to_sequence)
    return nesg

def load_psi(csv_path: str,psi_detail_path) -> pd.DataFrame:
    """Load the PSI label table joined with its tab-separated detail table.

    Every row of the label CSV is kept (left join on ``sid``); the merged
    ``fasta`` column is then copied into a new ``sequence`` column.
    """
    labels = pd.read_csv(csv_path)
    details = pd.read_csv(psi_detail_path, sep="\t")
    combined = pd.merge(labels, details, how="left", on="sid")
    combined["sequence"] = combined["fasta"]
    return combined

def load_price(csv_path: str) -> pd.DataFrame:
    """Load the Price usability CSV and mirror ``fasta`` into ``sequence``.

    The file is expected to carry "sid", "usability" and "fasta" columns;
    only "fasta" is actually required by this function.
    """
    price = pd.read_csv(csv_path)
    return price.assign(sequence=price["fasta"])

def load_soluprot(
    train_csv: str,
    test_csv: str,
    train_fasta_path: str = "data/temp/sol_benchmark/soluprot_data/training_set.fasta",
    test_fasta_path: str = "data/temp/sol_benchmark/soluprot_data/test_set.fasta",
) -> pd.DataFrame:
    """Load the SoluProt train and test tables and attach their sequences.

    The two CSVs are stacked (train rows first) with a fresh integer index,
    and a ``sequence`` column is filled by looking up each ``sid`` (compared
    as a string) in the combined FASTA records.

    Args:
        train_csv: Path to the training-set CSV (needs a ``sid`` column).
        test_csv: Path to the test-set CSV (needs a ``sid`` column).
        train_fasta_path: FASTA file with the training sequences. Defaults to
            the location that used to be hard-coded in this function.
        test_fasta_path: FASTA file with the test sequences. Defaults to the
            location that used to be hard-coded in this function.

    Returns:
        The concatenated table with an added ``sequence`` column; ids with no
        FASTA record get a missing value.
    """
    test_seqs = read_fasta_dict(test_fasta_path)
    train_seqs = read_fasta_dict(train_fasta_path)
    # Test records are unpacked last, so they win if an id is in both files.
    id_to_seq = {**train_seqs, **test_seqs}

    frames = [pd.read_csv(train_csv), pd.read_csv(test_csv)]
    df = pd.concat(frames, ignore_index=True)
    # FASTA ids are strings, so cast "sid" before the lookup.
    df["sequence"] = df["sid"].astype(str).map(id_to_seq)
    return df

def load_meltome(csv_path: str) -> pd.DataFrame:
    """Load the Meltome CSV with a ``sequence`` column set to ``None``.

    No sequences are looked up here: every row gets ``None`` in ``sequence``
    (replacing any column of that name already in the file).
    Author's note, not checked by this code: expected columns are
    "Protein_ID", "gene_name", "fold_change", "meltPoint", "temperature".
    """
    return pd.read_csv(csv_path).assign(sequence=None)

def load_fireprot(csv_path: str) -> pd.DataFrame:
    """Load the FireProtDB stability CSV exactly as stored on disk.

    Author's note, not checked by this code: expected columns are
    "uniprot_id", "pdb_id", "mutation", "ddG", "dTm", "pH", "tm",
    "mutation_effect", "sequence".
    """
    return pd.read_csv(csv_path)

def load_thermomut(json_path: str) -> pd.DataFrame:
    """Load the ThermoMutDB JSON dump into a DataFrame.

    The parsed JSON is handed to ``pd.DataFrame`` unchanged.
    Author's note, not checked by this code: expected fields are "uniprot",
    "ph", "ddg", "temperature", "dtm", "PDB_wild", "pdb_mutant",
    "mutation_code".
    """
    with open(json_path) as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

def load_protsolm(train_csv: str, test_csv: str) -> pd.DataFrame:
    """Load the ProtSolM train and test CSVs as one table.

    Train rows come first, test rows follow, and the index is renumbered
    from zero.
    Author's note, not checked by this code: expected columns include
    "aa_seq" and "detail".
    """
    splits = [pd.read_csv(split_path) for split_path in (train_csv, test_csv)]
    return pd.concat(splits, ignore_index=True)

In [39]:
# Load every benchmark in a fixed order, then stack them row-wise. The tables
# do not share a schema, so concat takes the union of their columns and fills
# whatever a dataset lacks with NaN. PSI is not loaded here.
benchmark_frames = [
    load_nesg(NESG_PATH, nesg_fasta_path),
    load_price(PRICE_PATH),
    load_soluprot(soluprot_train_path, soluprot_test_path),
    load_meltome(meltome_path),
    load_fireprot(fireprot_path),
    load_thermomut(thermomutdb_path),
    load_protsolm(protsol_train_path, protsol_test_path),
]
merged = pd.concat(benchmark_frames, ignore_index=True)

print(merged, merged.shape)

  df = pd.read_csv(csv_path)
  df = pd.read_csv(csv_path)


              id  exp  sol                                           sequence  \
0        AR3338B  5.0  0.0  MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...   
1        AR3347A  0.0  0.0  IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...   
2        AR3353C  4.0  3.0  MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...   
3        AR3354C  0.0  0.0  ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...   
4        AR3358A  5.0  2.0  MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...   
...          ...  ...  ...                                                ...   
1281497      NaN  NaN  NaN                                                NaN   
1281498      NaN  NaN  NaN                                                NaN   
1281499      NaN  NaN  NaN                                                NaN   
1281500      NaN  NaN  NaN                                                NaN   
1281501      NaN  NaN  NaN                                                NaN   

         sid  Usability|0=N

# 3. The activity label of the PETase dataset 

**Docking** 

**PET catalysis rate** 

**PET-specific biophysical features** 

**Computational chemistry of PET and using it as a feature**

# 4. Fine-tuning the ESM-2 model

# 5. Fine-tuning the ESM-3 model

# 6. Graph Neural Network