# Notebook for downloading and preparing data
This notebook downloads labeled datasets and prepares them for training. The code is mostly copied from https://github.com/RSchmirler/data-repo_plm-finetune-eval with small adaptations where necessary. 

In [96]:
# Define the function to download and prepare data
import requests
import zipfile
from io import BytesIO
import pandas as pd

def download_data(url, file_name):
    """Download and prepare data from a zipped CSV file."""
    # Download the zip file
    response = requests.get(url)
    zip_file = zipfile.ZipFile(BytesIO(response.content))
    with zip_file.open(file_name) as file:
        df = pd.read_csv(file)
    # Rename 'target' column to 'label'
    df = df.rename(columns={"target":"label"})

    # Split the data into train, validation, and test sets
    test = df[df.set=="test"]
    train_valid_df = df[df.set!="test"].reset_index(drop=True)

    train=train_valid_df[train_valid_df.validation!=True].reset_index(drop=True)
    valid=train_valid_df[train_valid_df.validation==True].reset_index(drop=True)

    keep_columns = ["sequence","label"]
    return train[keep_columns], valid[keep_columns], test[keep_columns]

# Sequence Regression Example Data Preparation 
GB1 dataset from https://github.com/J-SNACKKB/FLIP

"The GB1 "four" variations set stems from a 2016 publication in which mutations at four sites (V39, D40, G41 and V54) were probed against a binding assay. The full WT GB1 sequence was never included in the dataset, so it was inferred from side chain A of PDB's 5LDE.

`three_vs_rest`: `train` is wild type, all single, double & triple mutations, `test` is everything else."

In [97]:
GB1_URL = 'https://github.com/J-SNACKKB/FLIP/raw/main/splits/gb1/splits.zip'
GB1_FILENAME = 'splits/three_vs_rest.csv'
GB1_train_df, GB1_valid_df, GB1_test_df = download_data(GB1_URL, GB1_FILENAME)

# Print dataset info
print("train dataset size", GB1_train_df.size)
print("validation dataset size:", GB1_valid_df.size)
print("test dataset size:", GB1_test_df.size)

train dataset size 5382
validation dataset size: 598
test dataset size: 11486


In [109]:
GB1_train_df.head()

Unnamed: 0,sequence,label
0,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYD...,1.0
1,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGIDGEWTYD...,1.445905
2,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGLDGEWTYD...,1.690164
3,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGMDGEWTYD...,1.17055
4,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVAGEWTYD...,2.401243


In [98]:
import os
os.makedirs("training_data/gb1", exist_ok=True)
GB1_train_df.to_csv("training_data/gb1/gb1_seq_regression_train.csv",index=False)
GB1_valid_df.to_csv("training_data/gb1/gb1_seq_regression_validation.csv",index=False)
GB1_test_df.to_csv("training_data/gb1/gb1_seq_regression_test.csv",index=False)

# Sequence Classification Example Data Preparation

Subcellular location dataset from https://github.com/J-SNACKKB/FLIP/tree/main/splits/scl

"
The six SCL (SubCellularLocation) splits stems from a 2021 publication (based on a 2017 publication) and a 2022 publicaiton which aim at predicting protein subcellular location.

The possible subcellular localizations (in the splits, assigned to TARGET) are Cytoplasm, Nucleus, Cell membrane, Mitochondrion, Endoplasmic reticulum, Lysosome/Vacuole, Golgi apparatus, Peroxisome, Extracellular and Plastid.
"



In [99]:
# Download the zip file from GitHub
SCL_URL = 'https://github.com/J-SNACKKB/FLIP/raw/main/splits/scl/splits.zip'
SCL_FILENAME = 'splits/mixed_soft.csv'
SCL_train_df, SCL_valid_df, SCL_test_df = download_data(SCL_URL, SCL_FILENAME)

In [100]:
# Print dataset size
print("train dataset size", SCL_train_df.size)
print("validation dataset size:", SCL_valid_df.size)
print("test dataset size:", SCL_test_df.size)

# Print label distribution
# Create a DataFrame with label distributions for each split
label_dist = pd.DataFrame({
    'train': SCL_train_df['label'].value_counts().sort_index(),
    'validation': SCL_valid_df['label'].value_counts().sort_index(),
    'test': SCL_test_df['label'].value_counts().sort_index()
}).fillna(0).astype(int)

print(label_dist)


train dataset size 19006
validation dataset size: 3356
test dataset size: 5536
                       train  validation  test
label                                         
Cell membrane            906         161   273
Cytoplasm               1862         299   505
Endoplasmic reticulum    594          95   173
Extracellular           1322         243   393
Golgi apparatus          238          48    70
Lysosome/Vacuole         214          42    64
Mitochondrion           1003         200   302
Nucleus                 2752         476   806
Peroxisome               101          23    30
Plastid                  511          91   152


In [101]:
# Save to csv files
os.makedirs("training_data/scl", exist_ok=True)
SCL_train_df.to_csv("training_data/scl/scl_seq_classification_train.csv",index=False)
SCL_valid_df.to_csv("training_data/scl/scl_seq_classification_validation.csv",index=False)
SCL_test_df.to_csv("training_data/scl/scl_seq_classification_test.csv",index=False)

# Token Regression Example Data Preparation 

AAV fitness dataset from https://github.com/J-SNACKKB/FLIP/tree/main/splits/aav
We use two_vs_many split. 

"
The original sequence from the aav study is UniProt P03135. A copy of the wildtype sequence can be found in this folder as P03135.fasta On the reference sequence, mutations where introduced starting from region `[561, 588]`, which reflects the AA sequence: `DEEEIRTTNPVATEQYGSVSTNLQRGNR`.

`two_vs_many`: train if in "des" with levenshtein_distance <= 2 , test if in "des" with levenshtein_distance > 2
"

In [102]:
# Download the zip file from GitHub
AAV_URL = 'https://github.com/J-SNACKKB/FLIP/raw/main/splits/aav/splits.zip'
AAV_FILENAME = 'splits/two_vs_many.csv'
AAV_train_df, AAV_valid_df, AAV_test_df = download_data(AAV_URL, AAV_FILENAME)

In [103]:
# Print dataset info
print("train dataset size", AAV_train_df.size)
print("validation dataset size:", AAV_valid_df.size)
print("test dataset size:", AAV_test_df.size)
print(AAV_train_df.head())

train dataset size 460104
validation dataset size: 6362
test dataset size: 101552
                                            sequence     label
0  MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV... -6.824780
1  MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV... -6.500402
2  MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...  0.900998
3  MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV... -4.843517
4  MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV... -6.566587


In [104]:
# Save to csv files
os.makedirs("training_data/aav", exist_ok=True)
AAV_train_df.to_csv("training_data/aav/aav_token_regression_train.csv",index=False)
AAV_valid_df.to_csv("training_data/aav/aav_token_regression_validation.csv",index=False)
AAV_test_df.to_csv("training_data/aav/aav_token_regression_test.csv",index=False)

# Token Classification Data Preparation Example

Secondary structure prediction from https://github.com/J-SNACKKB/FLIP/tree/main/splits/secondary_structure

"
The secondary structure split stem from three different publications, cited at the end, which aims at predicting the conservation score of the residues of a protein sequence.

This is a well-known dataset and it is used to validate the behavior of code and models. Only provided a sampled split for this purpose.

There are 9712 proteins for training, 1080 proteins for validation and 648 proteins for testing.

`sampled`: Randomly split sequences into train/test with 95/5% probability.
"


In [105]:
from Bio import SeqIO
import tempfile

# Download the zip file from GitHub
SS_URL = 'https://github.com/J-SNACKKB/FLIP/raw/main/splits/secondary_structure/splits.zip'

# The structure of this dataset is different, so we handle it separately. 
# Adapted from RSchmirler et al. 

response = requests.get(SS_URL)
zip_file = zipfile.ZipFile(BytesIO(response.content))

# Extract the fasta file to a temporary directory
# Sequence File
with tempfile.TemporaryDirectory() as temp_dir:
    zip_file.extract('splits/sequences.fasta', temp_dir)

    # Load the fasta files
    fasta_file = open(temp_dir + '/splits/sequences.fasta')
    
    # Load FASTA file using Biopython
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append([record.name, str(record.seq)])

    # Create dataframe
    df = pd.DataFrame(sequences, columns=["name", "sequence"])

# Mask File
with tempfile.TemporaryDirectory() as temp_dir:
    zip_file.extract('splits/mask.fasta', temp_dir)

    # Load the fasta files
    fasta_file = open(temp_dir + '/splits/mask.fasta')
    
    # Load FASTA file using Biopython
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append([str(record.seq)])

    # Add to dataframe
    df = pd.concat([df, pd.DataFrame(sequences, columns=["mask"])], axis=1) 
    
# Label File
with tempfile.TemporaryDirectory() as temp_dir:
    zip_file.extract('splits/sampled.fasta', temp_dir)

    # Load the fasta files
    fasta_file = open(temp_dir + '/splits/sampled.fasta')
    
    # Load FASTA file using Biopython
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):

        sequences.append([str(record.seq), record.description])

    # Add to dataframe
    df = pd.concat([df, pd.DataFrame(sequences, columns=[ "label", "dataset"])], axis=1)  

df.head()

Unnamed: 0,name,sequence,mask,label,dataset
0,1es5-A,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...,0011111111111111111111111111111111111111111111...,CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHH...,1es5-A SET=train VALIDATION=False
1,2a6h-E,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...,0111111111111111111111111111111111111111111111...,CCCCCHHHHHHHCCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCC...,2a6h-E SET=train VALIDATION=False
2,5b1a-P,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...,0011111111111111111111111111111111111111111111...,CCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHH...,5b1a-P SET=train VALIDATION=False
3,5ehi-C,GTGSQGETLGEKWKKKLNQLSRKEFDLYKKSGITEVDRTEAKEGLK...,0000001111111111111111111111111111111111111111...,CCCCCCCCHHHHHHHHHHCCCHHHHHHHHHCCCEEEECHHHHHHHC...,5ehi-C SET=train VALIDATION=False
4,5egf-A,HHHHHHAVAKDSTESKSWEPFSLSPIKDPQALHAALCSKNVIPVTS...,0000000000000000011111111111111111111111111111...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHCCCCCCCC...,5egf-A SET=train VALIDATION=False


In [106]:
# Get data split information
df["validation"]=df.dataset.str.split("=").str[2]
df["test"]=df.dataset.apply(lambda s: s.split("=")[1].split(" ")[0]=='test')

# str to bool
df['validation'] = df['validation'].apply(lambda x: x == 'True')

# Extract data split information
df["dataset"]=df.dataset.str.split("=").str[1]
df["dataset"]=df.dataset.str.split(" ").str[0]

# Preprocess mask and label to lists
# C is class 0, E is class 1, H is class 2
df['label'] = df['label'].str.replace("C","0")
df['label'] = df['label'].str.replace("E","1")
df['label'] = df['label'].str.replace("H","2")

# str to integer
df['label'] = df['label'].apply(lambda x: [int(i) for i in x])
df['mask'] = df['mask'].apply(lambda x: [int(i) for i in x])

# Set masked positions to -100 in label for loss calculation
df['label'] = df.apply(lambda row: [-100 if m == 0 else l for l, m in zip(row['label'], row['mask'])], axis=1)

# Split the data into train, validation, and test sets
ss_test_df = df[df.test==True].reset_index(drop=True)[["sequence","label"]]
ss_train_df = df[(df.test==False) & (df.validation==False)].reset_index(drop=True)[["sequence","label"]]
ss_valid_df = df[(df.test==False) & (df.validation==True)].reset_index(drop=True)[["sequence","label"]]


In [107]:
# Print dataset info
print("train dataset size", ss_train_df.size)
print("validation dataset size:", ss_valid_df.size)
print("test dataset size:", ss_test_df.size)

# Show examples
ss_train_df.head(3)

train dataset size 19424
validation dataset size: 2160
test dataset size: 728


Unnamed: 0,sequence,label
0,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...,"[-100, -100, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ..."
1,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...,"[-100, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, ..."
2,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...,"[-100, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [108]:
# Save to csv files
os.makedirs("training_data/ss", exist_ok=True)
ss_train_df.to_csv("training_data/ss/ss_token_classification_train.csv",index=False)
ss_valid_df.to_csv("training_data/ss/ss_token_classification_validation.csv",index=False)
ss_test_df.to_csv("training_data/ss/ss_token_classification_test.csv",index=False)