In [6]:
import pandas as pd
import numpy as np
from pathlib import Path
import peptides
from pandarallel import pandarallel
import argparse

# Set up pandarallel before any parallel_* call (parallel_apply is used further down);
# progress_bar=True asks it to display progress while the workers run.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
# Names of peptides.Peptide methods that compute_peptides calls with no arguments.
# "descriptors" and "frequencies" return mappings whose items are merged into the
# feature dict; every other entry is stored under its own name.
numeric_features = [
    "aliphatic_index",
    "boman",
    "charge",
    "descriptors",  # contains all descriptors, so physical_descriptors is not needed
    "frequencies",
    "hydrophobic_moment",
    "hydrophobicity",
    "instability_index",
    "isoelectric_point",
    "mass_shift",
    # 'membrane_position_profile',  # May be useful, but may need to be parameterized ex: T, S
    "molecular_weight",
    "mz",
    #  'structural_class',  # May be useful, is the predicted structural class ex: alpha
]

# Names of peptides.Peptide profile methods; compute_peptides only evaluates
# these when it is called with vector=True.
vector_features = [
    "hydrophobic_moment_profile",
    "hydrophobicity_profile",
    "linker_preference_profile",
]

In [4]:
def compute_peptides(seq: str, vector: bool = False):
    """
    Get descriptors computed by the peptides package for input sequence.

    Parameters:
        seq (str): The protein sequence.
        vector (bool): Whether vector descriptors should be returned. Default False.

    Returns:
        dict: One entry per name in ``numeric_features`` (the items of
        ``descriptors`` are merged in directly and the items of ``frequencies``
        are merged in with a ``_frequency`` suffix on each key), followed by one
        list per name in ``vector_features`` when ``vector`` is True, and finally
        the input sequence under the ``"sequence"`` key.
    """
    pep = peptides.Peptide(seq)
    features = {}

    for name in numeric_features:
        value = getattr(pep, name)()
        if name == "descriptors":
            # Mapping of descriptor name -> value; flatten into the feature dict.
            features.update(value)
        elif name == "frequencies":
            # Suffix the keys so they cannot be confused with other feature names.
            features.update({k + "_frequency": v for k, v in value.items()})
        else:
            features[name] = value

    if vector:
        # Convert every profile to a plain list, driven by vector_features itself
        # so that no profile (e.g. linker_preference_profile) is left unconverted
        # and the conversion stays in sync if the list of names changes.
        for name in vector_features:
            features[name] = list(getattr(pep, name)())

    features["sequence"] = seq
    return features

In [7]:
# Source table and destination for the computed descriptors
csv_file_path = "../source_data/adaptyv_biolm_egfr_data.csv"
out_file_path = "peptides.csv"
print(f'Input CSV file: {csv_file_path}')

# Read the source table; the sequences are expected in the "sequence" column
df = pd.read_csv(csv_file_path)
# Run compute_peptides on each sequence in parallel (one dict per row) and
# assemble the resulting dicts into a single feature table
peptides_features = pd.DataFrame(
    df["sequence"].parallel_apply(compute_peptides).tolist()
)
# Write the feature table to disk without the index column
peptides_features.to_csv(out_file_path, index=False)
print(f'Peptides saved to file: {out_file_path}')

Input CSV file: ../source_data/adaptyv_biolm_egfr_data.csv


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=35), Label(value='0 / 35'))), HBox…

Peptides saved to file: peptides.csv
