In [2]:
import re
import pandas as pd
#from sklearn.model_selection import train_test_split

# Load the CSV file into a DataFrame
short = pd.read_csv('../high-charge-density22.csv')
short['Class'] = 1

pos = short[
    (short['Sequence'].str.len() > 6) &
    (short['Sequence'].str.len() < 40)
]

pos = pos.sample(n=min(4526, len(pos)), random_state=40)
pos

Unnamed: 0,Sequence,Class
1602,FWRRYKKVKKYRRWF,1
3413,VDKPPYLPRVRPPRRIYNR,1
596,WWKRWKRIRRIFMMV,1
913,LLCKLKCKLKL,1
2492,KLKAGLAKWKAGLAKLKAGLA,1
...,...,...
575,VQLRCRVCVIRK,1
1509,WRLWRLWRLWRLWRL,1
1381,LARFVLRILKYGFK,1
3620,VAKKLAKLAKKLAKLAL,1


In [3]:
import re
import pandas as pd
#from sklearn.model_selection import train_test_split

# Load the CSV file into a DataFrame
short = pd.read_csv('../neg-high-charge-density22.csv')
short['Class'] = 0
neg = short[
    (short['Sequence'].str.len() > 6) &
    (short['Sequence'].str.len() < 40)
]
neg
# Display the result
neg

neg

Unnamed: 0,Sequence,Class
0,EREREKRFSFFKKNK,0
1,GRKRRQTSLTDFYHSKRRLVFCKRKP,0
2,ARRRRRHASTKLKRRRRRRRHGKKSHK,0
3,ARRRRRSSRPQRRRRRRRHGRRRRGRR,0
4,PRRRTRRASRPVRRRRPRRVSRRRRARRRR,0
...,...,...
4523,KSLSTGKSKSFFVRQTNKS,0
4524,KSKSFFVRQTKKS,0
4525,KSLSTGKSKSFFVRQTKKS,0
4526,FSSEKLKARKEKKSRKQAPY,0


In [4]:
concatenated_df = pd.concat([pos,neg], ignore_index=True)
concatenated_df = concatenated_df.drop_duplicates(subset='Sequence', keep=False)
concatenated_df

Unnamed: 0,Sequence,Class
0,FWRRYKKVKKYRRWF,1
1,VDKPPYLPRVRPPRRIYNR,1
2,WWKRWKRIRRIFMMV,1
3,LLCKLKCKLKL,1
4,KLKAGLAKWKAGLAKLKAGLA,1
...,...,...
9047,KSLSTGKSKSFFVRQTNKS,0
9048,KSKSFFVRQTKKS,0
9049,KSLSTGKSKSFFVRQTKKS,0
9050,FSSEKLKARKEKKSRKQAPY,0


In [5]:
import re
# Clean comma-separated sequences (take the first part)
concatenated_df['Sequence'] = concatenated_df['Sequence'].str.split(',').str[0]

# Remove spaces
concatenated_df['Sequence'] = concatenated_df['Sequence'].str.replace(' ', '', regex=False)

# Remove invalid amino acids
concatenated_df['Sequence'] = concatenated_df['Sequence'].apply(lambda x: re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', x))

#from sklearn.model_selection import train_test_split

# Remove sequences with issues
concatenated_df =concatenated_df[~concatenated_df['Sequence'].str.contains('[, ZBOUXJ ]')]

concatenated_df

Unnamed: 0,Sequence,Class
0,FWRRYKKVKKYRRWF,1
1,VDKPPYLPRVRPPRRIYNR,1
2,WWKRWKRIRRIFMMV,1
3,LLCKLKCKLKL,1
4,KLKAGLAKWKAGLAKLKAGLA,1
...,...,...
9047,KSLSTGKSKSFFVRQTNKS,0
9048,KSKSFFVRQTKKS,0
9049,KSLSTGKSKSFFVRQTKKS,0
9050,FSSEKLKARKEKKSRKQAPY,0


In [6]:
concatenated_df = concatenated_df.reset_index(drop=True)
concatenated_df

Unnamed: 0,Sequence,Class
0,FWRRYKKVKKYRRWF,1
1,VDKPPYLPRVRPPRRIYNR,1
2,WWKRWKRIRRIFMMV,1
3,LLCKLKCKLKL,1
4,KLKAGLAKWKAGLAKLKAGLA,1
...,...,...
9041,KSLSTGKSKSFFVRQTNKS,0
9042,KSKSFFVRQTKKS,0
9043,KSLSTGKSKSFFVRQTKKS,0
9044,FSSEKLKARKEKKSRKQAPY,0


In [None]:
import requests
import os
import time
import pandas as pd

# Configuration
OUTPUT_DIR = "hd/"
SLEEP_TIME = 5  # Seconds between requests
MAX_RETRIES = 3  # Retry attempts for failed requests
START_INDEX = 0  # <-- Change here to resume

# Create the output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_pdb_from_sequence(seq, out_file_path):
    """Get PDB structure from ESM Atlas API for a given amino acid sequence"""
    url = "https://api.esmatlas.com/foldSequence/v1/pdb/"
    headers = {
        "Content-Type": "text/plain"
    }

    # Sanitize sequence
    clean_seq = ''.join(filter(str.isalpha, seq)).upper()

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.post(url, data=clean_seq, headers=headers, timeout=30)
            if response.ok:
                with open(out_file_path, "w") as f:
                    f.write(response.text)
                return True
            else:
                print(f"Attempt {attempt + 1} failed for {clean_seq[:10]}... "
                      f"Status: {response.status_code}, Response: {response.text[:100]}")
                if response.status_code == 429:
                    time.sleep(SLEEP_TIME * (attempt + 2))  # Exponential backoff
        except Exception as e:
            print(f"Attempt {attempt + 1} error: {str(e)}")
            time.sleep(SLEEP_TIME)

    return False

# Initialize PDB_File column if missing
if 'PDB_File' not in concatenated_df.columns:
    concatenated_df['PDB_File'] = [None] * len(concatenated_df)

# Process each sequence from START_INDEX onward
for idx in range(START_INDEX, len(concatenated_df)):
    seq = concatenated_df.at[idx, 'Sequence']
    filename = f"peptide_{idx}.pdb"
    file_path = os.path.join(OUTPUT_DIR, filename)

    print(f"Processing sequence {idx + 1}/{len(concatenated_df)}: {seq[:15]}...")

 )
    if os.path.exists(file_path):
        print(f"Already exists: {file_path}")
        concatenated_df.at[idx, 'PDB_File'] = file_path
        continue

    success = get_pdb_from_sequence(seq, file_path)
    if success:
        print(f"Successfully saved: {file_path}")
        concatenated_df.at[idx, 'PDB_File'] = file_path
    else:
        print(f"Failed to get PDB for index {idx}")
        concatenated_df.at[idx, 'PDB_File'] = None

    time.sleep(SLEEP_TIME)




In [7]:
import os

OUTPUT_DIR = "../hd//"
existing_files = os.listdir(OUTPUT_DIR)

for file in existing_files:
    if file.endswith(".pdb") and file.startswith("peptide_"):
        try:
            idx = int(file.replace("peptide_", "").replace(".pdb", ""))
            file_path = os.path.join(OUTPUT_DIR, file)
            if 0 <= idx < len(concatenated_df):
                concatenated_df.at[idx, 'PDB_File'] = file_path
        except ValueError:
            continue  # In case there's a filename that doesn't match the pattern


concatenated_df.to_csv("peptides_with_pdb_paths_fixed.csv", index=False)
print("✅ PDB_File column updated from existing files.")
concatenated_df

✅ PDB_File column updated from existing files.


Unnamed: 0,Sequence,Class,PDB_File
0,FWRRYKKVKKYRRWF,1,../hd//peptide_0.pdb
1,VDKPPYLPRVRPPRRIYNR,1,../hd//peptide_1.pdb
2,WWKRWKRIRRIFMMV,1,../hd//peptide_2.pdb
3,LLCKLKCKLKL,1,../hd//peptide_3.pdb
4,KLKAGLAKWKAGLAKLKAGLA,1,../hd//peptide_4.pdb
...,...,...,...
9041,KSLSTGKSKSFFVRQTNKS,0,../hd//peptide_9041.pdb
9042,KSKSFFVRQTKKS,0,../hd//peptide_9042.pdb
9043,KSLSTGKSKSFFVRQTKKS,0,../hd//peptide_9043.pdb
9044,FSSEKLKARKEKKSRKQAPY,0,../hd//peptide_9044.pdb


In [8]:
from Bio.PDB import PDBParser, DSSP
import pandas as pd
import numpy as np
import os

# Path to folder containing PDB files
pdb_directory = "../hd/./"  # Adjust if needed

# Initialize DSSP feature columns
dssp_columns = ['Mean_RSA', 'Mean_Phi', 'Mean_Psi']
for col in dssp_columns:
    concatenated_df[col] = np.nan

parser = PDBParser(QUIET=True)

# Loop to calculate RSA, Phi, Psi features
for idx in concatenated_df.index:
    pdb_filename = f"peptide_{idx}.pdb"
    pdb_path = os.path.join(pdb_directory, pdb_filename)

    if not os.path.exists(pdb_path):
        print(f"File not found: {pdb_path}")
        continue

    try:
        structure = parser.get_structure(f"pep_{idx}", pdb_path)
        model = structure[0]
        dssp = DSSP(model, pdb_path)

        rsa_list = []
        phi_list = []
        psi_list = []

        for res in dssp:
            rsa = res[3]
            phi = res[4]
            psi = res[5]

            rsa_list.append(rsa)
            phi_list.append(phi)
            psi_list.append(psi)

        # Compute means
        concatenated_df.at[idx, 'Mean_RSA'] = np.mean(rsa_list) if rsa_list else np.nan
        concatenated_df.at[idx, 'Mean_Phi'] = np.mean(phi_list) if phi_list else np.nan
        concatenated_df.at[idx, 'Mean_Psi'] = np.mean(psi_list) if psi_list else np.nan

    except Exception as e:
        print(f"Error processing index {idx} - file: {pdb_path}: {e}")


In [9]:
from Bio.PDB import PDBParser, DSSP
import os

# Columns to store DSSP secondary structure percentages
helices = []
sheets = []
coils = []

parser = PDBParser()

for idx in concatenated_df.index:
    pdb_file = f"../hd/peptide_{idx}.pdb"
    try:
        structure = parser.get_structure(f"pep_{idx}", pdb_file)
        model = structure[0]
        dssp = DSSP(model, pdb_file)
        
        ss_list = [dssp[key][2] for key in dssp.keys()]
        total = len(ss_list)
        
        helix_count = sum(s in "HGI" for s in ss_list)
        sheet_count = sum(s in "BE" for s in ss_list)
        coil_count  = sum(s in "ST " for s in ss_list)  # includes turn, bend, and loops
        
        helices.append(helix_count / total * 100)
        sheets.append(sheet_count / total * 100)
        coils.append(coil_count / total * 100)
    except Exception as e:
        print(f"Failed to process {pdb_file}: {e}")
        helices.append(None)
        sheets.append(None)
        coils.append(None)

# Add DSSP % columns to DataFrame
concatenated_df['%Helix'] = helices
concatenated_df['%Sheet'] = sheets
concatenated_df['%Coil'] = coils


In [10]:
import mdtraj as md
import pandas as pd
import numpy as np
import os

# Folder containing your PDBs
pdb_folder = '../hd/'

# Initialize columns
concatenated_df['RoG'] = np.nan
concatenated_df['SASA'] = np.nan
concatenated_df['Compactness'] = np.nan

# Loop through each peptide
for idx in concatenated_df.index:
    pdb_path = os.path.join(pdb_folder, f"peptide_{idx}.pdb")

    if not os.path.exists(pdb_path):
        print(f"Missing: {pdb_path}")
        continue

    try:
        traj = md.load(pdb_path)
        
        # Radius of Gyration (RoG)
        rog = md.compute_rg(traj)[0]
        concatenated_df.at[idx, 'RoG'] = rog

        # Solvent Accessible Surface Area (SASA)
        sasa = md.shrake_rupley(traj)[0].sum()
        concatenated_df.at[idx, 'SASA'] = sasa

        # Compactness = Mass / SASA (or optionally 1/SASA per atom)
        n_atoms = traj.n_atoms
        compactness = n_atoms / sasa if sasa > 0 else np.nan
        concatenated_df.at[idx, 'Compactness'] = compactness

    except Exception as e:
        print(f"Error at index {idx}: {e}")




In [12]:
cleaned_df = concatenated_df
cleaned_df

Unnamed: 0,Sequence,Class,PDB_File,Mean_RSA,Mean_Phi,Mean_Psi,%Helix,%Sheet,%Coil,RoG,SASA,Compactness
0,FWRRYKKVKKYRRWF,1,../hd//peptide_0.pdb,0.859582,-52.913333,47.360000,46.666667,0.0,20.000000,1.094001,27.651440,5.822482
1,VDKPPYLPRVRPPRRIYNR,1,../hd//peptide_1.pdb,0.865338,-25.389474,113.342105,0.000000,0.0,0.000000,1.451317,30.585161,5.525555
2,WWKRWKRIRRIFMMV,1,../hd//peptide_2.pdb,0.716046,-42.386667,-8.420000,80.000000,0.0,6.666667,0.808917,22.301994,6.905212
3,LLCKLKCKLKL,1,../hd//peptide_3.pdb,0.829469,-41.363636,32.109091,45.454545,0.0,18.181818,0.697707,16.447903,5.350226
4,KLKAGLAKWKAGLAKLKAGLA,1,../hd//peptide_4.pdb,0.702707,-41.447619,-23.457143,90.476190,0.0,0.000000,0.989981,22.909313,6.547556
...,...,...,...,...,...,...,...,...,...,...,...,...
9041,KSLSTGKSKSFFVRQTNKS,0,../hd//peptide_9041.pdb,0.903197,-33.236842,107.900000,0.000000,0.0,0.000000,1.584088,28.541864,5.220402
9042,KSKSFFVRQTKKS,0,../hd//peptide_9042.pdb,0.899669,-17.700000,103.261538,0.000000,0.0,0.000000,1.071776,20.900116,5.263129
9043,KSLSTGKSKSFFVRQTKKS,0,../hd//peptide_9043.pdb,0.887751,-26.436842,104.094737,0.000000,0.0,0.000000,1.553086,28.279654,5.304167
9044,FSSEKLKARKEKKSRKQAPY,0,../hd//peptide_9044.pdb,0.701268,-43.470000,14.855000,65.000000,0.0,15.000000,1.004550,25.511311,6.624513


In [13]:
import pandas as pd
import peptides



# Define the peptide descriptors you want to calculate
descriptors = [
    'frequencies',
    'aliphatic_index',
    'boman',
    'charge',
    'isoelectric_point',
    'hydrophobic_moment',
    'hydrophobicity',
    'instability_index',
    'mass_shift',
    'molecular_weight',
    'mz',
    'structural_class'
]

# Create a dictionary to store descriptor values
descriptor_values_dict = {}

# Iterate through each descriptor and calculate values for each sequence
for descriptor in descriptors:
    descriptor_values = []
    for sequence in cleaned_df['Sequence']:
        peptide = peptides.Peptide(sequence)
        value = getattr(peptide, descriptor)()
        descriptor_values.append(value)
    descriptor_values_dict[descriptor] = descriptor_values

# Create a new DataFrame using the descriptor values dictionary
descriptors = pd.DataFrame(descriptor_values_dict)

In [14]:
import pandas as pd


# containing dictionaries of amino acid frequencies

# Expand 'frequencies' column into separate columns
amino_acid_frequencies = pd.DataFrame(descriptors['frequencies'].tolist()).fillna(0)

# Combine the expanded DataFrame with the original DataFrame
descriptors_df = pd.concat([descriptors, amino_acid_frequencies], axis=1)

# Drop the original 'frequencies' column
descriptors_df.drop(columns=['frequencies'], inplace=True)

# Display the resulting DataFrame
descriptors_df


Unnamed: 0,aliphatic_index,boman,charge,isoelectric_point,hydrophobic_moment,hydrophobicity,instability_index,mass_shift,molecular_weight,mz,...,T,W,Y,V,O,U,B,Z,J,X
0,19.333333,4.500000,7.995090,12.130605,0.630528,-1.880000,100.013333,48.161032,2247.72584,1124.149951,...,0.000000,0.133333,0.133333,0.066667,0.0,0.0,0.0,0.0,0.0,0.0
1,71.578947,4.098421,4.996418,11.947735,0.606835,-1.436842,55.884211,36.120774,2392.83874,1196.695446,...,0.000000,0.000000,0.105263,0.105263,0.0,0.0,0.0,0.0,0.0,0.0
2,71.333333,2.815333,5.997381,12.984698,0.750688,-0.580000,127.253333,36.120774,2192.76624,1096.621661,...,0.000000,0.200000,0.000000,0.066667,0.0,0.0,0.0,0.0,0.0,0.0
3,177.272727,-0.450909,3.872855,10.348813,0.279752,0.763636,-29.500000,24.080516,1302.78624,708.943280,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,121.428571,-0.348095,5.996214,11.499626,0.357415,0.204762,-6.647619,36.120774,2136.69854,1068.690761,...,0.000000,0.047619,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9041,35.789474,2.925263,4.996801,11.997381,0.345357,-1.015789,15.226316,30.100645,2130.43194,1065.586916,...,0.105263,0.000000,0.000000,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
9042,22.307692,3.494615,4.996801,11.997381,0.390433,-1.300000,24.046154,30.100645,1570.85534,785.956821,...,0.076923,0.000000,0.000000,0.076923,0.0,0.0,0.0,0.0,0.0,0.0
9043,35.789474,2.867895,5.996506,12.054553,0.390433,-1.036842,10.673684,36.120774,2144.50224,1072.612931,...,0.105263,0.000000,0.000000,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
9044,29.500000,4.056000,5.998907,11.106017,0.396581,-1.900000,51.370000,48.161032,2409.81744,1205.189861,...,0.000000,0.000000,0.050000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem


# Create a new DataFrame to store the results
d4 = pd.DataFrame()

def calculate_tpsa(peptide_sequence):
    # Create a molecule object from the peptide sequence
    peptide_molecule = Chem.MolFromSequence(peptide_sequence)

    if peptide_molecule is not None:
        # Calculate TPSA using AllChem method
        tpsa = AllChem.CalcTPSA(peptide_molecule)

        return tpsa
    else:
        print(f"Invalid peptide sequence: {peptide_sequence}")
        return None

# Apply the function to the 'Sequence' column and create a new 'TPSA' column
#new_pos['Sequence'] = cleaned_df['Sequence']
d4['TPSA'] = cleaned_df['Sequence'].apply(calculate_tpsa)

# Display the new DataFrame with TPSA values
d4

Unnamed: 0,TPSA
0,894.44
1,999.54
2,817.73
3,458.40
4,817.23
...,...
9041,980.89
9042,702.51
9043,963.82
9044,1085.96


In [16]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Crippen, Lipinski, Descriptors


# Create a new DataFrame to store the results
d6 = pd.DataFrame()

def calculate_descriptors(peptide_sequence):
    # Create a molecule object from the peptide sequence
    peptide_molecule = Chem.MolFromSequence(peptide_sequence)

    if peptide_molecule is not None:
        # Calculate various molecular descriptors
        heavy_atom_count = Descriptors.HeavyAtomCount(peptide_molecule)
        logp = Crippen.MolLogP(peptide_molecule)
        mol_logp = Crippen.MolLogP(peptide_molecule)
        # MolLogS calculation is not directly available in RDKit, consider using other methods/tools
        fraction_csp3 = Lipinski.FractionCSP3(peptide_molecule)
        bertz_ct = Descriptors.BertzCT(peptide_molecule)

        return heavy_atom_count, logp, mol_logp, fraction_csp3, bertz_ct
    else:
        print(f"Invalid peptide sequence: {peptide_sequence}")
        return None

# Apply the function to the 'Sequence' column and create new columns for descriptors
#new_pos['Sequence'] = cleaned_df['Sequence']
d6[['HeavyAtomCount', 'LogP', 'MolLogP', 'FractionCSP3', 'BertzCT']] = cleaned_df['Sequence'].apply(calculate_descriptors).apply(pd.Series)

# Display the new DataFrame with descriptor values
d6

Unnamed: 0,HeavyAtomCount,LogP,MolLogP,FractionCSP3,BertzCT
0,162.0,-1.75942,-1.75942,0.468468,6167.457349
1,170.0,-7.24805,-7.24805,0.651376,5790.862434
2,155.0,0.16568,0.16568,0.533333,5810.258002
3,89.0,-0.14760,-0.14760,0.816667,2181.007600
4,151.0,-4.16360,-4.16360,0.712871,4692.193883
...,...,...,...,...,...
9041,150.0,-14.41313,-14.41313,0.634409,4705.308666
9042,111.0,-8.24023,-8.24023,0.619718,3290.838106
9043,151.0,-13.15953,-13.15953,0.652632,4650.711363
9044,170.0,-11.33486,-11.33486,0.654206,5415.940777


In [17]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Lipinski, Descriptors


# Create a new DataFrame to store the results
d7 = pd.DataFrame()

def calculate_polarity(peptide_sequence):
    # Create a molecule object from the peptide sequence
    peptide_molecule = Chem.MolFromSequence(peptide_sequence)

    if peptide_molecule is not None:
        # Calculate descriptors related to polarity
        num_h_acceptors = Lipinski.NumHAcceptors(peptide_molecule)
        num_h_donors = Lipinski.NumHDonors(peptide_molecule)
        #polar_surface_area = Descriptors.TPSA(peptide_molecule)

        return num_h_acceptors, num_h_donors
    else:
        print(f"Invalid peptide sequence: {peptide_sequence}")
        return None

# Apply the function to the 'Sequence' column and create new columns for polarity descriptors
#new_pos['Sequence'] = cleaned_df['Sequence']
d7[['NumHAcceptors', 'NumHDonors',]] = cleaned_df['Sequence'].apply(calculate_polarity).apply(pd.Series)

# Display the new DataFrame with polarity descriptors
d7

Unnamed: 0,NumHAcceptors,NumHDonors
0,26,36
1,30,35
2,24,33
3,18,18
4,28,29
...,...,...
9041,34,36
9042,24,26
9043,34,36
9044,36,39


In [18]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Lipinski, Descriptors


df8 = pd.DataFrame()

def calculate_amino_acid_properties(peptide_sequence):
    ng_count = peptide_sequence.count('C(N)N')
    npa_count = peptide_sequence.count('C(N)C')
    nncaa_count = peptide_sequence.count('D') + peptide_sequence.count('E')

    return ng_count, npa_count, nncaa_count
# Apply the function to the 'Sequence' column and create new columns for amino acid properties
df8[['NG', 'NPA', 'NNCAA']] = cleaned_df['Sequence'].apply(calculate_amino_acid_properties).apply(pd.Series)

# Display the DataFrame with amino acid properties
df8

Unnamed: 0,NG,NPA,NNCAA
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
9041,0,0,0
9042,0,0,0
9043,0,0,0
9044,0,0,2


In [19]:
cleaned_df.reset_index(drop=True, inplace=True)
descriptors_df.reset_index(drop=True, inplace=True)
d4.reset_index(drop=True, inplace=True)
d6.reset_index(drop=True, inplace=True)
d7.reset_index(drop=True, inplace=True)
df8.reset_index(drop=True, inplace=True)

In [20]:
ll = pd.concat([cleaned_df,descriptors_df,d4,d6,d7,df8], axis=1)
ll = ll.fillna(0)
ll

Unnamed: 0,Sequence,Class,PDB_File,Mean_RSA,Mean_Phi,Mean_Psi,%Helix,%Sheet,%Coil,RoG,...,HeavyAtomCount,LogP,MolLogP,FractionCSP3,BertzCT,NumHAcceptors,NumHDonors,NG,NPA,NNCAA
0,FWRRYKKVKKYRRWF,1,../hd//peptide_0.pdb,0.859582,-52.913333,47.360000,46.666667,0.0,20.000000,1.094001,...,162.0,-1.75942,-1.75942,0.468468,6167.457349,26,36,0,0,0
1,VDKPPYLPRVRPPRRIYNR,1,../hd//peptide_1.pdb,0.865338,-25.389474,113.342105,0.000000,0.0,0.000000,1.451317,...,170.0,-7.24805,-7.24805,0.651376,5790.862434,30,35,0,0,1
2,WWKRWKRIRRIFMMV,1,../hd//peptide_2.pdb,0.716046,-42.386667,-8.420000,80.000000,0.0,6.666667,0.808917,...,155.0,0.16568,0.16568,0.533333,5810.258002,24,33,0,0,0
3,LLCKLKCKLKL,1,../hd//peptide_3.pdb,0.829469,-41.363636,32.109091,45.454545,0.0,18.181818,0.697707,...,89.0,-0.14760,-0.14760,0.816667,2181.007600,18,18,0,0,0
4,KLKAGLAKWKAGLAKLKAGLA,1,../hd//peptide_4.pdb,0.702707,-41.447619,-23.457143,90.476190,0.0,0.000000,0.989981,...,151.0,-4.16360,-4.16360,0.712871,4692.193883,28,29,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9041,KSLSTGKSKSFFVRQTNKS,0,../hd//peptide_9041.pdb,0.903197,-33.236842,107.900000,0.000000,0.0,0.000000,1.584088,...,150.0,-14.41313,-14.41313,0.634409,4705.308666,34,36,0,0,0
9042,KSKSFFVRQTKKS,0,../hd//peptide_9042.pdb,0.899669,-17.700000,103.261538,0.000000,0.0,0.000000,1.071776,...,111.0,-8.24023,-8.24023,0.619718,3290.838106,24,26,0,0,0
9043,KSLSTGKSKSFFVRQTKKS,0,../hd//peptide_9043.pdb,0.887751,-26.436842,104.094737,0.000000,0.0,0.000000,1.553086,...,151.0,-13.15953,-13.15953,0.652632,4650.711363,34,36,0,0,0
9044,FSSEKLKARKEKKSRKQAPY,0,../hd//peptide_9044.pdb,0.701268,-43.470000,14.855000,65.000000,0.0,15.000000,1.004550,...,170.0,-11.33486,-11.33486,0.654206,5415.940777,36,39,0,0,2


In [21]:
categorical_df = ll.copy()

# Convert the "structural_class" column to categorical
categorical_df['structural_class'] = categorical_df['structural_class'].astype('category')

# Assign categorical codes to the categories
categorical_df['structural_class'] = categorical_df['structural_class'].cat.codes


In [22]:

# Then drop unnecessary columns
X = categorical_df[[ 'hydrophobic_moment','LogP','W']]
X = X.fillna(0)
y = categorical_df['Class']
y=y.astype('int')

X.columns

Index(['hydrophobic_moment', 'LogP', 'W'], dtype='object')

In [23]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


scaler = MinMaxScaler()

# Apply Min-Max Normalization to all columns in X
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Display the normalized DataFrame
print("Normalized DataFrame (Min-Max Scaling):")
X_normalized

Normalized DataFrame (Min-Max Scaling):


Unnamed: 0,hydrophobic_moment,LogP,W
0,0.431705,0.802631,0.186667
1,0.415306,0.683278,0.000000
2,0.514877,0.844493,0.280000
3,0.188908,0.837681,0.000000
4,0.242664,0.750351,0.066667
...,...,...,...
9041,0.234318,0.527469,0.000000
9042,0.265519,0.661702,0.000000
9043,0.265519,0.554729,0.000000
9044,0.269774,0.594408,0.000000


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [25]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Create the XGBoost Classifier
xgb_classifier = XGBClassifier(n_estimators=60, learning_rate=0.2, max_depth=2,random_state=42)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=10)

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Fit the model on the entire training set
xgb_classifier.fit(X_train, y_train)

# Predictions on the training set
y_train_pred = xgb_classifier.predict(X_train)

# Calculate accuracy on the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)

# Predictions on the test set
y_test_pred = xgb_classifier.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Calculate precision on the test set
precision = precision_score(y_test, y_test_pred)
print("Precision:", precision)

# Calculate sensitivity (recall) on the test set
recall = recall_score(y_test, y_test_pred)
print("Sensitivity (Recall):", recall)

# Calculate ROC AUC score on the test set
roc_auc = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:,1])
print("ROC AUC:", roc_auc)





Cross-Validation Scores: [0.83425414 0.81767956 0.8480663  0.83287293 0.8660221  0.8660221
 0.86445367 0.86307054 0.82295989 0.82849239]
Mean CV Accuracy: 0.8443893613932127
Train Accuracy: 0.8506080707573245
Test Accuracy: 0.8414364640883978
Precision: 0.8745519713261649
Sensitivity (Recall): 0.8008752735229759
ROC AUC: 0.9175841083150985


In [26]:
# Then drop unnecessary columns
X = categorical_df[[ 'hydrophobic_moment','LogP','W','S','E']]
X = X.fillna(0)
y = categorical_df['Class']
y=y.astype('int')

X.columns

Index(['hydrophobic_moment', 'LogP', 'W', 'S', 'E'], dtype='object')

In [27]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


scaler = MinMaxScaler()

# Apply Min-Max Normalization to all columns in X
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Display the normalized DataFrame
print("Normalized DataFrame (Min-Max Scaling):")
X_normalized

Normalized DataFrame (Min-Max Scaling):


Unnamed: 0,hydrophobic_moment,LogP,W,S,E
0,0.431705,0.802631,0.186667,0.000000,0.000000
1,0.415306,0.683278,0.000000,0.000000,0.000000
2,0.514877,0.844493,0.280000,0.000000,0.000000
3,0.188908,0.837681,0.000000,0.000000,0.000000
4,0.242664,0.750351,0.066667,0.000000,0.000000
...,...,...,...,...,...
9041,0.234318,0.527469,0.000000,0.526316,0.000000
9042,0.265519,0.661702,0.000000,0.461538,0.000000
9043,0.265519,0.554729,0.000000,0.526316,0.000000
9044,0.269774,0.594408,0.000000,0.300000,0.328571


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [29]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Create the XGBoost Classifier
xgb_classifier = XGBClassifier(n_estimators=60, learning_rate=0.2, max_depth=2,random_state=42)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=10)

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Fit the model on the entire training set
xgb_classifier.fit(X_train, y_train)

# Predictions on the training set
y_train_pred = xgb_classifier.predict(X_train)

# Calculate accuracy on the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)

# Predictions on the test set
y_test_pred = xgb_classifier.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Calculate precision on the test set
precision = precision_score(y_test, y_test_pred)
print("Precision:", precision)

# Calculate sensitivity (recall) on the test set
recall = recall_score(y_test, y_test_pred)
print("Sensitivity (Recall):", recall)

# Calculate ROC AUC score on the test set
roc_auc = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:,1])
print("ROC AUC:", roc_auc)





Cross-Validation Scores: [0.84530387 0.85359116 0.86740331 0.85911602 0.87707182 0.86325967
 0.87136929 0.89073306 0.8450899  0.84094053]
Mean CV Accuracy: 0.8613878636436578
Train Accuracy: 0.8682974018794914
Test Accuracy: 0.8629834254143647
Precision: 0.8784090909090909
Sensitivity (Recall): 0.8457330415754923
ROC AUC: 0.9347031054626446
