Import the needed libraries

In [1]:
import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import AlignInfo
from concurrent.futures import ThreadPoolExecutor
from DataPreparation.DataOperations.LabelExtractionOperations import extract_experimental
from DataPreparation.DataOperations.FeatureExtractionOperations import calculate_a_acid_composition, calculate_hydrophobicity, calculate_polarity, calculate_mw, calculate_pI

Define the unaligned dataframe that will have the calculated feature values

In [2]:
unaligned_data = {"ID": [], 
                  "Unaligned Sequence": [], 
                  'A': [], 'R': [], 'N': [], 'D': [],
                  'C': [], 'E': [], 'Q': [], 'G': [],
                  'H': [], 'I': [], 'L': [], 'K': [],
                  'M': [], 'F': [], 'P': [], 'S': [],
                  'T': [], 'W': [], 'Y': [], 'V': [],
                  "Hydrophobicity (Kyte-Doolittle Scale)": [],
                  "Net Charge at pH 7.0 (Neutral)": [],
                  "Net Charge at pH 3.0 (Acidic)": [],
                  "Net Charge at pH 11.0 (Basic)": [],
                  "Isoelectric Point": [],
                  "Molecular Weight": [],
                  "Sequence Length": []} 

for seq_record in SeqIO.parse("FASTA Data/Sequences.fasta", "fasta"):
    
    unaligned_data["ID"].append(seq_record.id)
    unaligned_data["Unaligned Sequence"].append(str(seq_record.seq))
    
    aa_composition = calculate_a_acid_composition(str(seq_record.seq))
    for amino_acid, percent in aa_composition.items():
        unaligned_data[amino_acid].append(percent)
    
    hydrophobicity_values = calculate_hydrophobicity(str(seq_record.seq))
    unaligned_data["Hydrophobicity (Kyte-Doolittle Scale)"].append(hydrophobicity_values)
    
    charge_7 = calculate_polarity(str(seq_record.seq), 7.0)
    unaligned_data["Net Charge at pH 7.0 (Neutral)"].append(charge_7)

    charge_3 = calculate_polarity(str(seq_record.seq), 3.0)
    unaligned_data["Net Charge at pH 3.0 (Acidic)"].append(charge_3)

    charge_11 = calculate_polarity(str(seq_record.seq), 11.0)
    unaligned_data["Net Charge at pH 11.0 (Basic)"].append(charge_11)
    
    isolectric_values = calculate_pI(str(seq_record.seq))
    unaligned_data["Isoelectric Point"].append(isolectric_values)
    
    mw_values = calculate_mw(str(seq_record.seq))
    unaligned_data["Molecular Weight"].append(mw_values)
            
    sequence_length = len(seq_record.seq)
    unaligned_data["Sequence Length"].append(sequence_length)    

In [3]:
unaligned_df = pd.DataFrame(unaligned_data)
unaligned_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,W,Y,V,Hydrophobicity (Kyte-Doolittle Scale),Net Charge at pH 7.0 (Neutral),Net Charge at pH 3.0 (Acidic),Net Charge at pH 11.0 (Basic),Isoelectric Point,Molecular Weight,Sequence Length
0,2ECW_A,GSSGSSGMASSVLEMIKEEVTCPICLELLKEPVSADCNHSFCRACI...,0.047059,0.035294,0.094118,0.023529,0.082353,0.082353,0.0,0.070588,...,0.0,0.023529,0.082353,"[-0.07777777777777782, -0.12222222222222226, -...",-2.113729,9.576874,-15.657628,5.615999,9160.3887,85
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121094,0.072266,0.027344,0.029297,0.0,0.109375,0.019531,0.087891,...,0.015625,0.039062,0.105469,"[-0.9333333333333333, -1.322222222222222, -0.7...",-7.935901,65.628447,-54.181653,5.566606,56727.0831,512
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121807,0.072692,0.027505,0.02947,0.0,0.11002,0.019646,0.088409,...,0.015717,0.039293,0.10609,"[-0.9333333333333333, -1.322222222222222, -0.7...",-7.935901,65.628447,-54.181653,5.566606,56421.7325,509
3,2D8S_A,GSSGSSGTSITPSSQDICRICHCEGDDESPLITPCHCTGSLHFVHQ...,0.0125,0.025,0.0,0.05,0.1125,0.0625,0.05,0.0875,...,0.0125,0.0125,0.0125,"[-0.6555555555555556, -0.11111111111111113, -0...",-3.965791,9.277249,-17.728717,5.34897,8613.578,80
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.084746,0.059322,0.033898,0.076271,0.008475,0.09322,0.016949,0.067797,...,0.025424,0.050847,0.059322,"[-1.5444444444444443, -1.8000000000000003, -1....",-6.073619,15.664591,-20.444346,4.884425,13567.9938,118


Define the aligned dataframe that will have the calculated features

In [4]:
# Read the alignment sequences
alignment = AlignIO.read("FASTA Data/Aligned_Sequences.fasta", "fasta")

# Calculate consensus
consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus()

# Calculate Conservation Score
start = 0
end = alignment.get_alignment_length()
e_freq_table = {char: 0.05 for char in "ACDEFGHIKLMNPQRSTVWY"}
conservation_score = AlignInfo.SummaryInfo(alignment).information_content(start, end, e_freq_table=e_freq_table, chars_to_ignore=["-"])

# Initialize variables to store gap statistics
alignment_length = end
num_sequences = len(alignment)
gap_count_per_position = [0] * alignment_length

# Count the number of gaps at each position
for seq_record in alignment:
    for i, residue in enumerate(str(seq_record.seq)):
        if residue == "-":
            gap_count_per_position[i] += 1
            
# Calculate the percentage of gaps at each position
perc_gap_per_position = [count / num_sequences * 100 for count in gap_count_per_position]

# Calculate total number of gaps
total_gaps = sum(gap_count_per_position)

# Calculate average gap length
all_gaps = []
for seq_record in alignment:
    sequence = str(seq_record.seq)
    gaps = [gap for gap in sequence.split('-') if gap]
    gaps_length = [len(gap) for gap in gaps]
    all_gaps.extend(gaps_length)

average_gap_length = sum(all_gaps) / len(all_gaps) if all_gaps else 0

# Define the aligned dataframe that will have the calculated feature values
aligned_data = {"ID": [], 
                "Aligned Sequence": [],
                "Consensus Sequence": [str(consensus)] * num_sequences, 
                "Conservation Scores": [conservation_score] * num_sequences,
                "Percentage of Gaps Per Position": [perc_gap_per_position] * num_sequences,
                "Total Gaps in Alignment": [total_gaps] * num_sequences,
                "Average Gap Length": [average_gap_length] * num_sequences,
                "Sequence Length": [],
                "Gap Count": [],
                "Percentage Gaps": [],
                "Mutations from Consensus": []}

for seq_record in alignment:
    
    aligned_data["ID"].append(seq_record.id)
    aligned_data["Aligned Sequence"].append(str(seq_record.seq))
    
    sequence = str(seq_record.seq)
    len_sequence = len(sequence)
    gap_count = sequence.count('-')
    perc_gaps = (gap_count / len_sequence) * 100
    mutations_from_consensus = sum(c1 != c2 for c1, c2 in zip(sequence, consensus))

    aligned_data["Sequence Length"].append(len_sequence)
    aligned_data["Gap Count"].append(gap_count)
    aligned_data["Percentage Gaps"].append(perc_gaps)
    aligned_data["Mutations from Consensus"].append(mutations_from_consensus)

In [5]:
aligned_df = pd.DataFrame(aligned_data)
aligned_df.head()

Unnamed: 0,ID,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length,Gap Count,Percentage Gaps,Mutations from Consensus
0,2ECW_A,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5815,98.559322,5897
1,2BHP_A,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5388,91.322034,5896
2,2BHP_B,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5391,91.372881,5896
3,2D8S_A,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5820,98.644068,5896
4,2DXB_A,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5782,98.0,5900


In [6]:
merged_df = pd.merge(unaligned_df, aligned_df, on="ID")
merged_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus
0,2ECW_A,GSSGSSGMASSVLEMIKEEVTCPICLELLKEPVSADCNHSFCRACI...,0.047059,0.035294,0.094118,0.023529,0.082353,0.082353,0.0,0.070588,...,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5815,98.559322,5897
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121094,0.072266,0.027344,0.029297,0.0,0.109375,0.019531,0.087891,...,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5388,91.322034,5896
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121807,0.072692,0.027505,0.02947,0.0,0.11002,0.019646,0.088409,...,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5391,91.372881,5896
3,2D8S_A,GSSGSSGTSITPSSQDICRICHCEGDDESPLITPCHCTGSLHFVHQ...,0.0125,0.025,0.0,0.05,0.1125,0.0625,0.05,0.0875,...,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5820,98.644068,5896
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.084746,0.059322,0.033898,0.076271,0.008475,0.09322,0.016949,0.067797,...,----------------------------------------------...,GXGXXXXSXPLXXVXXVXXXLXSPEXXRXXXXXXXXXXXXXXXXXX...,6291.419023,"[99.97862943439236, 99.96438239065394, 99.9501...",79703822,2.616737,5900,5782,98.0,5900


Define that label dataframe that will have the calculated values

In [8]:
# Define the label dataframe that will be the prediction outputs
label_data = {"ID": [],
              "Experimental": [],
              "Resolution": [],
              "R Value": [],
              "R Free": []}

def extract_data(seq_record):
    try:
        r_value = None
        r_free = None
        seq_id = seq_record.id
        base_pdb_id = seq_id.split("_")[0]
        pdb_file = f"PDB Data/{base_pdb_id}.pdb"
        experimental, res = extract_experimental(pdb_file)
        
        with open(pdb_file, "r") as f:
            for line in f:
                try:
                    if "REMARK   3   R VALUE            (WORKING SET) :" in line:
                        r_value = float(line.split()[-1])
                    elif "REMARK   3   FREE R VALUE                     :" in line:
                        r_free  = float(line.split()[-1])
                except ValueError:
                    pass
                    
        return seq_id, experimental, res, r_value, r_free
    except ValueError:
        return seq_id, None, None, None, None

with (ThreadPoolExecutor() as executor):
    for seq_id, experimental, res, r_value, r_free in executor.map(extract_data, SeqIO.parse("FASTA Data/Sequences.fasta", "fasta")):
        label_data["ID"].append(seq_id)
        label_data["Experimental"].append(experimental)
        label_data["Resolution"].append(res)
        label_data["R Value"].append(r_value)
        label_data["R Free"].append(r_free)

In [9]:
label_df = pd.DataFrame(label_data)
label_df.head()

Unnamed: 0,ID,Experimental,Resolution,R Value,R Free
0,2ECW_A,solution nmr,,,
1,2BHP_A,x-ray diffraction,1.8,0.148,0.176
2,2BHP_B,x-ray diffraction,1.8,0.148,0.176
3,2D8S_A,solution nmr,,,
4,2DXB_A,x-ray diffraction,2.25,0.171,0.198


In [10]:
final_df = pd.merge(merged_df, label_df, on="ID")
final_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
0,2ECW_A,GSSGSSGMASSVLEMIKEEVTCPICLELLKEPVSADCNHSFCRACI...,0.047059,0.035294,0.094118,0.023529,0.082353,0.082353,0.0,0.070588,...,79703822,2.616737,5900,5815,98.559322,5897,solution nmr,,,
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121094,0.072266,0.027344,0.029297,0.0,0.109375,0.019531,0.087891,...,79703822,2.616737,5900,5388,91.322034,5896,x-ray diffraction,1.8,0.148,0.176
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121807,0.072692,0.027505,0.02947,0.0,0.11002,0.019646,0.088409,...,79703822,2.616737,5900,5391,91.372881,5896,x-ray diffraction,1.8,0.148,0.176
3,2D8S_A,GSSGSSGTSITPSSQDICRICHCEGDDESPLITPCHCTGSLHFVHQ...,0.0125,0.025,0.0,0.05,0.1125,0.0625,0.05,0.0875,...,79703822,2.616737,5900,5820,98.644068,5896,solution nmr,,,
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.084746,0.059322,0.033898,0.076271,0.008475,0.09322,0.016949,0.067797,...,79703822,2.616737,5900,5782,98.0,5900,x-ray diffraction,2.25,0.171,0.198


Turn the final dataframe into a CSV file

In [11]:
final_df.to_csv("CSV Data/Dataset.csv", index=False)