In [3]:
# Import the needed libraries
import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import AlignInfo
from LabelExtraction import extract_experimental
from FeatureExtractionOperations import calculate_a_acid_composition, calculate_hydrophobicity, calculate_polarity, calculate_mw, calculate_pI

In [4]:
# Define the unaligned dataframe that will have the calculated feature values
unaligned_data = {"ID": [], 
                  "Unaligned Sequence": [], 
                  'A': [], 'R': [], 'N': [], 'D': [],
                  'C': [], 'E': [], 'Q': [], 'G': [],
                  'H': [], 'I': [], 'L': [], 'K': [],
                  'M': [], 'F': [], 'P': [], 'S': [],
                  'T': [], 'W': [], 'Y': [], 'V': [],
                  "Hydrophobicity (Kyte-Doolittle Scale)": [],
                  "Net Charge at pH 7.0 (Neutral)": [],
                  "Net Charge at pH 3.0 (Acidic)": [],
                  "Net Charge at pH 11.0 (Basic)": [],
                  "Isoelectric Point": [],
                  "Molecular Weight": [],
                  "Sequence Length": []} 

for seq_record in SeqIO.parse("sequences.fasta", "fasta"):
    
    unaligned_data["ID"].append(seq_record.id)
    unaligned_data["Unaligned Sequence"].append(str(seq_record.seq))
    
    aa_composition = calculate_a_acid_composition(str(seq_record.seq))
    for amino_acid, percent in aa_composition.items():
        unaligned_data[amino_acid].append(percent)
    
    hydrophobicity_values = calculate_hydrophobicity(str(seq_record.seq))
    unaligned_data["Hydrophobicity (Kyte-Doolittle Scale)"].append(hydrophobicity_values)
    
    charge_7 = calculate_polarity(str(seq_record.seq), 7.0)
    unaligned_data["Net Charge at pH 7.0 (Neutral)"].append(charge_7)

    charge_3 = calculate_polarity(str(seq_record.seq), 3.0)
    unaligned_data["Net Charge at pH 3.0 (Acidic)"].append(charge_3)

    charge_11 = calculate_polarity(str(seq_record.seq), 11.0)
    unaligned_data["Net Charge at pH 11.0 (Basic)"].append(charge_11)
    
    isolectric_values = calculate_pI(str(seq_record.seq))
    unaligned_data["Isoelectric Point"].append(isolectric_values)
    
    mw_values = calculate_mw(str(seq_record.seq))
    unaligned_data["Molecular Weight"].append(mw_values)
            
    sequence_length = len(seq_record.seq)
    unaligned_data["Sequence Length"].append(sequence_length)    

In [5]:
unaligned_df = pd.DataFrame(unaligned_data)
unaligned_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,W,Y,V,Hydrophobicity (Kyte-Doolittle Scale),Net Charge at pH 7.0 (Neutral),Net Charge at pH 3.0 (Acidic),Net Charge at pH 11.0 (Basic),Isoelectric Point,Molecular Weight,Sequence Length
0,7U7S_A,GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPAVQYKSWKGG...,0.079903,0.062954,0.031477,0.043584,0.01937,0.062954,0.072639,0.070218,...,0.016949,0.01937,0.067797,"[-1.0666666666666667, -1.5222222222222221, -0....",7.438306,57.407841,-34.284027,8.905872,46176.5961,413
1,7XNC_A,LEELELDEQQRKRLEAFLTQKQKVGELKDDDFEKISELGAGNGGVV...,0.051447,0.045016,0.028939,0.054662,0.016077,0.083601,0.041801,0.080386,...,0.006431,0.025723,0.07074,"[-1.0666666666666667, -1.8777777777777775, -1....",-5.618168,42.490722,-41.404648,5.744967,34850.0523,311
2,8EXE_A,PNVLNWEQVQRLDGILSETIPIHGRGNFPTLELQPSLIVKVVRRRL...,0.031153,0.071651,0.046729,0.065421,0.024922,0.071651,0.046729,0.062305,...,0.006231,0.028037,0.074766,"[-0.47777777777777786, -0.6888888888888889, -0...",-5.359315,44.265171,-38.827512,5.822154,36856.8494,321
3,8EXE_B,VNIEFEAYSLSDNDYDGIKKLLQQLFLKAPVNTAELTEVFGFISLL...,0.04908,0.02454,0.042945,0.03681,0.02454,0.07362,0.07362,0.055215,...,0.0,0.030675,0.06135,"[0.07777777777777779, 0.03333333333333331, 0.3...",-2.282188,15.87791,-22.778298,5.2524,18339.1265,163
4,7OBT_A,GAMGSMERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSEERN...,0.103896,0.051948,0.025974,0.060606,0.004329,0.125541,0.030303,0.060606,...,0.008658,0.04329,0.051948,"[-0.24444444444444446, -0.2888888888888889, -0...",-14.917177,29.63761,-41.717017,4.744999,25920.9203,231


In [6]:
# Read the alignment sequences
alignment = AlignIO.read("aligned_sequences.fasta", "fasta")

# Calculate consensus
consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus()

# Calculate Conservation Score
start = 0
end = alignment.get_alignment_length()
e_freq_table = {char: 0.05 for char in "ACDEFGHIKLMNPQRSTVWY"}
conservation_score = AlignInfo.SummaryInfo(alignment).information_content(start, end, e_freq_table=e_freq_table, chars_to_ignore=["-"])

# Initialize variables to store gap statistics
alignment_length = end
num_sequences = len(alignment)
gap_count_per_position = [0] * alignment_length

# Count the number of gaps at each position
for seq_record in alignment:
    for i, residue in enumerate(str(seq_record.seq)):
        if residue == "-":
            gap_count_per_position[i] += 1
            
# Calculate the percentage of gaps at each position
perc_gap_per_position = [count / num_sequences * 100 for count in gap_count_per_position]

# Calculate total number of gaps
total_gaps = sum(gap_count_per_position)

# Calculate average gap length
all_gaps = []
for seq_record in alignment:
    sequence = str(seq_record.seq)
    gaps = [gap for gap in sequence.split('-') if gap]
    gaps_length = [len(gap) for gap in gaps]
    all_gaps.extend(gaps_length)

average_gap_length = sum(all_gaps) / len(all_gaps) if all_gaps else 0

# Define the aligned dataframe that will have the calculated feature values
aligned_data = {"ID": [], 
                "Aligned Sequence": [],
                "Consensus Sequence": [str(consensus)] * num_sequences, 
                "Conservation Scores": [conservation_score] * num_sequences,
                "Percentage of Gaps Per Position": [perc_gap_per_position] * num_sequences,
                "Total Gaps in Alignment": [total_gaps] * num_sequences,
                "Average Gap Length": [average_gap_length] * num_sequences,
                "Sequence Length": [],
                "Gap Count": [],
                "Percentage Gaps": [],
                "Mutations from Consensus": []}

for seq_record in alignment:
    
    aligned_data["ID"].append(seq_record.id)
    aligned_data["Aligned Sequence"].append(str(seq_record.seq))
    
    sequence = str(seq_record.seq)
    len_sequence = len(sequence)
    gap_count = sequence.count('-')
    perc_gaps = (gap_count / len_sequence) * 100
    mutations_from_consensus = sum(c1 != c2 for c1, c2 in zip(sequence, consensus))

    aligned_data["Sequence Length"].append(len_sequence)
    aligned_data["Gap Count"].append(gap_count)
    aligned_data["Percentage Gaps"].append(perc_gaps)
    aligned_data["Mutations from Consensus"].append(mutations_from_consensus)

In [7]:
aligned_df = pd.DataFrame(aligned_data)
aligned_df.head()

Unnamed: 0,ID,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length,Gap Count,Percentage Gaps,Mutations from Consensus
0,7U7S_A,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1809,81.413141,2156
1,7XNC_A,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1911,86.0036,2207
2,8EXE_A,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1901,85.553555,2218
3,8EXE_B,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,2059,92.664266,2221
4,7OBT_A,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1991,89.60396,2222


In [8]:
merged_df = pd.merge(unaligned_df, aligned_df, on="ID")
merged_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus
0,7U7S_A,GPHMATGQDRVVALVDMDCFFVQVEQRQNPHLRNKPAVQYKSWKGG...,0.079903,0.062954,0.031477,0.043584,0.01937,0.062954,0.072639,0.070218,...,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1809,81.413141,2156
1,7XNC_A,LEELELDEQQRKRLEAFLTQKQKVGELKDDDFEKISELGAGNGGVV...,0.051447,0.045016,0.028939,0.054662,0.016077,0.083601,0.041801,0.080386,...,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1911,86.0036,2207
2,8EXE_A,PNVLNWEQVQRLDGILSETIPIHGRGNFPTLELQPSLIVKVVRRRL...,0.031153,0.071651,0.046729,0.065421,0.024922,0.071651,0.046729,0.062305,...,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1901,85.553555,2218
3,8EXE_B,VNIEFEAYSLSDNDYDGIKKLLQQLFLKAPVNTAELTEVFGFISLL...,0.04908,0.02454,0.042945,0.03681,0.02454,0.07362,0.07362,0.055215,...,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,2059,92.664266,2221
4,7OBT_A,GAMGSMERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSEERN...,0.103896,0.051948,0.025974,0.060606,0.004329,0.125541,0.030303,0.060606,...,----------------------------------------------...,PPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIKH...,5143.607774,"[99.43693693693693, 99.43693693693693, 99.4369...",1716730,8.193717,2222,1991,89.60396,2222


In [9]:
# Define the label dataframe that will be the prediction outputs
label_data = {"ID": [],
              "Experimental": []}

for seq_record in SeqIO.parse("sequences.fasta", "fasta"):
    seq_id = seq_record.id
    label_data["ID"].append(seq_id)
    
    base_pdb_id = seq_id.split("_")[0]
      
    experimental_value = extract_experimental(f"PDBData/{base_pdb_id}.pdb")
    label_data["Experimental"].append(experimental_value)



In [11]:
label_df = pd.DataFrame(label_data)
label_df

Unnamed: 0,ID,Experimental
0,7U7S_A,x-ray diffraction
1,7XNC_A,x-ray diffraction
2,8EXE_A,x-ray diffraction
3,8EXE_B,x-ray diffraction
4,7OBT_A,x-ray diffraction
...,...,...
883,8EDH_C,x-ray diffraction
884,7SZB_A,x-ray diffraction
885,7SZB_B,x-ray diffraction
886,7SZB_C,x-ray diffraction
