Import the needed libraries

In [3]:
import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import AlignInfo
from concurrent.futures import ThreadPoolExecutor
from DataPreparation.DataOperations.LabelExtractionOperations import extract_experimental
from DataPreparation.DataOperations.FeatureExtractionOperations import calculate_a_acid_composition, calculate_hydrophobicity, calculate_polarity, calculate_mw, calculate_pI

Define the unaligned dataframe that will have the calculated feature values

In [4]:
unaligned_data = {"ID": [], 
                  "Unaligned Sequence": [], 
                  'A': [], 'R': [], 'N': [], 'D': [],
                  'C': [], 'E': [], 'Q': [], 'G': [],
                  'H': [], 'I': [], 'L': [], 'K': [],
                  'M': [], 'F': [], 'P': [], 'S': [],
                  'T': [], 'W': [], 'Y': [], 'V': [],
                  "Hydrophobicity (Kyte-Doolittle Scale)": [],
                  "Net Charge at pH 7.0 (Neutral)": [],
                  "Net Charge at pH 3.0 (Acidic)": [],
                  "Net Charge at pH 11.0 (Basic)": [],
                  "Isoelectric Point": [],
                  "Molecular Weight": [],
                  "Sequence Length": []} 

for seq_record in SeqIO.parse("FASTA Data/Sequences.fasta", "fasta"):
    
    unaligned_data["ID"].append(seq_record.id)
    unaligned_data["Unaligned Sequence"].append(str(seq_record.seq))
    
    aa_composition = calculate_a_acid_composition(str(seq_record.seq))
    for amino_acid, percent in aa_composition.items():
        unaligned_data[amino_acid].append(percent)
    
    hydrophobicity_values = calculate_hydrophobicity(str(seq_record.seq))
    unaligned_data["Hydrophobicity (Kyte-Doolittle Scale)"].append(hydrophobicity_values)
    
    charge_7 = calculate_polarity(str(seq_record.seq), 7.0)
    unaligned_data["Net Charge at pH 7.0 (Neutral)"].append(charge_7)

    charge_3 = calculate_polarity(str(seq_record.seq), 3.0)
    unaligned_data["Net Charge at pH 3.0 (Acidic)"].append(charge_3)

    charge_11 = calculate_polarity(str(seq_record.seq), 11.0)
    unaligned_data["Net Charge at pH 11.0 (Basic)"].append(charge_11)
    
    isolectric_values = calculate_pI(str(seq_record.seq))
    unaligned_data["Isoelectric Point"].append(isolectric_values)
    
    mw_values = calculate_mw(str(seq_record.seq))
    unaligned_data["Molecular Weight"].append(mw_values)
            
    sequence_length = len(seq_record.seq)
    unaligned_data["Sequence Length"].append(sequence_length)    

In [5]:
unaligned_df = pd.DataFrame(unaligned_data)
unaligned_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,W,Y,V,Hydrophobicity (Kyte-Doolittle Scale),Net Charge at pH 7.0 (Neutral),Net Charge at pH 3.0 (Acidic),Net Charge at pH 11.0 (Basic),Isoelectric Point,Molecular Weight,Sequence Length
0,1GNH_A,QTDMSRKAFVFPKESDTSYVSLKAPLTKPLKAFTVCLHFYTELSST...,0.043689,0.029126,0.033981,0.043689,0.009709,0.067961,0.033981,0.082524,...,0.029126,0.038835,0.087379,"[-1.1555555555555557, -0.30000000000000004, 0....",-4.056958,20.561798,-26.616221,5.276898,23046.8368,206
1,1A3B_L,ADCGLRPLFEKKSLEDKTERELLESYI,0.037037,0.074074,0.0,0.074074,0.037037,0.185185,0.0,0.037037,...,0.0,0.037037,0.0,"[0.522222222222222, -0.06666666666666672, -0.1...",-2.201749,5.445111,-7.80789,4.768132,3183.5868,27
2,1A3B_H,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...,0.043825,0.071713,0.039841,0.063745,0.027888,0.055777,0.031873,0.079681,...,0.031873,0.039841,0.063745,"[0.36666666666666664, -0.1777777777777777, -0....",7.154793,40.985831,-28.930323,8.987683,28923.0124,251
3,1A3B_I,GDFEEIPEEYL,0.0,0.0,0.0,0.090909,0.0,0.363636,0.0,0.090909,...,0.0,0.090909,0.0,"[-1.3555555555555556, -1.4555555555555557, -0....",-5.228535,0.561204,-6.908774,4.050028,1340.3874,11
4,2A5E_A,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,0.185897,0.108974,0.025641,0.070513,0.00641,0.070513,0.00641,0.089744,...,0.012821,0.012821,0.064103,"[0.03333333333333329, -0.5666666666666668, -0....",-5.029419,21.690186,-10.353583,5.520055,16532.4537,156


Define the aligned dataframe that will have the calculated features

In [6]:
# Read the alignment sequences
alignment = AlignIO.read("FASTA Data/Aligned_Sequences.fasta", "fasta")

# Calculate consensus
consensus = AlignInfo.SummaryInfo(alignment).dumb_consensus()

# Calculate Conservation Score
start = 0
end = alignment.get_alignment_length()
e_freq_table = {char: 0.05 for char in "ACDEFGHIKLMNPQRSTVWY"}
conservation_score = AlignInfo.SummaryInfo(alignment).information_content(start, end, e_freq_table=e_freq_table, chars_to_ignore=["-"])

# Initialize variables to store gap statistics
alignment_length = end
num_sequences = len(alignment)
gap_count_per_position = [0] * alignment_length

# Count the number of gaps at each position
for seq_record in alignment:
    for i, residue in enumerate(str(seq_record.seq)):
        if residue == "-":
            gap_count_per_position[i] += 1
            
# Calculate the percentage of gaps at each position
perc_gap_per_position = [count / num_sequences * 100 for count in gap_count_per_position]

# Calculate total number of gaps
total_gaps = sum(gap_count_per_position)

# Calculate average gap length
all_gaps = []
for seq_record in alignment:
    sequence = str(seq_record.seq)
    gaps = [gap for gap in sequence.split('-') if gap]
    gaps_length = [len(gap) for gap in gaps]
    all_gaps.extend(gaps_length)

average_gap_length = sum(all_gaps) / len(all_gaps) if all_gaps else 0

# Define the aligned dataframe that will have the calculated feature values
aligned_data = {"ID": [], 
                "Aligned Sequence": [],
                "Consensus Sequence": [str(consensus)] * num_sequences, 
                "Conservation Scores": [conservation_score] * num_sequences,
                "Percentage of Gaps Per Position": [perc_gap_per_position] * num_sequences,
                "Total Gaps in Alignment": [total_gaps] * num_sequences,
                "Average Gap Length": [average_gap_length] * num_sequences,
                "Sequence Length": [],
                "Gap Count": [],
                "Percentage Gaps": [],
                "Mutations from Consensus": []}

for seq_record in alignment:
    
    aligned_data["ID"].append(seq_record.id)
    aligned_data["Aligned Sequence"].append(str(seq_record.seq))
    
    sequence = str(seq_record.seq)
    len_sequence = len(sequence)
    gap_count = sequence.count('-')
    perc_gaps = (gap_count / len_sequence) * 100
    mutations_from_consensus = sum(c1 != c2 for c1, c2 in zip(sequence, consensus))

    aligned_data["Sequence Length"].append(len_sequence)
    aligned_data["Gap Count"].append(gap_count)
    aligned_data["Percentage Gaps"].append(perc_gaps)
    aligned_data["Mutations from Consensus"].append(mutations_from_consensus)

In [7]:
aligned_df = pd.DataFrame(aligned_data)
aligned_df.head()

Unnamed: 0,ID,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length,Gap Count,Percentage Gaps,Mutations from Consensus
0,1GNH_A,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4291,95.419168,4497
1,1A3B_L,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4470,99.3996,4497
2,1A3B_H,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4246,94.418501,4497
3,1A3B_I,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4486,99.755392,4497
4,2A5E_A,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4341,96.531021,4497


In [8]:
merged_df = pd.merge(unaligned_df, aligned_df, on="ID")
merged_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Aligned Sequence,Consensus Sequence,Conservation Scores,Percentage of Gaps Per Position,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus
0,1GNH_A,QTDMSRKAFVFPKESDTSYVSLKAPLTKPLKAFTVCLHFYTELSST...,0.043689,0.029126,0.033981,0.043689,0.009709,0.067961,0.033981,0.082524,...,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4291,95.419168,4497
1,1A3B_L,ADCGLRPLFEKKSLEDKTERELLESYI,0.037037,0.074074,0.0,0.074074,0.037037,0.185185,0.0,0.037037,...,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4470,99.3996,4497
2,1A3B_H,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...,0.043825,0.071713,0.039841,0.063745,0.027888,0.055777,0.031873,0.079681,...,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4246,94.418501,4497
3,1A3B_I,GDFEEIPEEYL,0.0,0.0,0.0,0.090909,0.0,0.363636,0.0,0.090909,...,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4486,99.755392,4497
4,2A5E_A,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,0.185897,0.108974,0.025641,0.070513,0.00641,0.070513,0.00641,0.089744,...,----------------------------------------------...,PFVNKXVXXQXPXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,4936.92673,"[99.98710176705792, 99.98710176705792, 99.9871...",33210307,3.164685,4497,4341,96.531021,4497


Define that label dataframe that will have the calculated values

In [9]:
# Define the label dataframe that will be the prediction outputs
label_data = {"ID": [],
              "Experimental": [],
              "Resolution": [],
              "R Value": [],
              "R Free": []}

def extract_data(seq_record):
    try:
        r_value = None
        r_free = None
        seq_id = seq_record.id
        base_pdb_id = seq_id.split("_")[0]
        pdb_file = f"PDBData/{base_pdb_id}.pdb"
        experimental, res = extract_experimental(pdb_file)
        
        with open(pdb_file, "r") as f:
            for line in f:
                try:
                    if "REMARK   3   R VALUE            (WORKING SET) :" in line:
                        r_value = float(line.split()[-1])
                    elif "REMARK   3   FREE R VALUE                     :" in line:
                        r_free  = float(line.split()[-1])
                except ValueError:
                    pass
                    
        return seq_id, experimental, res, r_value, r_free
    except ValueError:
        return seq_id, None, None, None, None

with (ThreadPoolExecutor() as executor):
    for seq_id, experimental, res, r_value, r_free in executor.map(extract_data, SeqIO.parse("FASTA Data/Sequences.fasta", "fasta")):
        label_data["ID"].append(seq_id)
        label_data["Experimental"].append(experimental)
        label_data["Resolution"].append(res)
        label_data["R Value"].append(r_value)
        label_data["R Free"].append(r_free)

In [10]:
label_df = pd.DataFrame(label_data)
label_df.head()

Unnamed: 0,ID,Experimental,Resolution,R Value,R Free
0,1GNH_A,x-ray diffraction,3.0,0.239,0.273
1,1A3B_L,x-ray diffraction,1.8,0.17,0.23
2,1A3B_H,x-ray diffraction,1.8,0.17,0.23
3,1A3B_I,x-ray diffraction,1.8,0.17,0.23
4,2A5E_A,solution nmr,,,


In [12]:
final_df = pd.merge(merged_df, label_df, on="ID")
final_df.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
0,1GNH_A,QTDMSRKAFVFPKESDTSYVSLKAPLTKPLKAFTVCLHFYTELSST...,0.043689,0.029126,0.033981,0.043689,0.009709,0.067961,0.033981,0.082524,...,33210307,3.164685,4497,4291,95.419168,4497,x-ray diffraction,3.0,0.239,0.273
1,1A3B_L,ADCGLRPLFEKKSLEDKTERELLESYI,0.037037,0.074074,0.0,0.074074,0.037037,0.185185,0.0,0.037037,...,33210307,3.164685,4497,4470,99.3996,4497,x-ray diffraction,1.8,0.17,0.23
2,1A3B_H,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...,0.043825,0.071713,0.039841,0.063745,0.027888,0.055777,0.031873,0.079681,...,33210307,3.164685,4497,4246,94.418501,4497,x-ray diffraction,1.8,0.17,0.23
3,1A3B_I,GDFEEIPEEYL,0.0,0.0,0.0,0.090909,0.0,0.363636,0.0,0.090909,...,33210307,3.164685,4497,4486,99.755392,4497,x-ray diffraction,1.8,0.17,0.23
4,2A5E_A,MEPAAGSSMEPSADWLATAAARGRVEEVRALLEAGALPNAPNSYGR...,0.185897,0.108974,0.025641,0.070513,0.00641,0.070513,0.00641,0.089744,...,33210307,3.164685,4497,4341,96.531021,4497,solution nmr,,,


Turn the final dataframe into a CSV file

In [13]:
final_df.to_csv("CSV Data/Dataset.csv", index=False)