In [None]:
# Import the needed libraries
import pandas as pd
from Bio import SeqIO
from FeatureExtractionOperations import calculate_a_acid_composition, calculate_hydrophobicity, calculate_polarity, calculate_mw, calculate_pI

In [None]:
# Build the column-oriented feature table for the unaligned sequences.
# Every key of `unaligned_data` is a column name and every list receives
# exactly one value per FASTA record, so that all columns stay the same
# length when the dict is later turned into a DataFrame.

# One composition column per standard amino acid, in output column order.
AMINO_ACIDS = ('A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')

# (column name, pH) pairs passed to calculate_polarity, in output column order.
NET_CHARGE_COLUMNS = (
    ("Net Charge at pH 7.0 (Neutral)", 7.0),
    ("Net Charge at pH 3.0 (Acidic)", 3.0),
    ("Net Charge at pH 11.0 (Basic)", 11.0),
)

unaligned_data = {
    "ID": [],
    "Unaligned Sequence": [],
    **{amino_acid: [] for amino_acid in AMINO_ACIDS},
    "Hydrophobicity (Kyte-Doolittle Scale)": [],
    **{column: [] for column, _ in NET_CHARGE_COLUMNS},
    "Isoelectric Point": [],
    "Molecular Weight": [],
    "Sequence Length": [],
}

for seq_record in SeqIO.parse("sequences.fasta", "fasta"):
    # Convert once; every feature helper below is given the plain string.
    sequence = str(seq_record.seq)

    unaligned_data["ID"].append(seq_record.id)
    # The sequence object itself (not the string) is stored in the table.
    unaligned_data["Unaligned Sequence"].append(seq_record.seq)

    # Fill the composition columns from the fixed column list rather than
    # from the keys the helper returns: an unexpected key (e.g. a
    # non-standard residue) can then no longer raise KeyError, and a missing
    # key can no longer leave the columns with unequal lengths.
    # NOTE(review): a residue absent from the helper's result is recorded
    # as 0.0 -- confirm this matches calculate_a_acid_composition's units.
    aa_composition = calculate_a_acid_composition(sequence)
    for amino_acid in AMINO_ACIDS:
        unaligned_data[amino_acid].append(aa_composition.get(amino_acid, 0.0))

    unaligned_data["Hydrophobicity (Kyte-Doolittle Scale)"].append(
        calculate_hydrophobicity(sequence)
    )

    for column, ph in NET_CHARGE_COLUMNS:
        unaligned_data[column].append(calculate_polarity(sequence, ph))

    unaligned_data["Isoelectric Point"].append(calculate_pI(sequence))
    unaligned_data["Molecular Weight"].append(calculate_mw(sequence))
    unaligned_data["Sequence Length"].append(len(sequence))
    
# Column-oriented table for the aligned sequences. Only the record ID and
# the aligned sequence are read from the alignment file; the remaining
# columns receive a None placeholder for every record.
aligned_columns = (
    "ID",
    "Aligned Sequence",
    "Consensus Sequence",
    "Profile-Based Features",
    "Secondary Structure Predictions",
    "Gap Statistics",
)
aligned_data = {aligned_column: [] for aligned_column in aligned_columns}

for seq_record in SeqIO.parse("aligned_sequences.fasta", "fasta"):
    # One value per column for this record, then appended column by column.
    aligned_row = {
        "ID": seq_record.id,
        "Aligned Sequence": seq_record.seq,
        "Consensus Sequence": None,
        "Profile-Based Features": None,
        "Secondary Structure Predictions": None,
        "Gap Statistics": None,
    }
    for aligned_column, aligned_value in aligned_row.items():
        aligned_data[aligned_column].append(aligned_value)


In [None]:
# Turn the per-column lists into a DataFrame and preview its first five rows.
unaligned_df = pd.DataFrame(data=unaligned_data)
unaligned_df.head(n=5)

In [None]:
# Turn the per-column lists into a DataFrame and preview its first five rows.
aligned_df = pd.DataFrame(data=aligned_data)
aligned_df.head(n=5)