# 1 FASTA File

Obtain FASTA files of the protein sequences of the following seven species from the NCBI Genome database.
- Arabidopsis thaliana
- Human
- Mouse
- Zebrafish
- Chlamydomonas (green alga)
- Cyanidioschyzon (red alga)
- Budding yeast

# 2 Protein Length Comparison
Proteins are chains of amino acids whose length and composition differ from protein to protein. Although individual proteins vary in length, the distribution of the lengths of all proteins encoded by a given genome gives an overall picture of that species' proteome. Compare the protein length distributions of the species using the following visualization methods, and consider whether converting the protein lengths to logarithmic values changes the interpretation.
- Density distribution
- Box-and-whisker plot
- Violin plot

In [None]:
import os

def parse_fasta_folder(input_folder):
    """
    Parse every protein FASTA (.faa) file found directly inside a folder.

    input_folder - folder containing .faa files; the file name without the
                   '.faa' extension is used as the species name
    return - dict mapping each FASTA header line (without the leading '>')
             to a (species_name, sequence) tuple

    Files are read in sorted file-name order so that the insertion order of
    the returned dict is reproducible across file systems.
    NOTE: entries are keyed by header text only, so when the same header
    occurs more than once the later record replaces the earlier one.
    Prints the total number of sequences parsed.
    """
    suffix = '.faa'
    parsed_seqs = {}

    # os.listdir() gives entries in arbitrary order; sort for reproducibility
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(suffix):
            continue

        # Assuming species name is the filename without extension
        species_name = filename[:-len(suffix)]
        file_path = os.path.join(input_folder, filename)

        with open(file_path) as f:
            curr_seq_id = None
            curr_seq = []

            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no residues
                    continue

                if line.startswith(">"):
                    # A new header closes the record collected so far
                    if curr_seq_id is not None:
                        parsed_seqs[curr_seq_id] = (species_name, ''.join(curr_seq))
                    curr_seq_id = line[1:]
                    curr_seq = []
                elif curr_seq_id is not None:
                    # Text before the first header belongs to no record
                    curr_seq.append(line)

            # Add the final sequence of this file to the dict
            if curr_seq_id is not None:
                parsed_seqs[curr_seq_id] = (species_name, ''.join(curr_seq))

    print(f"Total sequences parsed: {len(parsed_seqs)}")
    return parsed_seqs


In [None]:
def convert_to_dataframe(parsed_seqs):
    """
    Convert the parsed sequences dictionary to a DataFrame.

    parsed_seqs - dict mapping a protein ID to a (species, sequence) tuple
    return - DataFrame with one row per protein and the columns
             'Protein ID', 'Species', 'Protein Length', where the length is
             the number of characters in the sequence string

    The columns are declared explicitly so that an empty input still yields
    a frame with the expected (integer-length) schema instead of a frame
    without any columns.
    """
    columns = ['Protein ID', 'Species', 'Protein Length']
    rows = [
        (seq_id, species, len(seq))
        for seq_id, (species, seq) in parsed_seqs.items()
    ]
    return pd.DataFrame(rows, columns=columns).astype({'Protein Length': 'int64'})

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Apply the seaborn theme (white grid background, 'Set2' colour palette) to
# every figure drawn after this point. set_theme() is the preferred name;
# sns.set() is only kept by seaborn as a backward-compatible alias.
sns.set_theme(style='whitegrid', palette='Set2')

def plot_protein_length_distribution(df):
    """
    Plot per-species protein length distributions on raw and log10 scales.

    df - DataFrame with 'Species' and 'Protein Length' columns

    Three figures are shown, in this order:
      1. a 2x2 grid: density (KDE) plots on the top row, box-and-whisker
         plots on the bottom row; raw lengths on the left, log10 on the right
      2. a violin plot of the raw lengths
      3. a violin plot of the log10 lengths

    Side effect: a 'Log Protein Length' column is added to (or overwritten
    in) the DataFrame that was passed in.
    """
    # log10(0) is -inf, which breaks density estimation and summary
    # statistics; mask zero-length entries to NaN instead.
    lengths = df['Protein Length']
    df['Log Protein Length'] = np.log10(lengths.where(lengths > 0))

    # (column to plot, wording used in the figure titles)
    variants = [
        ('Protein Length', 'Protein Lengths'),
        ('Log Protein Length', 'Log Protein Lengths'),
    ]

    # Figure 1: density plots (top row) and box plots (bottom row)
    fig, axes = plt.subplots(2, 2, figsize=(15, 10), squeeze=False)
    for col_idx, (value_col, label) in enumerate(variants):
        kde_ax = axes[0, col_idx]
        sns.kdeplot(data=df, x=value_col, hue='Species', common_norm=False, ax=kde_ax)
        kde_ax.set_title(f'Density Distribution of {label}')
        # Calling plt.legend() here would replace seaborn's hue legend with
        # an empty one; move_legend() repositions the existing legend.
        if kde_ax.get_legend() is not None:
            sns.move_legend(
                kde_ax, 'upper left', bbox_to_anchor=(1.05, 1), title='Species'
            )

        box_ax = axes[1, col_idx]
        sns.boxplot(data=df, x='Species', y=value_col, ax=box_ax)
        box_ax.set_title(f'Box-and-Whisker Plot of {label}')
        # Rotate x-axis labels for better readability
        box_ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    plt.show()

    # Figures 2 and 3: one violin plot per scale
    for value_col, label in variants:
        _, violin_ax = plt.subplots(figsize=(12, 6))
        sns.violinplot(data=df, x='Species', y=value_col, ax=violin_ax)
        violin_ax.set_title(f'Violin Plot of {label}')
        violin_ax.tick_params(axis='x', rotation=45)
        plt.show()


# Parse every .faa file in the data folder (path is relative to the current
# working directory) and build the per-protein table from the parsed records
parsed_seqs = parse_fasta_folder('assignment01_data')
df = convert_to_dataframe(parsed_seqs)

# Draw the density, box-and-whisker and violin plots for the table
plot_protein_length_distribution(df)


# 3 Relationship between Amino Acid Content and Length
Proteins are built from 20 different amino acids, and the proportion of each amino acid differs from protein to protein. Investigate whether the amino acid content of a protein is related to its length.
- Scatter Plot
- Summarize trends (linear regression, lowess)
- Visualization of amino acid composition profiles (heat maps)
- Visualization of amino acid composition profiles (hierarchical clustering)
- Visualization of amino acid composition profiles (Principal Component Analysis)

# 4 Amino acid composition comparison
Compare the seven species in terms of amino acid composition of proteins.
- Amino acid composition at representative gene lengths
- Comparison of amino acid composition of species (heat map)
- Comparison of amino acid composition of species (hierarchical clustering)
- Comparison of amino acid compositions of species (Principal Component Analysis)