## Importing Python Modules

In [255]:
from Bio import SeqIO
import os
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Seq import Seq
import pandas as pd
import matplotlib.pyplot as plt

## Defining Paths and Variables

In [256]:
km_path = "/Users/kaylahmarcello/Desktop/projects/amino_acids/data/" 
aliphatic_index = ["A", "V", "I", "L"]
acidic = ["D", "E"]

## Calculations

In [257]:
import csv

with open(os.path.join(km_path,"PsbA.fa")) as handle: 
        f = open('PsbA.csv', 'w')
        writer = csv.writer(f)
        header = ['name', 'proline count', 'arginine count', 'lysine count', 'r/k ratio', 'aliphatic percent sum', 'aromaticity', 'flexibility sum', 'flexibility_avg', 'gravy']
        writer.writerow(header)
        for record in SeqIO.parse(handle, "fasta"):
            print(record.id)                       # printing the gene id
            #print(vars(record))
            #print(record._seq)
            x = ProteinAnalysis(str(record._seq))  # storing the sequence in a variable
            g_count = x.count_amino_acids()["G"]   # calculating number of G residues
            print("proline count", g_count)
            r_count = x.count_amino_acids()["R"]   # calculating the number of R AA residues
            print("arginine count", r_count)   
            k_count = x.count_amino_acids()["K"]   # calculating the number of K AA residues 
            print("lysine count", k_count)
            if k_count >0:
                r_k_ratio = r_count / k_count      # ratio of R to K, avoiding zero divider 
                print("r/k ratio", r_k_ratio)
            else:
                r_k_ratio = "NA"
            aliphatic_percent_sum = 0              # calculating the percentage of total protein that are aliphatic residues
            for aa in aliphatic_index: 
                    aliphatic_count = x.get_amino_acids_percent()[aa]
                    aliphatic_percent_sum = aliphatic_percent_sum + aliphatic_count
                    #print("aliphatic count", aliphatic_count)
                    #print("aliphatic percent sum", aliphatic_percent_sum)
            print("final aliphatic percent sum", aliphatic_percent_sum)
            acidic_percentage_sum = 0             # calculating percentage of toal protein that are acidic residues
            for aa in acidic: 
                    acidic_percentage = x.get_amino_acids_percent()[aa]
                    acidic_percentage_sum = acidic_percentage_sum + acidic_percentage
                    #print("acidic percentage", acidic_percentage)
                    #print("acidic percentage sum", acidic_percentage_sum)
            print("final acidic percentage sum", acidic_percentage_sum)
            aromaticity = x.aromaticity()         # calculating aromaticity
            print("aromaticity", aromaticity)
            
            flexibility = x.flexibility()         # calculating flexibility 
            #print("flexibility", flexibility)
            flexibility_sum = sum(flexibility)
            print("flexibility sum", flexibility_sum)
            flexibility_avg = (sum(flexibility)/len(flexibility))
            print("flexibility avg", flexibility_avg)
            gravy = x.gravy()                     # calculating gravy
            print("gravy", gravy)
            data = [record.id, g_count, r_count, k_count, r_k_ratio, aliphatic_percent_sum, aromaticity, flexibility_sum, flexibility_avg, gravy]
            writer.writerow(data)
        f.close() 
            

PsbA-Acaryochloris_marina_MBIC11017.fa_2Z87P.faa.final_tree.fa___photo_hmms___fce1b145354508c00ca66b5413cbb64f6017ab19d1701ae6813f341c
proline count 31
arginine count 12
lysine count 3
r/k ratio 4.0
final aliphatic percent sum 0.3415977961432507
final acidic percentage sum 0.05234159779614325
aromaticity 0.11570247933884298
flexibility sum 348.63602380952386
flexibility avg 0.9848475248856606
gravy 0.4418732782369151
PsbA-Acaryochloris_marina_MBIC11017.fa_2Z87P.faa.final_tree.fa___photo_hmms___b006de2b636427f602bf6764c7fc74d6d31856eccace1f9c61ad1545
proline count 34
arginine count 15
lysine count 3
r/k ratio 5.0
final aliphatic percent sum 0.3190883190883191
final acidic percentage sum 0.07692307692307693
aromaticity 0.16524216524216523
flexibility sum 336.5166428571431
flexibility avg 0.9839667919799506
gravy 0.3584045584045584
PsbA-Acaryochloris_marina_MBIC11017.fa_2Z87P.faa.final_tree.fa___photo_hmms___c106e9b54c199e75274ba5126eb62ce00f3a04d1db2674417fd570de
proline count 35
arginin

KeyError: 'X'

## Visualizing df

In [None]:
df = pd.read_csv('PsbA.csv')

print(df.head(10))

In [None]:
print(df.info())

In [None]:
df.corr()  # This was for fun bc I'm learning Pandas

Next would be to plot the data with "import matplotlib.pyplot as plt"