In [1]:
# Requires Biopython (imported as `Bio`); if it is missing, run: %pip install biopython

In [7]:
import pandas as pd
from os import chdir
from Bio import SeqIO
from Bio import SeqUtils
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import ProtParam, lcc
import warnings
from Bio import BiopythonWarning

In [8]:

# Suppress warnings of category BiopythonWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=BiopythonWarning)

In [9]:
# Folder to work from. chdir() changes the process-wide working directory, so
# every relative path used after this point is resolved against this folder.
directory = r'../inputs'

chdir(directory)



In [5]:
# Default FASTA file, resolved relative to the current working directory.
genetic_encoding_file = r'all_genes.fasta'


def import_all_genes(fasta_path=None):
    """Read every record of a FASTA file into a list.

    Parameters
    ----------
    fasta_path : str or path-like, optional
        File to read. When omitted, the module-level ``genetic_encoding_file``
        is used; it is looked up at call time, so reassigning that global
        before calling still takes effect.

    Returns
    -------
    list
        The records yielded by ``SeqIO.parse(handle, 'fasta')``, in file order.
    """
    if fasta_path is None:
        fasta_path = genetic_encoding_file

    # list() drains the parser inside the with-block, so nothing tries to read
    # from the handle after it has been closed.
    with open(fasta_path, 'r') as data:
        return list(SeqIO.parse(data, 'fasta'))

In [6]:
def _gc_percent(seq):
    """Return the G+C content of ``seq`` on a 0-100 percentage scale."""
    legacy_gc = getattr(SeqUtils, 'GC', None)
    if legacy_gc is not None:
        return legacy_gc(seq)
    # NOTE(review): newer Biopython releases are understood to ship
    # gc_fraction (a 0-1 fraction) in place of GC; ambiguous='ignore' is meant
    # to keep the full sequence length as the denominator, like the old
    # percentage did - confirm against the installed Biopython version.
    return 100.0 * SeqUtils.gc_fraction(seq, ambiguous='ignore')


def _adjacent_repeat_fraction(seq_str):
    """Return the fraction of adjacent position pairs holding the same character.

    A string shorter than two characters has no adjacent pairs; 0.0 is
    returned for it rather than dividing by zero.
    """
    pair_count = len(seq_str) - 1
    if pair_count < 1:
        return 0.0
    repeats = sum(left == right for left, right in zip(seq_str, seq_str[1:]))
    return repeats / pair_count


def generate_gene_features(gene_data_list):
    """Build a table of sequence-derived features, one row per gene record.

    Parameters
    ----------
    gene_data_list : iterable
        Records exposing ``.id`` and ``.seq``; ``.seq`` must support
        ``translate()`` and ``str()``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct record id (a repeated id overwrites the earlier
        row), with the id in the ``ORF`` column followed by the feature
        columns in the order they are listed below.
    """
    all_genes_dict = dict()

    for gene in gene_data_list:
        dna = gene.seq
        dna_str = str(dna)
        # NOTE(review): translate() is called without arguments, so whatever
        # stop-codon symbols it emits stay in the protein string handed to
        # ProteinAnalysis - confirm this is intended.
        protein_str = str(dna.translate())

        # Each of these is computed once and then indexed per column.
        gc_by_position = SeqUtils.GC123(dna)
        protein_stats = ProtParam.ProteinAnalysis(protein_str)
        structure_fractions = protein_stats.secondary_structure_fraction()
        extinction = protein_stats.molar_extinction_coefficient()

        all_genes_dict[gene.id] = {
            'gene_sequence': dna_str,
            'protein_sequence': protein_str,
            'gc_content_total': _gc_percent(dna),
            # GC123 element 0 is not used here; 1-3 are stored per codon position.
            'gc_content_1': gc_by_position[1],
            'gc_content_2': gc_by_position[2],
            'gc_content_3': gc_by_position[3],
            'molecular_weight': SeqUtils.molecular_weight(dna),
            # Melting point, using nearest-neighbour thermodynamics.
            'MeltingPoint_NN': mt.Tm_NN(dna),
            'aromaticity': protein_stats.aromaticity(),
            'isoelectric_point': protein_stats.isoelectric_point(),
            'helix_perc': structure_fractions[0],
            'turn_perc': structure_fractions[1],
            'sheet_perc': structure_fractions[2],
            'molar_extinction_coeffiecent_reduced': extinction[0],
            'molar_extinction_coeffiecent_oxidized': extinction[1],
            'local_complexity_coefficient_simple': lcc.lcc_simp(dna),
            # Despite the column name this is a 0-1 fraction, not a percentage.
            'percent_duplicate_dna': _adjacent_repeat_fraction(dna_str),
        }

    # Table summarising the genes: ids move from the index into an 'ORF' column.
    gene_data_df = (
        pd.DataFrame.from_dict(all_genes_dict, orient='index')
        .reset_index()
        .rename(columns={'index': 'ORF'})
    )
    return gene_data_df

In [7]:
def main():
    """Load the gene records, derive their features and save them as CSV.

    The table is written to '../inputs/bio_features.csv', resolved relative
    to the current working directory.
    """
    features = generate_gene_features(import_all_genes())
    features.to_csv(r'../inputs/bio_features.csv')


main()
