In [71]:
from Bio import SeqIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [72]:
# Initialize an empty DataFrame
df_fasta_file = pd.DataFrame(columns=["Identifier", "Gene Name", "Description", "Protein Name", "Length", "Sequence"])

# Read in the fasta file
fasta_file = "BacillusClausiiTransmembraneProteins.fasta"

# Read in the fasta file and store the data in the dataframe
for record in SeqIO.parse(fasta_file, "fasta"):
    identifier = record.id
    description = record.description
    gene_name = description.split("GN=")[1].split(" ")[0]  # Define gene_name after description
    protein_name = description.split(" ")[1]
    length = len(record.seq)
    sequence = str(record.seq)
    
    # Append the row to the dataframe
    df_fasta_file.loc[len(df_fasta_file)] = [identifier, gene_name, description, protein_name, length, sequence]

# Display the DataFrame
df


Unnamed: 0,Identifier,Gene Name,Description,Protein Name,Length,Sequence
0,sp|Q5WB61|MNTP_SHOC1,mntP,sp|Q5WB61|MNTP_SHOC1 Putative manganese efflux...,Putative,180,MHEFVTICIMAAALGMDAFSVALGMGMLKLSGKQIFRIGLTIGLFH...
1,sp|Q5WB72|ATP6_SHOC1,atpB,sp|Q5WB72|ATP6_SHOC1 ATP synthase subunit a OS...,ATP,238,MPEHHQYQFEFMGLLFNGTTMITTTIAMAIVVIITVIGCRKLAMRP...
2,sp|Q5WB73|ATPL_SHOC1,atpE,sp|Q5WB73|ATPL_SHOC1 ATP synthase subunit c OS...,ATP,71,MTELAIGIAAGLAAIGGAIGVAIIVKAVIEGTARQPEQRGTLQTLM...
3,sp|Q5WB74|ATPF_SHOC1,atpF,sp|Q5WB74|ATPF_SHOC1 ATP synthase subunit b OS...,ATP,161,MVIEWGTALYQLLAFAVLLLILSKFALKPLLGVMQKRQDMINEQID...
4,sp|Q5WCX5|UPPP2_SHOC1,uppP2,sp|Q5WCX5|UPPP2_SHOC1 Undecaprenyl-diphosphata...,Undecaprenyl-diphosphatase,275,MDVWEWVVAAILGLVEGLTEYAPVSSTGHMIIVDDLWLKSSELVGS...
5,sp|Q5WDH2|LGT_SHOC1,lgt,sp|Q5WDH2|LGT_SHOC1 Phosphatidylglycerol--prol...,Phosphatidylglycerol--prolipoprotein,277,MEEQIEPIDRVFVQLGPIAIYWYAVLILLGVAVGYFMARRESVKRG...
6,sp|Q5WDX4|COXX_SHOC1,ctaB,sp|Q5WDX4|COXX_SHOC1 Protoheme IX farnesyltran...,Protoheme,311,MRTEKIDKSIHNASLATPKQAFSQVLSETLKTGIIKSNLLAMAAGL...
7,sp|Q5WDX6|Y2900_SHOC1,ABC2900,sp|Q5WDX6|Y2900_SHOC1 UPF0344 protein ABC2900 ...,UPF0344,126,MNSGGFIQENFSIFQASHEGSWAILAILFLVAYFLFRGGKSKAGTI...
8,sp|Q5WDX9|TCYP_SHOC1,ABC2897,sp|Q5WDX9|TCYP_SHOC1 L-cystine uptake protein ...,L-cystine,465,MDLFLTLLIIAIVLAVAGLLFYMQKKHVSFSIRVLLALGAGVVYGL...
9,sp|Q5WEF9|Y2716_SHOC1,ABC2716,sp|Q5WEF9|Y2716_SHOC1 UPF0756 membrane protein...,UPF0756,157,MLSQSTLFLLLLMAIALIAKNQSLIIAISVLLLIKWTGLGDKVFPL...


In [73]:
# Initialize an empty DataFrame
df_genbank_file = pd.DataFrame(columns=["Identifier", "Gene Name", "Description", "Protein Name", "Length", "Sequence"])

# Read in the data from the genbank file
genbank_file = "Bacillus clausii.gb" 
for record in SeqIO.parse(genbank_file, "genbank"):
    for feature in record.features:
        if feature.type == "CDS":
            identifier = feature.qualifiers["locus_tag"]
            if 'gene' in feature.qualifiers: #check if gene name is available - there are some CDS features without gene name
                gene_name = feature.qualifiers['gene'][0]
            else:
                gene_name = 'Gene name missing, locus tag: ' + feature.qualifiers['locus_tag'][0]
            description = feature.qualifiers
            if 'product' in feature.qualifiers:
                protein_name = feature.qualifiers['product'][0]
            else:
                protein_name = 'Product name missing'
            if 'translation' in feature.qualifiers:
                sequence = feature.qualifiers["translation"][0]
                length = len(feature.qualifiers["translation"][0])    
            else:
                sequence = 'Translation missing'
                length = 'Length missing'
            
            # Append the row to the dataframe
            df_genbank_file.loc[len(df_genbank_file)] = [identifier, gene_name, description, protein_name, length, sequence]

df_genbank_file

Unnamed: 0,Identifier,Gene Name,Description,Protein Name,Length,Sequence
0,[ABC0001],dnaA,"{'gene': ['dnaA'], 'locus_tag': ['ABC0001'], '...",chromosome replication initiator protein DnaA,451,MENIDDLWNKVLEEMKKKVSKPSYETWLRATKANALQNNDTIIVTA...
1,[ABC0002],dnaN,"{'gene': ['dnaN'], 'locus_tag': ['ABC0002'], '...",DNA-directed DNA polymerase III beta subunit DnaN,380,MHVIIERNRMVHDVQHVAKAVSSRTTIPILTGIKLVADANGLTLTG...
2,[ABC0003],"Gene name missing, locus tag: ABC0003","{'locus_tag': ['ABC0003'], 'codon_start': ['1'...",conserved hypothetical protein,72,MEQIRIETEYITLGQLLKEIGAIDTGGMAKWYLSEHTPRVNSEEEN...
3,[ABC0004],recF,"{'gene': ['recF'], 'locus_tag': ['ABC0004'], '...",DNA replication and repair protein RecF,372,MIIHTLELSSYRNYSKTAVVFGEKINVFVGENAQGKTNLLEAIYVV...
4,[ABC0005],"Gene name missing, locus tag: ABC0005","{'locus_tag': ['ABC0005'], 'codon_start': ['1'...",conserved hypothetical protein,98,MYIHIGGEVILPAIEIIAILPYAEGELAKDTAVFLHEWDTKHDCKK...
...,...,...,...,...,...,...
4103,[ABC4117],thdF,"{'gene': ['thdF'], 'locus_tag': ['ABC4117'], '...",tRNA modification GTPase,458,MEMDTIAAISTALGEGAIGIVRLSGDQAIAIGDKLFKGTKRLEDTP...
4104,[ABC4118],jag,"{'gene': ['jag'], 'locus_tag': ['ABC4118'], 'c...",spoIIIJ-associated protein,211,MAHTFKGRTVEEAVANAVQHLGTTEEQLVYEVIEQPQKGFFGLFGG...
4105,[ABC4119],spoIIIJ,"{'gene': ['spoIIIJ'], 'locus_tag': ['ABC4119']...",stage III sporulation protein J,284,MKKTGWLLVLASMLLFLSGCFSVNEPVTAESEGIWNSFFVYPLSRL...
4106,[ABC4120],rnpA,"{'gene': ['rnpA'], 'locus_tag': ['ABC4120'], '...",ribonuclease P protein component,122,MKKEQRIKKNREFSAVFKKGSSMANRQFVLYVLPKEGQDRLRLGLS...


In [74]:
# Compare the two dataframes, df_fasta_file and df_genbank_file, to see if they share some common proteins
df_common = pd.merge(df_fasta_file, df_genbank_file, on="Gene Name")
df_common


Unnamed: 0,Identifier_x,Gene Name,Description_x,Protein Name_x,Length_x,Sequence_x,Identifier_y,Description_y,Protein Name_y,Length_y,Sequence_y
0,sp|Q5WB72|ATP6_SHOC1,atpB,sp|Q5WB72|ATP6_SHOC1 ATP synthase subunit a OS...,ATP,238,MPEHHQYQFEFMGLLFNGTTMITTTIAMAIVVIITVIGCRKLAMRP...,[ABC3857],"{'gene': ['atpB'], 'locus_tag': ['ABC3857'], '...",F0F1-type ATP synthase A chain,238,MPEHHQYQFEFMGLLFNGTTMITTTIAMAIVVIITVIGCRKLAMRP...
1,sp|Q5WB73|ATPL_SHOC1,atpE,sp|Q5WB73|ATPL_SHOC1 ATP synthase subunit c OS...,ATP,71,MTELAIGIAAGLAAIGGAIGVAIIVKAVIEGTARQPEQRGTLQTLM...,[ABC3856],"{'gene': ['atpE'], 'locus_tag': ['ABC3856'], '...",F0F1-type ATP synthase C chain,71,MTELAIGIAAGLAAIGGAIGVAIIVKAVIEGTARQPEQRGTLQTLM...
2,sp|Q5WB74|ATPF_SHOC1,atpF,sp|Q5WB74|ATPF_SHOC1 ATP synthase subunit b OS...,ATP,161,MVIEWGTALYQLLAFAVLLLILSKFALKPLLGVMQKRQDMINEQID...,[ABC3855],"{'gene': ['atpF'], 'locus_tag': ['ABC3855'], '...",F0F1-type ATP synthase B chain,161,MVIEWGTALYQLLAFAVLLLILSKFALKPLLGVMQKRQDMINEQID...
3,sp|Q5WDH2|LGT_SHOC1,lgt,sp|Q5WDH2|LGT_SHOC1 Phosphatidylglycerol--prol...,Phosphatidylglycerol--prolipoprotein,277,MEEQIEPIDRVFVQLGPIAIYWYAVLILLGVAVGYFMARRESVKRG...,[ABC3054],"{'gene': ['lgt'], 'locus_tag': ['ABC3054'], 'E...",prolipoprotein diacylglyceryl transferase,277,MEEQIEPIDRVFVQLGPIAIYWYAVLILLGVAVGYFMARRESVKRG...
4,sp|Q5WDX4|COXX_SHOC1,ctaB,sp|Q5WDX4|COXX_SHOC1 Protoheme IX farnesyltran...,Protoheme,311,MRTEKIDKSIHNASLATPKQAFSQVLSETLKTGIIKSNLLAMAAGL...,[ABC2902],"{'gene': ['ctaB'], 'locus_tag': ['ABC2902'], '...",cytochrome oxidase assembly factor,311,MRTEKIDKSIHNASLATPKQAFSQVLSETLKTGIIKSNLLAMAAGL...
5,sp|Q5WFD1|CTAA_SHOC1,ctaA,sp|Q5WFD1|CTAA_SHOC1 Heme A synthase OS=Shouch...,Heme,301,MHKGLKRLGVITSLGVLLVLIQGALVTNTGSGEGCGQTWPLCFGQV...,[ABC2394],"{'gene': ['ctaA'], 'locus_tag': ['ABC2394'], '...",cytochrome caa3 oxidase controlling protein,301,MHKGLKRLGVITSLGVLLVLIQGALVTNTGSGEGCGQTWPLCFGQV...
6,sp|Q5WFG7|MRAY_SHOC1,mraY,sp|Q5WFG7|MRAY_SHOC1 Phospho-N-acetylmuramoyl-...,Phospho-N-acetylmuramoyl-pentapeptide-transferase,325,MEEWTLLFVLILSFAAAVIMSPLFIPFLRKLKFGQSIREEGPKSHQ...,[ABC2358],"{'gene': ['mraY'], 'locus_tag': ['ABC2358'], '...",phospho-N-acetylmuramoyl-pentapeptide- transfe...,325,MEEWTLLFVLILSFAAAVIMSPLFIPFLRKLKFGQSIREEGPKSHQ...
7,sp|Q5WGY8|RESA_SHOC1,resA,sp|Q5WGY8|RESA_SHOC1 Probable thiol-disulfide ...,Probable,177,MGKSKKKRSIIRFTVLFAIVCAIGYTIYANAASEQGAVKVGEPATN...,[ABC1832],"{'gene': ['resA'], 'locus_tag': ['ABC1832'], '...",thiol-disulfide oxidoreductase ResA,177,MGKSKKKRSIIRFTVLFAIVCAIGYTIYANAASEQGAVKVGEPATN...
