# Reading a Single Sequence Fasta File

In [8]:
# import required libraries
from Bio import SeqIO
import os
import pandas as pd

In [2]:
DATA_PATH = r'C:\Users\user\Downloads\Bioinformatics'

seq_object = SeqIO.read(os.path.join(DATA_PATH, 'NG_047557.1.fna'), 'fasta')

In [3]:
type(seq_object)

Bio.SeqRecord.SeqRecord

# Exploring the Fasta File

In [5]:
# explore the data
seq_id = seq_object.id # check the id of the sequence

print(seq_id)

NG_047557.1


In [6]:
# check the description
seq_description = seq_object.description # check the description of the fasta file

print(seq_description)

NG_047557.1 Staphylococcus aureus N315 bleO gene for bleomycin binding protein, complete CDS


In [7]:
sequence = seq_object.seq  # printing the sequence of the gene
seq_length = len(sequence) #printing the length of the sequence 

 
print(sequence)
print(seq_length)

CGGGCCATTTTGCGTAATAAGAAAAAGGATTAATTATGAGCGAATTGAATTAATAATAAGGTAATAGATTTACATTAGAAAATGAAAGGGGATTTTATGCGTGAGAATGTTACAGTCTATCCCGGCATTGCCAGTCGGGGATATTAAAAAGAGTATAGGTTTTTATTGCGATAAACTAGGTTTCACTTTGGTTCACCATGAAGATGGATTCGCAGTTCTAATGTGTAATGAGGTTCGGATTCATCTATGGGAGGCAAGTGATGAAGGCTGGCGCTCTCGTAGTAATGATTCACCGGTTTGTACAGGTGCGGAGTCGTTTATTGCTGGTACTGCTAGTTGCCGCATTGAAGTAGAGGGAATTGATGAATTATATCAACATATTAAGCCTTTGGGCATTTTGCACCCCAATACATCATTAAAAGATCAGTGGTGGGATGAACGAGACTTTGCAGTAATTGATCCCGACAACAATTTGATTAGCTTTTTTCAACAAATAAAAAGCTAAAATCTATTATTAATCTGTTCAGCAATCGGGCGCGATTGCTGAATAAAAGATACGAGAGACCTCTCTTGTATCTTTTTTATTTTGAGTGGTTTTGTCCGTT
605


# Reading a Multi-Sequence Fasta File

In [9]:
seq_objects = SeqIO.parse(os.path.join(DATA_PATH, 'multi-fasta.fa'), 'fasta')

sequences = [seq for seq in seq_objects]

print(len(sequences))

5


In [12]:
for i in range(len(sequences)):
    record = sequences[i]
    print(record.id)
    print(record.description)
    print(len(record.seq))

CP029082.1
CP029082.1 Staphylococcus aureus strain AR465 chromosome, complete genome
2911287
CP030138.1
CP030138.1 Staphylococcus aureus strain M48 chromosome, complete genome
3050015
CP039157.1
CP039157.1 Staphylococcus aureus strain P10 chromosome, complete genome
2970728
CP039167.1
CP039167.1 Staphylococcus aureus strain R50 chromosome, complete genome
2866643
CP013957.1
CP013957.1 Staphylococcus aureus strain V521, complete genome
3085555


In [14]:
# create a file to keep the ids and the length of the sequences
for i in range(len(sequences)):
    record = sequences[i]
    seq_ids = [id for id in record.id]
    seq_lengths = [len(seq) for seq in record.seq]

In [15]:
print(seq_ids)
print(seq_lengths)

['C', 'P', '0', '1', '3', '9', '5', '7', '.', '1']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 