In [None]:
!pip install biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from Bio import SeqIO

## How to import FASTA files from NCBI

Visit the NCBI Nucleotide database at the following link to download the sequence in FASTA format:

https://www.ncbi.nlm.nih.gov/nuccore

---



In [None]:
# Load the FASTA record stored in f1.fasta into `record`.
record = SeqIO.read("/content/f1.fasta", format="fasta")

In [None]:
# Display the sequence of the record loaded above.
record.seq

Seq('CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTA...TAG')

### Reverse Complement

In [None]:
# Reverse-complement every record in the FASTA file; each new record gets an
# "rc_"-prefixed id and a fixed description.
records = [
    entry.reverse_complement(id="rc_" + entry.id, description="reverse complement")
    for entry in SeqIO.parse("/content/f1.fasta", "fasta")
]

In [None]:
# Write the reverse-complemented records to rev_comp.fasta; the displayed value
# is the number of records written.
SeqIO.write(records, handle="rev_comp.fasta", format="fasta")

1

In [None]:
# Read back the reverse-complement file written by SeqIO.write above
# (previously this re-read the original input, /content/f1.fasta, so the
# displayed sequence was not the reverse complement).
output_rev_comp = SeqIO.read("rev_comp.fasta", "fasta")

In [None]:
# Display the sequence held in output_rev_comp.
output_rev_comp.seq

Seq('CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTA...TAG')

### Finding promoter regions in a sequence requires the GenBank file of the same nucleotide record

In [None]:
def get_gene_feature_location(seq_record):
    """Return the first feature of type ``"gene"`` found in ``seq_record``.

    Args:
        seq_record: Any object exposing a ``features`` iterable whose items
            have a ``type`` attribute (for example a record parsed from a
            GenBank file).

    Returns:
        The first feature whose ``type`` equals ``"gene"``, or ``None`` when
        the record has no such feature.
    """
    # Search the record that was passed in rather than a notebook-level
    # global, so the function also works before any global record exists.
    return next(
        (feature for feature in seq_record.features if feature.type == "gene"),
        None,
    )



In [None]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Path of the GenBank file; reused by later cells through `in_gbk`.
in_gbk = "/content/gen1.gb"

# Parse the GenBank record and pick out its first "gene" feature.
genome_record = SeqIO.read(in_gbk, "genbank")
gene_feature = get_gene_feature_location(genome_record)

# Show the feature summary, then just its location.
print(gene_feature)
print(gene_feature.location)



type: gene
location: [<0:23](+)
qualifiers:
    Key: gene, Value: ['S']

[<0:23](+)


In [None]:
# Extract the promoter region (prom_len bases upstream of the gene) from the
# GenBank record and write it to file_out in FASTA-like form.
prom_len = 50  # number of bases upstream of the gene reported as promoter
file_out = "prom_out.fna"
prom_out = ""

GBrecord = next(SeqIO.parse(in_gbk, "genbank"))

# Gene coordinates on the plus strand (5' to 3', irrespective of the coding
# strand). Read through the public location API instead of the private
# `_start` / `_end` attributes.
my_start = int(gene_feature.location.start)
my_end = int(gene_feature.location.end)
# The strand that decides on which side the promoter lies is the gene's own
# strand, not the strand of the "source" feature.
gene_strand = gene_feature.location.strand
feat_loc = str(gene_feature.location)

# Bounds of the promoter window (variable names kept from the original; the
# window width is prom_len). The upstream bound is clamped at 0 because a
# negative slice start would wrap around to the end of the record.
start_1000 = max(0, my_start - prom_len)
end_1000 = my_end + prom_len  # slicing past the record end is simply truncated

for feature in GBrecord.features:
    if feature.type != "source":
        continue
    # The first db_xref of the source feature is used to label the output.
    db_extract = feature.qualifiers['db_xref'][0]
    if gene_strand == -1:
        # Minus-strand gene: the promoter lies after the gene end in
        # plus-strand coordinates and is reported reverse-complemented.
        my_prom = GBrecord[my_end:end_1000].reverse_complement()
        prom_out += "> Promoter rev_comp" + "___" + db_extract + "___" + feat_loc + "\n"
        prom_out += str(my_prom.seq) + "\n\n"
    elif gene_strand == 1:
        # Plus-strand gene: the promoter is the window just before the start.
        my_prom = GBrecord[start_1000:my_start]
        prom_out += "> Promoter" + "___" + db_extract + "___" + feat_loc + "\n"
        prom_out += str(my_prom.seq) + "\n\n"

# Show the collected promoter text once, for either strand.
print(prom_out)

# The context manager closes the output file even if the write fails.
with open(file_out, 'w') as file:
    file.write(prom_out)

> Promoter___taxon:9606___[<0:23](+)





### Translation: converting to protein

In [None]:
# Load the FASTA record that will be translated to protein.
tran_1 = SeqIO.read("/content/f1.fasta", format="fasta")

In [None]:
# Translate the record with translate()'s default arguments and display the
# resulting protein sequence.
trans_output = tran_1.translate()
trans_output.seq

Seq('LVSRRLCGSNLVDALFLLCERNGFYRPNDGIVDQCCINTCTTNQLLNYCN*')

### ORF: Open Reading Frame Extraction

In [None]:
# Translation table id passed to Seq.translate() in the ORF scan below.
# NOTE(review): 11 is the NCBI bacterial/archaeal/plant-plastid code, while the
# earlier translation cell uses the default table — confirm this is intended.
table = 11

In [None]:
# Minimum length of a translated segment for it to be reported by the ORF scan.
# NOTE(review): the record from f1.fasta is 153 nt long (at most 51 residues
# per frame), so with this threshold the scan prints nothing — confirm value.
min_pro_len = 160

In [None]:
# Scan the three reading frames of both strands and report every translated
# segment between stop codons that is at least min_pro_len residues long.
for strand, nuc in ((+1, record.seq), (-1, record.seq.reverse_complement())):
    for frame in range(3):
        # Trim the frame to a whole number of codons before translating.
        length = 3 * ((len(record) - frame) // 3)
        for pro in nuc[frame:frame + length].translate(table).split("*"):
            if len(pro) < min_pro_len:
                continue
            print(
                "%s...%s - length %i, strand %i, frame %i"
                % (pro[:30], pro[-3:], len(pro), strand, frame)
            )

### Alignment of two sequences


In [None]:
from Bio import AlignIO

# Read the FASTA file as an alignment and report its length (column count).
alignment = AlignIO.read("/content/f1.fasta", format="fasta")
print("Alignment length %i" % (alignment.get_alignment_length(),))

Alignment length 153


In [None]:
from Bio import pairwise2

In [None]:
from Bio import SeqIO

# Print id, sequence repr and length of every record; after each loop the loop
# variable holds the last record of the file.
# NOTE(review): both loops parse the same file, so seq_1 and seq_2 are the same
# sequence — confirm whether a second input file was intended.
for seq_record_1 in SeqIO.parse("/content/f1.fasta", "fasta"):
    print(seq_record_1.id, repr(seq_record_1.seq), len(seq_record_1), sep="\n")

for seq_record_2 in SeqIO.parse("/content/f1.fasta", "fasta"):
    print(seq_record_2.id, repr(seq_record_2.seq), len(seq_record_2), sep="\n")

# Keep the bare sequences for the pairwise alignment cells.
seq_1, seq_2 = seq_record_1.seq, seq_record_2.seq
print(seq_1)
print(seq_2)

AY823488.1
Seq('CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTA...TAG')
153
AY823488.1
Seq('CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTA...TAG')
153
CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG
CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG


In [None]:
# Global pairwise alignment of the two sequences with pairwise2's "xx" scoring
# (match = 1, no mismatch or gap penalties).
alignments = pairwise2.align.globalxx(seq_1, seq_2)


In [None]:
from Bio.pairwise2 import format_alignment

# Pretty-print the first alignment returned by globalxx.
first_alignment = alignments[0]
print(format_alignment(*first_alignment))

CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG
  Score=153



In [None]:
# Libraries for the PairwiseAligner-based alignment
from Bio import Align
from Bio.Seq import Seq

# Two short sample sequences.
# NOTE(review): seq11 and seq12 are defined but never aligned; the call below
# aligns seq_1 and seq_2 from the earlier cell — confirm which pair was meant.
seq11 = Seq("TGTGACTA")
seq12 = Seq("CATGGTCA")

# Aligner with its default settings
aligner = Align.PairwiseAligner()

# Align the two sequences loaded earlier in the notebook
alignments = aligner.align(seq_1, seq_2)

# Print every alignment the aligner produced
for alignment in alignments:
    print(alignment)


CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CTCGTCAGCCGGCGCCTGTGCGGCTCTAACTTAGTGGATGCGCTGTTCTTGCTATGTGAACGGAATGGCTTCTACAGACCCAACGACGGCATCGTGGACCAGTGCTGTATCAACACCTGCACAACGAACCAGCTGCTGAACTACTGCAATTAG

