In [10]:
import pandas as pd
from Bio import SeqIO
from Bio import Restriction
from ntaxon.nucleotide import Sequence as Seq
from Bio import AlignIO

# Get accessions from FASTA

In [4]:
# Read every record of the FASTA file and tabulate its id alongside the
# sequence rendered as a plain string.
fasta = "data/isolate_sequences_corrected.fasta"
seq_list = [
    [rec.id, str(rec.seq)]
    for rec in SeqIO.parse(fasta, "fasta")
]
accessions = pd.DataFrame(seq_list, columns=['sample', 'sequence'])
accessions.head()

Unnamed: 0,sample,sequence
0,MN493874.1,acatgcaagtcgaacggcacggagagcttgctctcggtggcgagtg...
1,MN493875.1,acatgcaagtcgaacgaactccggagttagtggcggacgggtgagt...
2,MN493876.1,acatgcaagtcgaacggcacggagagcttgctctcggtggcgagtg...
3,MN493877.1,ccatgcaagtcgagcgccccaaggggagcggcagacgggtgagtaa...
4,MN493878.1,ccatgcaagtcgagcgccccaaggggagcggcagacgggtgagtaa...


# Get Accessions from Alignment

In [14]:
# Load the FASTA-formatted alignment and print its text summary.
alignment_path = "data/alignments/cropped_aln.fasta"
alignment = AlignIO.read(alignment_path, "fasta")
print(alignment)

Alignment with 72 rows and 1335 columns
GGTGGCGAGTGGCGGACGGGTGAGGAATACATCGGAATCTACTT...ACC MN493874.1
CGGAGTTAGTGGCGGACGGGTGAGTAACACGTGGGAACGTGCCT...ACC MN493875.1
GGTGGCGAGTGGCGGACGGGTGAGGAATACATCGGAATCTACTC...ACC MN493876.1
CAAGGGGAGCGGCAGACGGGTGAGTAACGCGTGGGAACGTACCT...ACC MN493877.1
CAAGGGGAGCGGCAGACGGGTGAGTAACGCGTGGGAACGTACCT...ACC MN493878.1
CGACGTTAGCGGCGGACGGGTGAGTAACACGTGGCAACCTACCT...ACC MN493879.1
GGTGGCGAGTGGCGGACGGGGGAGGAATACATCGGAATCTACCT...ACC MN493880.1
GGCGGCGAGTGGCGGACGGGTGAGTAATATATCGGAACGTGCCC...ACC MN493881.1
GGCGGCGAGTGGCGGACGGGTGAGTAATATATCGGAACGTGCCC...ACC MN493882.1
TGATGTTAGCGGCGGATGGGTGAGTAACACGTGGCAACCTGCCC...ACC MN493883.1
GATGGTTAGCGGCGGACGGGTGAGTAACACGTAGCAACCTGCCC...ACC MN493884.1
TGATGTTAGCGGCGGACGGGTGAGTAACACGTGGTAACCTGCCT...ACC MN493885.1
GGCGGCGAGTGGCGGACGGGTGAGTAATATATCGGAACGTGCCC...ACC MN493886.1
GGCGGCGAGTGGCGGACGGGTGAGTAATATATCGGAACGTGCCC...ACC MN493887.1
TGATATTAGCGGCGGACGGGTGAGTAACACGTGGCAACCTGCCC...ACC MN493888.1
TGATATTAGCGGCGGACGGGTGAGTAACAC

In [16]:
# Rebuild the accessions table from the alignment rows, stripping the '-' gap
# characters so each sequence is contiguous again.
# The gaps are removed with str.replace on the plain string rather than
# Seq.ungap('-'): ungap is a deprecated Biopython API that newer releases no
# longer provide, while str.replace gives the same result on every version.
seq_list = [
    [record.id, str(record.seq).replace('-', '')]
    for record in alignment
]
accessions = pd.DataFrame(data=seq_list, columns=['sample', 'sequence'])
accessions.head()

Unnamed: 0,sample,sequence
0,MN493874.1,GGTGGCGAGTGGCGGACGGGTGAGGAATACATCGGAATCTACTTTT...
1,MN493875.1,CGGAGTTAGTGGCGGACGGGTGAGTAACACGTGGGAACGTGCCTTT...
2,MN493876.1,GGTGGCGAGTGGCGGACGGGTGAGGAATACATCGGAATCTACTCTG...
3,MN493877.1,CAAGGGGAGCGGCAGACGGGTGAGTAACGCGTGGGAACGTACCTTT...
4,MN493878.1,CAAGGGGAGCGGCAGACGGGTGAGTAACGCGTGGGAACGTACCTTT...


In [17]:
# Search each accession for MspI restriction sites and emit one
# (sample, restriction_loc) row per hit.
# NOTE(review): assumes restriction_search returns an iterable of site
# positions, as the displayed output suggests -- confirm against ntaxon.
#
# Rows are gathered in a list and the frame is built once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it per row copies the whole frame each time (quadratic in the number of hits).
r_map_records = [
    {'sample': sample_id, 'restriction_loc': site}
    for sample_id, sequence in zip(accessions['sample'], accessions['sequence'])
    for site in Seq(sequence).restriction_search(Restriction.MspI)
]
# Explicit columns keep the schema intact even when no site is found at all.
r_map_df = pd.DataFrame(r_map_records, columns=['sample', 'restriction_loc'])

r_map_df.head()

Unnamed: 0,sample,restriction_loc
0,MN493874.1,377
1,MN493874.1,410
2,MN493874.1,1067
3,MN493874.1,1078
4,MN493874.1,1169
