In [1]:
#Check out this website for biopython alignment modules and commands to implement:
#http://mattshirley.com/uploads/2014/04/sequence_alignment_biopython.slides.html#/
import Bio
import Bio.SeqIO
import Bio.pairwise2 as pw2
from Bio.pairwise2 import format_alignment
from Bio.Align.Applications import MuscleCommandline
import glob
from Bio import AlignIO

In [83]:
Gaff = "ATGACTCCTCTTATTTACTCCACCCTA" #Gaffinis nd2 sequence excerpt
Pmex = "ATGGCTCCCCTAATCTACTCCGCCCTC" #Pmexicana nd2 sequence excerpt
Xmac = "ATGGCTCCCTTTGTTTACTCCACCCTT" #Xmac nd2 seq excerpt

In [84]:
print("len(Gaff) =",len(Gaff))
print("len(Pmex) =",len(Pmex))
print("len(Xmac) =", len(Xmac))

len(Gaff) = 27
len(Pmex) = 27
len(Xmac) = 27


In [85]:
Gaff_Xmac_align = pw2.align.localms(Gaff, Xmac, 2, -1, -1, -0.5)
Pmex_Xmac_align = pw2.align.localms(Pmex, Xmac, 2, -1, -1, -0.5)

In [86]:
for i in Gaff_Xmac_align:
    print(format_alignment(*i))

ATGACTCCTCTTA-TTTACTCCACCCTA
|||.|||| |||. |||||||||||||
ATGGCTCC-CTTTGTTTACTCCACCCTT
  Score=42

ATGACTCCTCTT-ATTTACTCCACCCTA
|||.|||| ||| .|||||||||||||
ATGGCTCC-CTTTGTTTACTCCACCCTT
  Score=42

ATGACTCCTCT-TATTTACTCCACCCTA
|||.|||| || |.|||||||||||||
ATGGCTCC-CTTTGTTTACTCCACCCTT
  Score=42

ATGACTCCTC-TTATTTACTCCACCCTA
|||.|||| | ||.|||||||||||||
ATGGCTCC-CTTTGTTTACTCCACCCTT
  Score=42

ATGACTCC-TCTTATTTACTCCACCCTA
|||.|||| | ||.|||||||||||||
ATGGCTCCCT-TTGTTTACTCCACCCTT
  Score=42

ATGACTC-CTCTTATTTACTCCACCCTA
|||.||| || ||.|||||||||||||
ATGGCTCCCT-TTGTTTACTCCACCCTT
  Score=42

ATGACT-CCTCTTATTTACTCCACCCTA
|||.|| ||| ||.|||||||||||||
ATGGCTCCCT-TTGTTTACTCCACCCTT
  Score=42



In [87]:
for j in Pmex_Xmac_align:
    print(format_alignment(*j))

ATGGCTCCCCTAATCTACTCCGCCCTC
|||||||||.|..|.||||||.||||
ATGGCTCCCTTTGTTTACTCCACCCTT
  Score=37



In [88]:
# Read the README at this website for more info on SeqIO: https://github.com/peterjc/biopython_workshop

#Find length of input sequences with SeqIO.parse()

glob.os.chdir("/Users/johncoffin/Downloads/") #set working directory

from Bio import SeqIO
filename = "gaff_nd2.fasta"
for record in SeqIO.parse(filename, "fasta"):
    print("Record " + record.id + ", length " + str(len(record.seq)))

Record NC_004388.1:4027-5073, length 1047


In [89]:
record = SeqIO.read(filename, "fasta")
print("Record ID is",record.id, "and it is", len(record), "bases long")



Record ID is NC_004388.1:4027-5073 and it is 1047 bases long


In [90]:
#Translate sequence
glob.os.chdir("/Users/johncoffin/Downloads/")
record = SeqIO.read("pmex_nd2.gb", "genbank")
cds = record.features[2]
print("locus tag:",cds.qualifiers["locus_tag"])
print(cds.qualifiers["translation"])
print(record.features[1])

locus tag: ['CGV34_mgp12']
['MAPLIYSALIISLGLGTTMTFASTHWYLAWMGIEINTLAIIPLMAQNHIPRAIEATTKYFFVQATASATLLFAGISNAFLTGQWDITYTPYTLTSTLITLALAMKIGLAPLHSWMPEVMQGLNLLTGLILSTWQKLAPLYLIYQIQPNNPNIFIALGLMSIIVGGWGGFNQVQLRKILAYSSIAHLGWMILILSFSPPLALLTIIIYILMTFSLFSSFMLTRTTHINSLSTTWAKIPILTISTPLILLSLGGLPPLTGFMPKWMILQELTKQSLCPLATMAALSSLFSLYFYLRLSYAMTLTMPPNNPAGTLPWRLNPRHNTLPLALTTTSTICLLPMTPAIMSLMPF']
type: gene
location: [0:1045](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:33409441']
    Key: gene, Value: ['ND2']
    Key: locus_tag, Value: ['CGV34_mgp12']



In [91]:
glob.os.chdir("/Users/johncoffin/Desktop/Coding/project/")

from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
import phylopandas as ph #had to download phylopandas with pip install phylopandas and pandas_flavor and dendropy

align = MultipleSeqAlignment([
             SeqRecord(Seq(Gaff, generic_dna), id="Gaff"),
             SeqRecord(Seq(Pmex, generic_dna), id="Pmex"),
             SeqRecord(Seq(Xmac, generic_dna), id="Xmac"),
         ])

In [92]:
print(align)

DNAAlphabet() alignment with 3 rows and 27 columns
ATGACTCCTCTTATTTACTCCACCCTA Gaff
ATGGCTCCCCTAATCTACTCCGCCCTC Pmex
ATGGCTCCCTTTGTTTACTCCACCCTT Xmac


In [93]:
AlignIO.write(align, "align.phy", "phylip")

1

In [94]:
alignment = ph.read_phylip("align.phy")

#test_align = Gaff_Xmac_align[0]
#Gaffinis = test_align[0]
#Pmexicana = test_align[1]
#print(Gaffinis)
#print(Pmexicana)

In [95]:
alignment

Unnamed: 0,id,sequence,description,label,uid
0,Gaff,ATGACTCCTCTTATTTACTCCACCCTA,Gaff,Gaff,cVjUXce9GN
1,Pmex,ATGGCTCCCCTAATCTACTCCGCCCTC,Pmex,Pmex,mZb2cBsqGC
2,Xmac,ATGGCTCCCTTTGTTTACTCCACCCTT,Xmac,Xmac,TenzCIet7T


In [96]:
Gaffinis = str(alignment.sequence[0:1])
print(Gaffinis)
affinis = Seq(Gaffinis)
Gaff_prot = affinis.translate()

Ref = alignment.sequence[2:3]
print(Ref)


0    ATGACTCCTCTTATTTACTCCACCCTA
Name: sequence, dtype: object


TranslationError: Codon '0  ' is invalid

In [97]:
Gaff_prot

NameError: name 'Gaff_prot' is not defined

In [76]:
matches = []
variants = []
#for nt in Gaffinis:
 #   for nuc in Ref:
  #      print(nt, nuc)
   #     if nt == nuc:
    #        matches += nt
    #    else:
     #       variants += nt
        
        
for i in range(1, len(str(Gaffinis))):
    #print(i)
    for j in range(1, len(str(Ref))):
        print(i)
        #if Gaffinis[i] == Ref[j]:
           # matches += Ref[j]
       # else:
           # variants += Ref[j]
        
        
        
        
#    for j in Ref:
 #       print(i[0])
  #      print(j[])
   #     if i == j:
     #       matches += i
      #  else:
       #     variants += j

#Gaffinis


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


In [69]:
x = str(Gaffinis)
x
len(x)
range(1, len(Gaffinis))

range(1, 1)

In [54]:
variants

['A',
 'T',
 'G',
 'A',
 'C',
 'T',
 'C',
 'C',
 'T',
 'C',
 'T',
 'T',
 'A',
 'T',
 'T',
 'T',
 'A',
 'C',
 'T',
 'C',
 'C',
 'A',
 'C',
 'C',
 'C',
 'T',
 'A',
 'A',
 'T']