In [15]:
from Bio import SeqIO
import os
import subprocess
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio import SubsMat

In [2]:
# Parse the single entry stored in the GenBank-format file and echo its
# identifying fields followed by the sequence, one value per line.
record = SeqIO.read("P17102.gbk", "genbank")
for _field in ("id", "description", "name", "seq"):
    print(getattr(record, _field))

P17102.1
RecName: Full=Protein X; AltName: Full=HBx; AltName: Full=Peptide X; AltName: Full=pX
X_HBVA4
MATRLCCQLDPSRDVLCLRPVGAESRGRPLSGPLGTLSSPSPSAVPADHGAHLSLRGLPVCAFSSAGPCALRFTSARCMETTVNAHQILPKVLHKRTLGLPAMSTTDLEAYFKDCVFKDWEELGEEIRLKVFVLGGCRHKLVCAPAPCNFFTSA


In [3]:
# Parse the P17102 entry again, this time from its FASTA file, and show the
# header line followed by the sequence.
record = SeqIO.read("P17102.fasta", format="fasta")
print("\n".join([record.description, str(record.seq)]))

sp|P17102|X_HBVA4 Protein X OS=Hepatitis B virus genotype A2 subtype adw2 (isolate Germany/991/1990) GN=X PE=3 SV=1
MATRLCCQLDPSRDVLCLRPVGAESRGRPLSGPLGTLSSPSPSAVPADHGAHLSLRGLPVCAFSSAGPCALRFTSARCMETTVNAHQILPKVLHKRTLGLPAMSTTDLEAYFKDCVFKDWEELGEEIRLKVFVLGGCRHKLVCAPAPCNFFTSA


In [5]:
# Parse every file in the "LHBs" directory as a single Swiss-Prot entry and
# collect the resulting records in `records`.
#
# - os.listdir returns names in arbitrary order, so they are sorted to make
#   the record order (and therefore the FASTA file written later) reproducible.
# - Each file is opened in a `with` block so its handle is closed as soon as
#   the entry has been parsed instead of being leaked.
records = []
for filename in sorted(os.listdir("LHBs")):
    with open(os.path.join("LHBs", filename)) as handle:
        record = SeqIO.read(handle, "swiss")
    records.append(record)

In [7]:
# Write all collected records to one FASTA file. SeqIO.write returns the
# number of records it wrote; keep that count, check that nothing was dropped,
# and leave it as the cell's last expression so it is displayed.
n_written = SeqIO.write(records, "LHBs_variants.fasta", "fasta")
assert n_written == len(records), "not every record was written to LHBs_variants.fasta"
n_written

44

In [9]:
# Run ClustalW on the combined FASTA file. check_call returns 0 on success
# (same displayed value as subprocess.call) but raises CalledProcessError on a
# non-zero exit status, so a failed alignment cannot go unnoticed before the
# alignment file is read in a later cell.
subprocess.check_call(["clustalw", "LHBs_variants.fasta"])

0

In [11]:
# Load the Clustal-format alignment file, then report the alignment length
# and show its first row.
# The file is opened in a `with` block so the handle is closed after parsing.
with open("LHBs_variants.aln") as aln_handle:
    alignment = AlignIO.read(aln_handle, "clustal")
print("Alignment length %i" % alignment.get_alignment_length())
# The original `print(print alignment[0])` was a SyntaxError; print the row once.
print(alignment[0])

In [13]:
# Wrap the alignment in a SummaryInfo helper (reused by later cells) and ask
# it for its "dumb" consensus with default settings, then display the result.
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
print(str(consensus))

MGGXSSXXRXGMGXNLSVPNPLGFFPDHQLDPAFXANSXNPDWDFNPXKDXWPXANXVGXGAFGPGFTPPHGGLLGWSPQAQGXLTTXPAXPPPASTNRQSGRQPTPXSPPLRDXHPQAMQWNSTXFHQXLXDPRVRGLYFPAGGSSSGTVNPXPXXASXISSIXSXTGDPAXNMENITSGXLGPLLVLQAGFFLLTXILTIPQSLDSWWTSLNFLGGXPXCXGQNSQSPTSNHSPTSCPPXCPGYRWMCLRRFIIFLFILLLCLIFLLVLLDYQGMLPVCPLXPGSXTTSTGPCXTCTTXAQGTSMFPSCCCTKPXDGNCTCIPIPSSWAFXKXLWEWASXRFSWLSLLVPFVQWFVGLSPTVWLSVIWMMWYWGPSLYXILSPFXPLLPIFFCLWVYI


In [14]:
# Build the replacement dictionary from the alignment summary and look up the
# values stored for two residue pairs as a spot check.
replace_info = summary_align.replacement_dictionary()
for _pair in (("A", "G"), ("A", "K")):
    print(replace_info[_pair])

957.0
10.0


In [16]:
# Turn the replacement dictionary into a SubsMat matrix, derive a log-odds
# matrix from it, and print the full (square) matrix.
my_arm = SubsMat.SeqMat(data=replace_info)
my_lom = SubsMat.make_log_odds_matrix(acc_rep_mat=my_arm)
my_lom.print_full_mat()

   -   A   C   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   T   V   W   Y
-   5  -1  ND  ND   0  ND   0  -2  ND   2  -1   1  ND   0  -1   1   0  -1  -2   0  -2
A  -1   4  ND  -2  -1  ND  -1  -2  -4  -5  -3  -4  -3  -3  -4  -5  -2   0   1  -8  ND
C  ND  ND   4  ND  ND   0  -5  ND  ND  ND  -7  ND  ND  ND  ND  ND  -3  ND  ND  ND   0
D  ND  -2  ND   5   1  ND  -2  -4  -4  -4  ND  -2  -1 -10  -4  -6  -6  -4  -3  ND  ND
E   0  -1  ND   1   6  ND   0  -3  ND  -2  ND   0  -1  ND  -1  -2  -3  -5  -4  ND  ND
F  ND  ND   0  ND  ND   3  ND  -2  -6  ND  -1  ND  ND  ND  ND  ND  -3  ND  -7  ND   0
G   0  -1  -5  -2   0  ND   3  ND  ND  -2  ND  -3  -5  ND  -6  -1  -5  -6  -2  -5  ND
H  -2  -2  ND  -4  -3  -2  ND   5  -3  -3  -2  ND   0  -2  -1  -4  -4  -1  -5  -7  -7
I  ND  -4  ND  -4  ND  -6  ND  -3   4  -6  -1   0  -2  -8  ND  ND  -6   0  -2  ND  ND
K   2  -5  ND  -4  -2  ND  -2  -3  -6   5  -4  -2  -2  -4   0   2  -5   0  -8  ND  ND
L  -1  -3  -7  ND  ND  -1  ND  -2  -1  -4   2  -2  ND  