forked from biopython/biopython
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tutorial.tex
239 lines (227 loc) · 14.4 KB
/
Tutorial.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
% This is the main LaTeX file which is used to produce the Biopython
% Tutorial documentation.
%
% If you just want to read the documentation, you can pick up ready-to-go
% copies in both pdf and html format from:
%
% http://biopython.org/DIST/docs/tutorial/Tutorial.html
% http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
%
% If you want to typeset the documentation, you'll need a standard TeX/LaTeX
% distribution (I use teTeX, which works great for me on Unix platforms).
% Additionally, you need HeVeA (or at least hevea.sty), which can be
% found at:
%
% http://pauillac.inria.fr/~maranget/hevea/index.html
%
% You will also need the pictures included in the document, some of
% which are UMLish diagrams created by Dia
% (http://www.lysator.liu.se/~alla/dia/dia.html).
% These diagrams are available from Biopython git in the original dia
% format, which you can easily save as .png format using Dia itself.
% They are also checked in as the png files, so if you make
% modifications to the original dia files, the png files should also be
% changed.
%
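% Once you're all set, you should be able to generate pdf by running the
% commands below (add the -shell-escape option if the minted package asks
% for it; minted calls the external Pygments program for syntax coloring):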
%
% pdflatex Tutorial.tex (to generate the first draft)
% pdflatex Tutorial.tex (to get the cross references right)
% pdflatex Tutorial.tex (to get the table of contents right)
%
% To generate the html, you'll need HeVeA installed. You should be
% able to just run:
%
% hevea -fix Tutorial.tex
%
% However, on older versions of hevea you may first need to remove the
% Tutorial.aux file generated by LaTeX, then run hevea twice to get
% the references right.
%
% If you want to typeset this and have problems, please get in touch via
% the mailing list or GitHub, and we'll try to get things resolved. We
% always love to have people interested in the documentation!
% ----------------------------------------------------------------------
% Preamble: document class, packages, and PDF-specific link settings.
% This one source file is processed both by pdflatex (PDF output) and by
% HeVeA (HTML output), so everything here has to be acceptable to both.
% ----------------------------------------------------------------------
\documentclass{report}
% url supplies \url, which the bibliography uses for its DOI links.
\usepackage{url}
\usepackage{fullpage}
% hevea.sty supplies the htmlonly, latexonly and rawhtml environments
% that the title page relies on.
\usepackage{hevea}
% graphicx supplies \includegraphics, used for the logo on the PDF title page.
\usepackage{graphicx}
% For syntax coloring of python, pycon, bash etc in pdflatex:
\usepackage{minted}
% Minted fails on hevea, https://github.com/gpoore/minted/issues/234
% silently fall back on verbatim - ignore the language argument:
% NOTE: the HEVEA-prefixed line that follows is an ordinary comment as far
% as LaTeX is concerned; it is intended for HeVeA only, where it defines a
% one-argument minted environment that simply wraps verbatim.
%HEVEA \newenvironment{minted}[1]{\verbatim}{\endverbatim}
% make everything have section numbers
\setcounter{secnumdepth}{4}
% Make links between references
\usepackage{hyperref}
% Set up an \ifpdf switch. If the engine does not define \pdfoutput the
% switch is false; otherwise \pdfoutput is set to 1 (pdfTeX's setting for
% PDF rather than DVI output) and the switch is true.
\newif\ifpdf
\ifx\pdfoutput\undefined
\pdffalse
\else
\pdfoutput=1
\pdftrue
\fi
% Link colouring options for hyperref, applied only when \ifpdf is true.
\ifpdf
\hypersetup{colorlinks=true, hyperindex=true, citecolor=red, urlcolor=blue}
\fi
\begin{document}
% ----------------------------------------------------------------------
% Title page. Two alternative \title definitions are given: the htmlonly
% one is a plain text title for the HeVeA (HTML) build, and the latexonly
% one puts the logo image above the title text for the LaTeX (PDF) build.
% Keep the \begin/\end lines of these environments on lines of their own.
% ----------------------------------------------------------------------
\begin{htmlonly}
\title{Biopython Tutorial and Cookbook}
\end{htmlonly}
\begin{latexonly}
\title{
%Hack to get the logo on the PDF front page:
\includegraphics[width=\textwidth]{images/biopython_logo.pdf}\\
%Hack to get some white space using a blank line:
~\\
Biopython Tutorial and Cookbook}
\end{latexonly}
% Author list and date are shared by both output formats.
\author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\
Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski}
% The date string also carries the Biopython version number, so it needs
% updating by hand for each release.
\date{Last Update -- 20 December 2019 (Biopython 1.77.dev0)}
%Hack to get the logo at the start of the HTML front page:
%(hopefully this isn't going to be too wide for most people)
% The rawhtml block holds literal HTML for the HTML build; do not put
% LaTeX comments inside it.
\begin{rawhtml}
<P ALIGN="center">
<IMG ALIGN="center" SRC="images/biopython_logo.svg" TITLE="Biopython Logo" ALT="[Biopython Logo]" width="450" height="300" />
</p>
\end{rawhtml}
\maketitle
\tableofcontents
% ----------------------------------------------------------------------
% Chapters, in reading order. Each \include pulls in the matching
% Tutorial/chapter_*.tex file; the order of these lines is the order of
% the chapters in the finished document.
% ----------------------------------------------------------------------
\include{Tutorial/chapter_introduction}
\include{Tutorial/chapter_quick_start}
\include{Tutorial/chapter_seq_objects}
\include{Tutorial/chapter_seq_annot}
\include{Tutorial/chapter_seqio}
\include{Tutorial/chapter_align}
\include{Tutorial/chapter_blast}
\include{Tutorial/chapter_searchio}
\include{Tutorial/chapter_entrez}
\include{Tutorial/chapter_uniprot}
\include{Tutorial/chapter_pdb}
\include{Tutorial/chapter_popgen}
\include{Tutorial/chapter_phylo}
\include{Tutorial/chapter_motifs}
\include{Tutorial/chapter_cluster}
\include{Tutorial/chapter_learning}
\include{Tutorial/chapter_graphics}
\include{Tutorial/chapter_kegg}
\include{Tutorial/chapter_phenotype}
\include{Tutorial/chapter_cookbook}
\include{Tutorial/chapter_testing}
\include{Tutorial/chapter_advanced}
\include{Tutorial/chapter_contributing}
\include{Tutorial/chapter_appendix}
\begin{thebibliography}{99}
\bibitem{cock2009}
Peter J. A. Cock, Tiago Antao, Jeffrey T. Chang, Brad A. Chapman, Cymon J. Cox, Andrew Dalke, Iddo Friedberg, Thomas Hamelryck, Frank Kauff, Bartek Wilczynski, Michiel J. L. de Hoon: ``Biopython: freely available Python tools for computational molecular biology and bioinformatics''. {\it Bioinformatics} {\bf 25} (11), 1422--1423 (2009).
\url{https://doi.org/10.1093/bioinformatics/btp163}
\bibitem{pritchard2006}
Leighton Pritchard, Jennifer A. White, Paul R.J. Birch, Ian K. Toth: ``GenomeDiagram: a python package for the visualization of large-scale genomic data''. {\it Bioinformatics} {\bf 22} (5): 616--617 (2006).
\url{https://doi.org/10.1093/bioinformatics/btk021}
\bibitem{toth2006}
Ian K. Toth, Leighton Pritchard, Paul R. J. Birch: ``Comparative genomics reveals what makes an enterobacterial plant pathogen''. {\it Annual Review of Phytopathology} {\bf 44}: 305--336 (2006).
\url{https://doi.org/10.1146/annurev.phyto.44.070505.143444}
\bibitem{vanderauwera2009}
G\'eraldine A. van der Auwera, Jaroslaw E. Kr\'ol, Haruo Suzuki, Brian Foster, Rob van Houdt, Celeste J. Brown, Max Mergeay, Eva M. Top: ``Plasmids captured in C. metallidurans CH34: defining the PromA family of broad-host-range plasmids''.
\textit{Antonie van Leeuwenhoek} {\bf 96} (2): 193--204 (2009).
\url{https://doi.org/10.1007/s10482-009-9316-9}
\bibitem{proux2002}
Caroline Proux, Douwe van Sinderen, Juan Suarez, Pilar Garcia, Victor Ladero, Gerald F. Fitzgerald, Frank Desiere, Harald Br\"ussow:
``The dilemma of phage taxonomy illustrated by comparative genomics of Sfi21-Like Siphoviridae in lactic acid bacteria''. \textit{Journal of Bacteriology} {\bf 184} (21): 6026--6036 (2002).
\url{https://doi.org/10.1128/JB.184.21.6026-6036.2002}
\bibitem{jupe2012}
Florian Jupe, Leighton Pritchard, Graham J. Etherington, Katrin MacKenzie, Peter JA Cock, Frank Wright, Sanjeev Kumar Sharma, Dan Bolser, Glenn J Bryan, Jonathan DG Jones, Ingo Hein: ``Identification and localisation of the NB-LRR gene family within the potato genome''. \textit{BMC Genomics} {\bf 13}: 75 (2012).
\url{https://doi.org/10.1186/1471-2164-13-75}
\bibitem{cock2010}
Peter J. A. Cock, Christopher J. Fields, Naohisa Goto, Michael L. Heuer, Peter M. Rice: ``The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants''. \textit{Nucleic Acids Research} {\bf 38} (6): 1767--1771 (2010). \url{https://doi.org/10.1093/nar/gkp1137}
\bibitem{brown1999}
Patrick O. Brown, David Botstein: ``Exploring the new world of the genome with DNA microarrays''. \textit{Nature Genetics} {\bf 21} (Supplement 1), 33--37 (1999). \url{https://doi.org/10.1038/4462}
\bibitem{talevich2012}
Eric Talevich, Brandon M. Invergo, Peter J.A. Cock, Brad A. Chapman: ``Bio.Phylo: A unified toolkit for processing, analyzing and visualizing phylogenetic trees in Biopython''. \textit{BMC Bioinformatics} {\bf 13}: 209 (2012).
\url{https://doi.org/10.1186/1471-2105-13-209}
\bibitem{cornish1985}
Athel Cornish-Bowden: ``Nomenclature for incompletely specified bases in nucleic acid sequences: Recommendations 1984.'' \textit{Nucleic Acids Research} {\bf 13} (9): 3021--3030 (1985).
\url{https://doi.org/10.1093/nar/13.9.3021}
\bibitem{cavener1987}
Douglas R. Cavener: ``Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates.'' \textit{Nucleic Acids Research} {\bf 15} (4): 1353--1361 (1987).
\url{https://doi.org/10.1093/nar/15.4.1353}
\bibitem{bailey1994}
Timothy L. Bailey and Charles Elkan: ``Fitting a mixture model by expectation maximization to discover motifs in biopolymers'', \textit{Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology} 28--36. AAAI Press, Menlo Park, California (1994).
\bibitem{chapman2000}
Brad Chapman and Jeff Chang: ``Biopython: Python tools for computational biology''. \textit{ACM SIGBIO Newsletter} {\bf 20} (2): 15--19 (August 2000).
\bibitem{dayhoff1978}
M.O. Dayhoff, R.M. Schwartz, and B.C. Orcutt: ``A Model of Evolutionary Change in Proteins.'' \textit{Atlas of Protein Sequence and Structure}, Volume 5, Supplement 3, 1978: 345--352. The National Biomedical Research Foundation, 1979.
\bibitem{dehoon2004}
Michiel J. L. de Hoon, Seiya Imoto, John Nolan, Satoru Miyano: ``Open source clustering software''. \textit{Bioinformatics} {\bf 20} (9): 1453--1454 (2004).
\url{https://doi.org/10.1093/bioinformatics/bth078}
\bibitem{durbin1998}
Richard Durbin, Sean R. Eddy, Anders Krogh, Graeme Mitchison:
``Biological sequence analysis: Probabilistic models of proteins and nucleic acids''.
Cambridge University Press, Cambridge, UK (1998).
\bibitem{eisen1998}
Michael B. Eisen, Paul T. Spellman, Patrick O. Brown, David Botstein: ``Cluster analysis and display of genome-wide expression patterns''. \textit{Proceedings of the National Academy of Sciences USA} {\bf 95} (25): 14863--14868 (1998). \url{https://doi.org/10.1073/pnas.95.25.14863}
\bibitem{golub1971}
Gene H. Golub, Christian Reinsch: ``Singular value decomposition and least squares solutions''. In \textit{Handbook for Automatic Computation}, {\bf 2}, (Linear Algebra) (J. H. Wilkinson and C. Reinsch, eds), 134--151. New York: Springer-Verlag (1971).
\bibitem{golub1989}
Gene H. Golub, Charles F. Van Loan: \textit{Matrix computations}, 2nd edition (1989).
\bibitem{hamelryck2003a}
Thomas Hamelryck and Bernard Manderick: ``PDB parser and structure class
implemented in Python''. \textit{Bioinformatics}, \textbf{19} (17): 2308--2310 (2003) \url{https://doi.org/10.1093/bioinformatics/btg299}.
\bibitem{hamelryck2003b}
Thomas Hamelryck: ``Efficient identification of side-chain patterns using a multidimensional index tree''. \textit{Proteins} {\bf 51} (1): 96--108 (2003).
\url{https://doi.org/10.1002/prot.10338}
\bibitem{hamelryck2005}
Thomas Hamelryck: ``An amino acid has two sides; A new 2D measure provides a different view of solvent exposure''. \textit{Proteins} {\bf 59} (1): 29--48 (2005).
\url{https://doi.org/10.1002/prot.20379}.
\bibitem{hartigan1975}
John A. Hartigan: \textit{Clustering algorithms}. New York: Wiley (1975).
\bibitem{henikoff1992}
Steven Henikoff, Jorja G. Henikoff: ``Amino acid substitution matrices from protein blocks.'' \textit{Proceedings of the National Academy of Sciences USA} {\bf 89} (22): 10915--10919 (1992). \url{https://doi.org/10.1073/pnas.89.22.10915}.
\bibitem{hihara2001}
Yukako Hihara, Ayako Kamei, Minoru Kanehisa, Aaron Kaplan and Masahiko Ikeuchi: ``DNA microarray analysis of cyanobacterial gene expression during acclimation to high light''. \textit{Plant Cell} {\bf 13} (4): 793--806 (2001). \url{https://doi.org/10.1105/tpc.13.4.793}.
\bibitem{altschul1990}
Stephen F. Altschul, Warren Gish, Webb Miller, Eugene W. Myers, David J. Lipman: ``Basic Local Alignment Search Tool''. \textit{Journal of Molecular Biology} {\bf 215} (3): 403--410 (1990). \url{https://doi.org/10.1016/S0022-2836(05)80360-2}.
\bibitem{jain1988}
Anil L. Jain, Richard C. Dubes: \textit{Algorithms for clustering data}. Englewood Cliffs, N.J.: Prentice Hall (1988).
\bibitem{kachitvichyanukul1988}
Voratas Kachitvichyanukul, Bruce W. Schmeiser: ``Binomial Random Variate Generation''. \textit{Communications of the ACM} {\bf 31} (2): 216--222 (1988). \url{https://doi.org/10.1145/42372.42381}
\bibitem{kent2002}
W. James Kent: ``BLAT --- The BLAST-Like Alignment Tool''. \textit{Genome Research} {\bf 12}: 656--664 (2002). \url{https://doi.org/10.1101/gr.229202}
\bibitem{kohonen1997}
Teuvo Kohonen: ``Self-organizing maps'', 2nd Edition. Berlin; New York: Springer-Verlag (1997).
\bibitem{lecuyer1988}
Pierre L'Ecuyer: ``Efficient and Portable Combined Random Number Generators.''
\textit{Communications of the ACM} {\bf 31} (6): 742--749,774 (1988).
\url{https://doi.org/10.1145/62959.62969}
\bibitem{majumdar2005}
Indraneel Majumdar, S. Sri Krishna, Nick V. Grishin: ``PALSSE: A program to delineate linear secondary structural elements from protein structures.'' \textit{BMC Bioinformatics}, {\bf 6}: 202 (2005).
\url{https://doi.org/10.1186/1471-2105-6-202}.
\bibitem{matys2003}
V. Matys, E. Fricke, R. Geffers, E. G\"ossling, M. Haubrock, R. Hehl, K. Hornischer, D. Karas, A.E. Kel, O.V. Kel-Margoulis, D.U. Kloos, S. Land, B. Lewicki-Potapov, H. Michael, R. M\"unch, I. Reuter, S. Rotert, H. Saxel, M. Scheer, S. Thiele, E. Wingender: ``TRANSFAC: transcriptional regulation, from patterns to profiles.'' \textit{Nucleic Acids Research} {\bf 31} (1): 374--378 (2003).
\url{https://doi.org/10.1093/nar/gkg108}
\bibitem{saldanha2004}
Alok Saldanha: ``Java Treeview---extensible visualization of microarray data''. \textit{Bioinformatics} {\bf 20} (17): 3246--3248 (2004).
\url{https://doi.org/10.1093/bioinformatics/bth349}
\bibitem{schneider2005}
Adrian Schneider, Gina M. Cannarozzi, and Gaston H. Gonnet: ``Empirical codon substitution matrix''. \textit{BMC Bioinformatics} {\bf 6}: 134 (2005).
\url{https://doi.org/10.1186/1471-2105-6-134}
\bibitem{sibson1973}
Robin Sibson: ``SLINK: An optimally efficient algorithm for the single-link cluster method''. \textit{The Computer Journal} {\bf 16} (1): 30--34 (1973).
\url{https://doi.org/10.1093/comjnl/16.1.30}
\bibitem{snedecor1989}
George W. Snedecor, William G. Cochran: \textit{Statistical methods}. Ames, Iowa: Iowa State University Press (1989).
\bibitem{tamayo1999}
Pablo Tamayo, Donna Slonim, Jill Mesirov, Qing Zhu, Sutisak Kitareewan, Ethan Dmitrovsky, Eric S. Lander, Todd R. Golub: ``Interpreting patterns of gene expression with self-organizing maps: Methods and application to hematopoietic differentiation''. \textit{Proceedings of the National Academy of Sciences USA} {\bf 96} (6): 2907--2912 (1999). \url{https://doi.org/10.1073/pnas.96.6.2907}
\bibitem{tryon1970}
Robert C. Tryon, Daniel E. Bailey: \textit{Cluster analysis}. New York: McGraw-Hill (1970).
\bibitem{tukey1977}
John W. Tukey: ``Exploratory data analysis''. Reading, Mass.: Addison-Wesley Pub. Co. (1977).
\bibitem{waterman1987}
Michael S. Waterman, Mark Eggert: ``A new algorithm for best subsequence alignments with application to tRNA-rRNA comparisons'', \textit{Journal of Molecular Biology} {\bf 197} (4): 723--728 (1987). \url{https://doi.org/10.1016/0022-2836(87)90478-5}
\bibitem{yeung2001}
Ka Yee Yeung, Walter L. Ruzzo: ``Principal Component Analysis for clustering gene expression data''. \textit{Bioinformatics} {\bf 17} (9): 763--774 (2001).
\url{https://doi.org/10.1093/bioinformatics/17.9.763}
\end{thebibliography}
\end{document}