# biopython包简介


Biopython工程是一个使用Python(http://www.python.org)来开发计算分子生物学工具的国际团体。

Biopython官网(http://www.biopython.org) 为使用和研究生物信息学的开发者提供了一个在线的资源库，包括模块、脚本以及一些基于Python的软件的网站链接。一般来讲，Biopython致力于通过创造高质量的和可重复利用的模块及类，从而使得Python在生物信息学中的应用变得更加容易。Biopython的特点包括解析各种生物信息学格式的文件(BLAST，Clustalw，FASTA，Genbank...)，访问在线的服务器(NCBI，Expasy...)，常见和不那么常见程序的接口(Clustalw， DSSP，MSMS...)，标准的序列类，各种聚类模块，KD树数据结构等等，还有一些文档。

基本来说，我们喜欢使用Python来编程，并且希望通过创建高质量、可复用的模块和脚本来使得Python在生物信息学中的应用变得容易。

## 使用biopython在pubmed进行查询

In [4]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Entrez

In [5]:
# NOTE:
# Please change to your email address.
# NCBI uses this for their logging of resource usage.
Entrez.email = 'shenxt@stanford.edu'
# Call einfo with no arguments and parse the response; the 'DbList' entry of
# the parsed record (a list of Entrez database names) is printed below.
with Entrez.einfo() as handle:
    record = Entrez.read(handle)

print(record['DbList'])

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']


In [6]:
# Search NCBI PubMed with the same query syntax as their web interface,
# then show the parsed result as a whole and again field by field.
with Entrez.esearch(db='pubmed', term='church g[auth] AND cell[journal]') as handle:
    records = Entrez.read(handle)

print(records)
print('-------------')
for r_key, r_value in records.items():
    print(r_key, ': ', r_value)

{'Count': '3', 'RetMax': '3', 'RetStart': '0', 'IdList': ['29677511', '11081629', '229970'], 'TranslationSet': [{'From': 'cell[journal]', 'To': '"Cell"[Journal]'}], 'TranslationStack': [{'Term': 'church g[Author]', 'Field': 'Author', 'Count': '546', 'Explode': 'N'}, {'Term': '"Cell"[Journal]', 'Field': 'Journal', 'Count': '20228', 'Explode': 'N'}, 'AND'], 'QueryTranslation': 'church g[Author] AND "Cell"[Journal]'}
-------------
Count :  3
RetMax :  3
RetStart :  0
IdList :  ['29677511', '11081629', '229970']
TranslationSet :  [{'From': 'cell[journal]', 'To': '"Cell"[Journal]'}]
TranslationStack :  [{'Term': 'church g[Author]', 'Field': 'Author', 'Count': '546', 'Explode': 'N'}, {'Term': '"Cell"[Journal]', 'Field': 'Journal', 'Count': '20228', 'Explode': 'N'}, 'AND']
QueryTranslation :  church g[Author] AND "Cell"[Journal]


In [7]:
# count the PMIDs returned by the search above (nothing is fetched here)
pubmed_records = len(records['IdList'])
print('number of records:', pubmed_records)

number of records: 3


In [8]:
# retrieve the first paper from the search above (only IdList[0] is fetched)
# as XML and parse it into nested Python objects
with Entrez.efetch(db='pubmed', id=records['IdList'][0], retmode='xml') as handle:
    data = Entrez.read(handle)
print(data)

{'PubmedArticle': [{'MedlineCitation': DictElement({'KeywordList': [ListElement([StringElement('AML', attributes={'MajorTopicYN': 'Y'}), StringElement('AXL/GAS6', attributes={'MajorTopicYN': 'Y'}), StringElement('CRISPR', attributes={'MajorTopicYN': 'Y'}), StringElement('CRISPRa', attributes={'MajorTopicYN': 'Y'}), StringElement('TEM', attributes={'MajorTopicYN': 'Y'}), StringElement('cancer', attributes={'MajorTopicYN': 'Y'}), StringElement('cytarabine', attributes={'MajorTopicYN': 'Y'}), StringElement('drug-resistance', attributes={'MajorTopicYN': 'Y'}), StringElement('leukemia', attributes={'MajorTopicYN': 'Y'}), StringElement('lncRNA', attributes={'MajorTopicYN': 'Y'})], attributes={'Owner': 'NOTNLM'})], 'OtherAbstract': [], 'SpaceFlightMission': [], 'OtherID': [], 'GeneralNote': [], 'CitationSubset': ['IM'], 'PMID': StringElement('29677511', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2019', 'Month': '02', 'Day': '11'}, 'DateRevised': {'Year': '2019', 'Month': '04', '

In [9]:
# XML format is complex and takes time to work through to extract what you want.
# Iterate over the PMIDs themselves instead of indexing with a count computed
# in an earlier cell, so this loop always matches the current `records`.
for pmid in records['IdList']:
    with Entrez.efetch(db='pubmed', id=pmid, retmode='xml') as handle:
        data = Entrez.read(handle)
    # One PMID is requested per fetch, so only the first article is read.
    citation = data['PubmedArticle'][0]['MedlineCitation']
    # Walk the keys in the order the parsed record provides them, so the
    # printed order follows the record.
    for y in citation:
        if y == 'PMID':
            print(y, ':', citation[y], '\n')
        if y == 'Article':
            article = citation[y]
            for x in article:
                if x in ['Journal', 'ArticleTitle', 'Abstract']:
                    print(x, ':', article[x], '\n')

PMID : 29677511 

Journal : {'ISSN': StringElement('1097-4172', attributes={'IssnType': 'Electronic'}), 'JournalIssue': DictElement({'Volume': '173', 'Issue': '3', 'PubDate': {'Year': '2018', 'Month': '04', 'Day': '19'}}, attributes={'CitedMedium': 'Internet'}), 'Title': 'Cell', 'ISOAbbreviation': 'Cell'} 

ArticleTitle : An Integrated Genome-wide CRISPRa Approach to Functionalize lncRNAs in Drug Resistance. 

Abstract : {'AbstractText': ['Resistance to chemotherapy plays a significant role\xa0in cancer mortality. To identify genetic units affecting sensitivity to cytarabine, the mainstay of treatment for acute myeloid leukemia (AML), we\xa0developed a comprehensive and integrated genome-wide platform based on a dual protein-coding and non-coding integrated CRISPRa screening (DICaS). Putative resistance genes were initially identified using pharmacogenetic data from 760 human pan-cancer cell lines. Subsequently, genome scale functional characterization of both coding and long non-codin

In [10]:
# Any of the NCBI databases listed by the einfo call above can be searched;
# here the 'snp' database is queried, and the parsed result is shown as a
# whole and again field by field.
with Entrez.esearch(db='snp', term='rs328') as handle:
    records = Entrez.read(handle)

print(records)
print('-------------')
for r_key, r_value in records.items():
    print(r_key, ': ', r_value)

{'Count': '4', 'RetMax': '4', 'RetStart': '0', 'IdList': ['52834251', '17482566', '3735962', '328'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'rs328[All Fields]', 'Field': 'All Fields', 'Count': '4', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'rs328[All Fields]'}
-------------
Count :  4
RetMax :  4
RetStart :  0
IdList :  ['52834251', '17482566', '3735962', '328']
TranslationSet :  []
TranslationStack :  [{'Term': 'rs328[All Fields]', 'Field': 'All Fields', 'Count': '4', 'Explode': 'N'}, 'GROUP']
QueryTranslation :  rs328[All Fields]


In [12]:
# optional, can use this module to parse return XML
import xmltodict
# install with: conda install xmltodict (or: pip install xmltodict)

In [13]:
# https://www.ncbi.nlm.nih.gov/snp/rs328
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp&id=rs328&report=XML
# Fetch one SNP record by UID and convert the raw XML into nested dicts.
with Entrez.efetch(db='snp', id='52834251', retmode='xml') as handle:
    snp_dict = xmltodict.parse(handle.read())

# Print every field of the document summary, one per line.
doc_summary = snp_dict['ExchangeSet']['DocumentSummary']
for field, value in doc_summary.items():
    print(field, ':', value)

@uid : 52834251
SNP_ID : 328
ALLELE_ORIGIN : None
GLOBAL_MAFS : OrderedDict([('MAF', [OrderedDict([('STUDY', '1000Genomes'), ('FREQ', 'G=0.092452/463')]), OrderedDict([('STUDY', 'ALSPAC'), ('FREQ', 'G=0.106642/411')]), OrderedDict([('STUDY', 'Estonian'), ('FREQ', 'G=0.066964/300')]), OrderedDict([('STUDY', 'ExAC'), ('FREQ', 'G=0.093501/11340')]), OrderedDict([('STUDY', 'GnomAD'), ('FREQ', 'G=0.086979/2728')]), OrderedDict([('STUDY', 'GnomAD_exomes'), ('FREQ', 'G=0.092156/23148')]), OrderedDict([('STUDY', 'NorthernSweden'), ('FREQ', 'G=0.085/51')]), OrderedDict([('STUDY', 'PAGE_STUDY'), ('FREQ', 'G=0.07904/6220')]), OrderedDict([('STUDY', 'TOPMED'), ('FREQ', 'G=0.089712/11265')]), OrderedDict([('STUDY', 'TWINSUK'), ('FREQ', 'G=0.107875/400')]), OrderedDict([('STUDY', 'Vietnamese'), ('FREQ', 'G=0.127036/78')])])])
GLOBAL_POPULATION : None
GLOBAL_SAMPLESIZE : 0
SUSPECTED : None
CLINICAL_SIGNIFICANCE : likely-benign,benign
GENES : OrderedDict([('GENE_E', OrderedDict([('NAME', 'LPL'), ('GEN

## biopython中的Bio.Seq

 biopython sequences    
 https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp

In [14]:
# class of objects for sequences
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78, so this cell only
# runs on older releases; on newer ones the sequence is built as Seq('...')
# with no alphabet argument — confirm against the installed version.
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
# Build a sequence object tagged with the IUPAC unambiguous DNA alphabet.
my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna)
# Bare expression: the notebook shows its repr as the cell output.
my_seq

Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPACUnambiguousDNA())