# **Name:** Hamza Shafiq

**Reach me at:** 
1. [Github](https://github.com/hamza811998)
2. [LinkedIn](https://www.linkedin.com/in/hamza811998/)
3. [Fiverr](https://www.fiverr.com/hamza_811998?up_rollout=true)
4. [Facebook](https://www.facebook.com/hamza811998/)
5. [Twitter](https://twitter.com/hamza811998)
6. [Instagram](https://www.instagram.com/hamza811998/)

**Email:** hamza811998shafiq@gmail.com

# `ACCESS BIOINFORMATICS DATABASES WITH BIO-PYTHON`

1. [NCBI](#1.-NCBI)<br>
    1.1. [Nucleotide BLAST](#1.1.-Nucleotide-BLAST)<br>
    1.2. [Protein BLAST](#1.2.-Protein-BLAST)
    
2. [ENTREZ](#2.-ENTREZ)<br>
    2.1. [PUBMED](#2.1.-PUBMED)<br>
    2.2. [Nucleotide](#2.2.-Nucleotide)
    
3. [PDB](#3.-PDB)

4. [EXPASY](#4.-EXPASY)<br>
    4.1. [PROSITE](#4.1.-PROSITE)<br>
    4.2. [ScanProsite](#4.2.-ScanProsite)
    
5. [KEGG](#5.-KEGG)

# 1. NCBI

### Import Modules

In [1]:
# pip install Bio
# pip install biopython

In [2]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO, SearchIO

In [3]:
# help(NCBIWWW)

## 1.1. Nucleotide BLAST

In [4]:
nuc_record = SeqIO.read("nuc_seq.fasta", format = 'fasta')
len(nuc_record)

774

In [5]:
nuc_record.description

'MT598137.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/IRN/PN-2142-S/2020 surface glycoprotein (S) gene, partial cds'

In [6]:
nuc_record.seq

Seq('ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGAT...GGT')

In [7]:
result_handle = NCBIWWW.qblast("blastn", "nt", nuc_record.seq)
blast_reslut = SearchIO.read(result_handle, "blast-xml")



In [8]:
print(blast_reslut[0:2])

Program: blastn (2.14.1+)
  Query: No (774)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|2592184291|gb|OR671911.1|  Severe acute respiratory ...
            1      1  gi|2592184156|gb|OR671900.1|  Severe acute respiratory ...


In [9]:
Seq = blast_reslut[0]
print(f"Sequence ID: {Seq.id}")
print(f"Sequence Description: {Seq.description}")

details = Seq[0]
print(f"E-value: {details.evalue}")

Sequence ID: gi|2592184291|gb|OR671911.1|
Sequence Description: Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/CA-LACPHL-AY02643/2020, complete genome
E-value: 0.0


In [10]:
print(f"alignment:\n{details.aln}")

alignment:
Alignment with 2 rows and 774 columns
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT No
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT gi|2592184291|gb|OR671911.1|


## 1.2. Protein BLAST

In [11]:
prot_record = SeqIO.read("prot_seq.fasta", format="fasta")
len(prot_record)

258

In [12]:
prot_record.description

'QKO24044.1 surface glycoprotein, partial [Severe acute respiratory syndrome coronavirus 2]'

In [13]:
prot_record.seq

Seq('IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKP...PIG')

In [14]:
result_handle = NCBIWWW.qblast("blastp", "pdb", prot_record.seq)
blast_result = SearchIO.read(result_handle, "blast-xml")

In [15]:
print(blast_result[0:2])

Program: blastp (2.14.1+)
  Query: unnamed (258)
         protein product
 Target: pdb
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  pdb|8ELJ|A  Chain A, Spike glycoprotein [Severe acute r...
            1      1  pdb|7CAB|A  Chain A, Spike glycoprotein [Severe acute r...


In [16]:
Seq = blast_result [0]
print(f"Sequence ID: {Seq.id}")
print(f"Sequence Description: {Seq.description}")

details = Seq[0]
print(f"E-value: {details.evalue}")

Sequence ID: pdb|8ELJ|A
Sequence Description: Chain A, Spike glycoprotein [Severe acute respiratory syndrome coronavirus 2]
E-value: 0.0


In [17]:
print(f"alignment:\n {details.aln}")

alignment:
 Alignment with 2 rows and 258 columns
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG unnamed
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG pdb|8ELJ|A


------------------------------------------------------

# 2. ENTREZ

### Import Modules

In [18]:
from Bio import Entrez

In [19]:
# help(Entrez)

In [20]:
# Entrez.email = "datacyclopes@gmail.com"
Entrez.email = "hamza811998shafiq@gmail.com"

In [21]:
handle = Entrez.einfo()
record = Entrez.read(handle)
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

## 2.1. PUBMED

In [22]:
handle = Entrez.einfo(db="pubmed")
record = Entrez.read(handle)
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [23]:
record["DbInfo"]["Count"]

'36403977'

In [24]:
handle = Entrez.esearch(db="pubmed", term="biophython")
record = Entrez.read(handle)
record["IdList"]

[]

In [25]:
handle = Entrez.esummary(db="pubmed", id='33242467, 32044951')
records = Entrez.parse(handle)


for record in records:
    print(record['AuthorList'], record["Title"], record["PubDate"], record["FullJournalName"])

['Kricka LJ', 'Cornish TC', 'Park JY'] Eponyms in clinical chemistry. 2021 Jan Clinica chimica acta; international journal of clinical chemistry
['Ireland SM', 'Martin ACR'] atomium-a Python structure parser. 2020 May 1 Bioinformatics (Oxford, England)


In [26]:
handle = Entrez.efetch(db="pubmed", id="19811691")
print(handle.read())

b'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">19811691</PMID><DateCompleted><Year>2010</Year><Month>02</Month><Day>12</Day></DateCompleted><DateRevised><Year>2021</Year><Month>10</Month><Day>20</Day></DateRevised><Article PubModel="Electronic"><Journal><ISSN IssnType="Electronic">1471-2105</ISSN><JournalIssue CitedMedium="Internet"><Volume>10 Suppl 11</Volume><Issue>Suppl 11</Issue><PubDate><Year>2009</Year><Month>Oct</Month><Day>08</Day></PubDate></JournalIssue><Title>BMC bioinformatics</Title><ISOAbbreviation>BMC Bioinformatics</ISOAbbreviation></Journal><ArticleTitle>Exploratory visual analysis of conserved domains on multiple sequence alignments.</ArticleTitle><Pagination><StartPage>S7</StartPage><MedlinePgn>S7</MedlinePgn></Pagination><ELocationID EIdType="doi

## 2.2. Nucleotide

In [27]:
handle = Entrez.esearch(db="nucleotide", retmax="10", term="Severe acute respiratory syndrome coronavirus")
record = Entrez.read(handle)
record["IdList"]

['2607840939', '2607840925', '2607840910', '2607840894', '2607840879', '2607840864', '2607840849', '2607840834', '2607840818', '2607840803']

In [29]:
handle = Entrez.efetch(db="nucleotide", id="2607840939", rettype="gb", retmode="text")
print(handle.read())

LOCUS       OR751169               29438 bp    RNA     linear   VRL 30-OCT-2023
DEFINITION  Severe acute respiratory syndrome coronavirus 2 isolate
            SARS-CoV-2/human/USA/TN-SPHL-9726/2023 ORF1ab polyprotein (ORF1ab)
            and ORF1a polyprotein (ORF1ab) genes, partial cds; surface
            glycoprotein (S), ORF3a protein (ORF3a), envelope protein (E),
            membrane glycoprotein (M), and ORF6 protein (ORF6) genes, complete
            cds; ORF7a protein (ORF7a) and ORF8 protein (ORF8) genes, partial
            cds; and nucleocapsid phosphoprotein (N) and ORF10 protein (ORF10)
            genes, complete cds.
ACCESSION   OR751169
VERSION     OR751169.1
DBLINK      BioProject: PRJNA732685
            BioSample: SAMN38041194
KEYWORDS    .
SOURCE      Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
  ORGANISM  Severe acute respiratory syndrome coronavirus 2
            Viruses; Riboviria; Orthornavirae; Pisuviricota; Pisoniviricetes;
            Nidov

In [30]:
handle = Entrez.esearch(db='nucleotide', term='accD[Gene Name] AND "E. coli"[Organism]', retmax="20")
result_list = Entrez.read(handle)

In [31]:
id_list = result_list['IdList']
count = result_list['Count']

print(id_list)
print("\n")
print(count)

['2608158611', '2608158607', '2608136696', '2608071221', '2608071208', '2608050907', '2608028807', '2608019261', '2607821198', '2607770425', '2607770393', '2607770389', '2607770385', '2607762529', '2607762494', '2607762474', '2607762469', '2607762467', '2607761547', '2607761542']


242094


In [32]:
handle.close()

------------------------------------------------------

# 3. PDB

### Import Modules

In [33]:
from Bio.PDB import PDBParser,PDBList

In [34]:
# help(PDBList)

In [35]:
PDBl = PDBList()
PDBl.retrieve_pdb_file('7BYR', file_format = "pdb", pdir = "dir")

Downloading PDB structure '7byr'...


'dir\\pdb7byr.ent'

In [36]:
parser = PDBParser()
structure = parser.get_structure("7BYR", "dir\\pdb7byr.ent")



In [37]:
for chain in structure[0]:
    print(f"chainid: {chain.id}")

chainid: A
chainid: B
chainid: C
chainid: H
chainid: L
chainid: D
chainid: E
chainid: F
chainid: G
chainid: I
chainid: J


In [38]:
resolution = structure.header["resolution"]
resolution

3.84

In [39]:
keywords = structure.header["keywords"]
keywords

'sars-cov-2, antigen, rbd, neutralizing antibody, viral protein'

------------------------------------------------------

# 4. EXPASY

## 4.1. PROSITE

### Import Modules

In [40]:
from Bio import ExPASy
from Bio.ExPASy import Prosite

In [41]:
# help(Prosite)

In [42]:
handle = ExPASy.get_prosite_raw("PS51442")
record = Prosite.read(handle)

In [43]:
print(record.description)

Coronavirus main protease (M-pro) domain profile.


In [44]:
print(record.pdb_structs[:10])

['1LVO', '1P9S', '1P9U', '1Q2W', '1UJ1', '1UK2', '1UK3', '1UK4', '1WOF', '1Z1I']


In [45]:
handle = ExPASy.get_prosite_raw("PS00001")
record = Prosite.read(handle)
print(record.pattern)

N-{P}-[ST]-{P}.


## 4.2. ScanProsite

### Import Modules

In [46]:
from Bio.ExPASy import ScanProsite

In [47]:
prot_record = SeqIO.read("prot_seq.fasta", format="fasta")
len(prot_record.seq)

258

In [48]:
handle = ScanProsite.scan(seq=prot_record.seq, mirror="https://prosite.expasy.org/")
result = ScanProsite.read(handle)

In [49]:
result.n_match

1

In [50]:
result[0]

{'sequence_ac': 'USERSEQ1',
 'start': 1,
 'stop': 118,
 'signature_ac': 'PS51921',
 'score': '32.871',
 'level': '0'}

------------------------------------------------------

# 5. KEGG

### Import Modules

In [59]:
from Bio.KEGG import REST, Enzyme
import ssl
import urllib.request

In [60]:
# help(Enzyme)

In [61]:
request = REST.kegg_get("ec:5.4.2.2")
open("ec_5.4.2.2.txt", "w").write(request.read())

272447

In [62]:
records = Enzyme.parse(open("ec_5.4.2.2.txt"))
record = list(records)[0]
record.classname

['Isomerases;',
 'Intramolecular transferases;',
 'Phosphotransferases (phosphomutases)']

In [63]:
record.pathway

[('PATH', 'ec00010', 'Glycolysis / Gluconeogenesis'),
 ('PATH', 'ec00030', 'Pentose phosphate pathway'),
 ('PATH', 'ec00052', 'Galactose metabolism'),
 ('PATH', 'ec00230', 'Purine metabolism'),
 ('PATH', 'ec00500', 'Starch and sucrose metabolism'),
 ('PATH', 'ec00520', 'Amino sugar and nucleotide sugar metabolism'),
 ('PATH', 'ec00521', 'Streptomycin biosynthesis'),
 ('PATH', 'ec01100', 'Metabolic pathways'),
 ('PATH', 'ec01110', 'Biosynthesis of secondary metabolites'),
 ('PATH', 'ec01120', 'Microbial metabolism in diverse environments')]

In [64]:
record.genes[:10]

[('HSA', ['5236', '55276']),
 ('PTR', ['456908', '461162']),
 ('PPS', ['100977295', '100993927']),
 ('GGO', ['101128874', '101131551']),
 ('PON', ['100190836', '100438793']),
 ('NLE', ['100596081', '100600656']),
 ('HMH', ['116456694', '116457795']),
 ('MCC', ['100424648', '699401']),
 ('MCF', ['101925921', '102130622']),
 ('MTHB', ['126935012', '126954887'])]

In [65]:
list_genes = []
for x, y in record.genes:
    list_genes += x.split("\n")

print(list_genes[:10])

['HSA', 'PTR', 'PPS', 'GGO', 'PON', 'NLE', 'HMH', 'MCC', 'MCF', 'MTHB']


------------------------------------------------------