## **Accessing NCBI database**

**1.** Import the necessary module and set the email address.

In [1]:
import Bio #this is the biopython package
from Bio import Entrez, Medline, SeqIO
Entrez.email = "yourname@gmail.com" #tell NCBI who you are

**2.** Next, we will use the `Entrez.esearch()` function to search the nucleotide database for `CRT gene` of `Plasmodium falciparum`. </br></br>
   This function returns a handle to the results, which we can read using the `Entrez.read` function:

In [25]:
handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
                        idtype="acc")
rec_list = Entrez.read(handle) #store the search results in a dictionary
print(rec_list["Count"]) #as of 2024-01-04, there are 3081 entries in the database
if rec_list["RetMax"] < rec_list["Count"]:
    handle = Entrez.esearch(db="nucleotide", term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]',
                            retmax=rec_list["Count"], idtype="acc") #retmax is the number of results to be returned, default is 20, reset this value for more results.
print(rec_list.keys()) #see contents in the record list

3081
dict_keys(['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'TranslationStack', 'QueryTranslation'])


3. Fetch the searching results and download these in a GenBank format. </br></br>The GenBank format data include both sequence data and useful metadata.

In [22]:
id_list = rec_list['IdList']
hdl = Entrez.efetch(db="nucleotide", id=id_list, rettype="gb")

print(hdl.readline().strip()) #show the first line of fetched file

LOCUS       OR483864                 240 bp    DNA     linear   INV 09-OCT-2023


4. Load and parse the results.

In [26]:
records = list(SeqIO.parse(hdl, "gb"))

In [35]:
for rec in records:
    print(rec.id, rec.description[:40] + "...")
    print("Sequence length: %i," % len(rec.seq))
    print("%i features," % len(rec.features), end=" ")
    print("from: %s" % rec.annotations["source"])
    print(rec.seq[:30] + "...")
    print()

OQ672451.1 Plasmodium falciparum isolate ML_14 chlo...
Sequence length: 145,
4 features, from: Plasmodium falciparum (malaria parasite P. falciparum)
TGTGCTCATGTGTTTAAACTTATTTTTAAA...

OQ672450.1 Plasmodium falciparum isolate ML_13 chlo...
Sequence length: 145,
4 features, from: Plasmodium falciparum (malaria parasite P. falciparum)
TGTGCTCATGTGTTTAAACTTATTTTTAAA...

OQ672449.1 Plasmodium falciparum isolate ML_12 chlo...
Sequence length: 145,
4 features, from: Plasmodium falciparum (malaria parasite P. falciparum)
TGTGCTCATGTGTTTAAACTTATTTTTAAA...

OQ672448.1 Plasmodium falciparum isolate ML_11 chlo...
Sequence length: 145,
4 features, from: Plasmodium falciparum (malaria parasite P. falciparum)
TGTGCTCATGTGTTTAAACTTATTTTTAAA...

OQ672447.1 Plasmodium falciparum isolate ML_10 chlo...
Sequence length: 145,
4 features, from: Plasmodium falciparum (malaria parasite P. falciparum)
TGTGCTCATGTGTTTAAACTTATTTTTAAA...

OQ672446.1 Plasmodium falciparum isolate ML_09 chlo...
Sequence length: 145

In [36]:
for feature in rec.features:
    if feature.type == 'gene':
        print(feature.qualifiers['gene'])
    elif feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
    else:
        print('not processed:\n%s' % feature)

not processed:
type: source
location: [0:145](+)
qualifiers:
    Key: country, Value: ['Brazil']
    Key: db_xref, Value: ['taxon:5833']
    Key: isolate, Value: ['CZS_16']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Plasmodium falciparum']

['crt']
not processed:
type: mRNA
location: [<0:>145](+)
qualifiers:
    Key: gene, Value: ['crt']
    Key: product, Value: ['chloroquine resistance transporter']

not processed:
type: CDS
location: [<0:>145](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: gene, Value: ['crt']
    Key: note, Value: ['localized within the digestive vacuole membrane']
    Key: product, Value: ['chloroquine resistance transporter']
    Key: protein_id, Value: ['WHO19582.1']
    Key: translation, Value: ['CAHVFKLIFKEIKDNIFIYILSIIYLSVSVMNTIFAKRTLNKIGNYSF']



In [37]:
for name, value in rec.annotations.items():
    print("%s=%s" % (name, value))

molecule_type=DNA
topology=linear
data_file_division=INV
date=24-MAY-2023
accessions=['OQ672433']
sequence_version=1
keywords=['']
source=Plasmodium falciparum (malaria parasite P. falciparum)
organism=Plasmodium falciparum
taxonomy=['Eukaryota', 'Sar', 'Alveolata', 'Apicomplexa', 'Aconoidasida', 'Haemosporida', 'Plasmodiidae', 'Plasmodium', 'Plasmodium (Laverania)']
references=[Reference(title='Plasmodium falciparum Chloroquine-pfcrt Resistant Haplotypes in Brazilian Endemic Areas Four Decades after CQ Withdrawn', ...), Reference(title='Direct Submission', ...)]
structured_comment=defaultdict(<class 'dict'>, {'Assembly-Data': {'Sequencing Technology': 'Sanger dideoxy sequencing'}})
