# Bioinformatics 2021: Week 5

In [137]:
import os
import shutil

import numpy as np
import pandas as pd

### Working with the NCBI database

In [145]:
# Loading genomes from https://www.ncbi.nlm.nih.gov/nuccore/?term=bacteria

In [138]:
#!pip install ncbi-datasets-pylib

In [139]:
# Make sure the NCBI Datasets client library is importable; if it is not,
# print an installation hint instead of showing a traceback.
try:
    import ncbi.datasets
except ImportError:
    install_hint = 'ncbi.datasets module not found. To install, run `pip install ncbi-datasets-pylib`.'
    print(install_hint)

In [140]:
## create the Genome API handle (wrapping a default-constructed API client) used by the cells below
api_client = ncbi.datasets.ApiClient()
api_instance = ncbi.datasets.GenomeApi(api_client)

In [141]:
# Download the assembly descriptors (metadata) of all reference/representative
# genomes under taxon 2 (Bacteria in the NCBI taxonomy), one page at a time,
# and collect them into the `df_genomes` table indexed by accession.
MAX_RETRIES = 10   # consecutive failed requests tolerated before giving up
PAGE_SIZE = 1000   # number of assemblies requested per page
GENOME_COLUMNS = ["accession", "name", "sci_name", "title", "tax_id", "category", "level", "seq_length"]

token = ""  # page token; the empty string requests the first page
e = 0       # consecutive failed attempts for the current page
p = 0       # number of pages downloaded so far

# (1) create a list to store data for DataFrame
genomes_data = []

while True:
    # (2) download list (one page) of genomes
    try:
        e += 1
        genomes_page = api_instance.assembly_descriptors_by_taxon(
            taxon=2, page_size=PAGE_SIZE, page_token=token, filters_reference_only=True)
    except Exception as exc:
        # Catch `Exception` (not a bare `except:`) so that a kernel interrupt
        # still stops the cell, and report what actually went wrong.
        print(f"Exception (attempt {e}/{MAX_RETRIES}): {exc!r}")
        if e >= MAX_RETRIES:
            # Best effort: keep what was downloaded, but say clearly that it is partial.
            print(f"Giving up after {MAX_RETRIES} failed attempts; "
                  f"the genome list is INCOMPLETE ({len(genomes_data)} rows).")
            break
        continue
    e = 0
    p += 1

    if token == "":
        # first page: report how many assemblies match in total
        print(genomes_page.total_count)

    # guard against a page without an assembly list (e.g. no matches)
    assemblies = genomes_page.assemblies or []
    for assembly_item in assemblies:
        assembly = assembly_item.assembly
        assembly_level = assembly.assembly_level

        # (3) make a dictionary with relevant data for each genome
        genomes_item = {"accession": assembly.assembly_accession, "name": assembly.display_name,
                        "sci_name": assembly.org.sci_name, "title": assembly.org.title, "tax_id": int(assembly.org.tax_id),
                        "category": assembly.assembly_category, "level": assembly_level, "seq_length": int(assembly.seq_length)}

        # (4) add genome data to list
        genomes_data.append(genomes_item)

    # get token for next page of results
    token = genomes_page.next_page_token

    print(p, len(assemblies), token)
    if not token:  # "" or None: no more pages
        break

# (5) transform list of dictionaries to DataFrame
# Passing the column names explicitly keeps "accession" available as index
# even when no genome was downloaded (empty list).
df_genomes = pd.DataFrame(genomes_data, columns=GENOME_COLUMNS).set_index("accession")

26800
1 1000 eNrjsnVMLcrPzc9LLFZILC0uKUrMyUzNK84stsosji9KTYvPz8uphLKLUwut3J3d4g0MDCwMLc2NTPUMAfBoFfo=
2 1000 eNrjcnTMrUzOz0ksyS8ozixWKMrIrMovLshITC1KTLXKLI4vSk2Lz8/LqYSyi1MLrdyd3eINDMzNDY3NDU31DAFfAxeu
3 1000 eNrjsnZKTM7MySktVkjMKUtNrCxNzSvOLLbKLI4vSk2Lz8/LqYSyi1MLrdyd3eINDAwsTSwMzU31DAG5zBUW
4 1000 eNrjsnYqSkzOqExKTC5JLcoszVVILAOSVpnF8UWpafH5eTmVUHZxaqGVu7NbvIGBkZGhmbmlqZ4hALiiFQM=
5 1000 eNrjcnNOzUktysxLLEnMK8nMzc9LLFZIyUysyi8pyi/IyExOtMosji9KTYvPz8uphLKLUwut3J3d4g0MTIyNLU1NTPUMAecBGao=
6 1000 eNrjsnfOz03Mzc9LLFZILCxNLMlMTlRwcTS0MDe3yiyOL0pNi8/Py6mEsotTC63cnd3iDQwMTC0NLC1M9QwB388VNw==
7 1000 eNrj8nRJLS7NSctPSkwuSS3KLM1VSCwtyS8pyi/IyEwG8jyCfI2sMovji1LT4vPzciqh7OLUQit3Z7d4AyAwMjA2M9UzBAAk4Rol
8 1000 eNrjsnHNK0ktyi/LTCrKzFcoSEzOTMtMLi22yiyOL0pNi8/Py6mEsotTC63cnd3iDQwMzQ3MLYxM9QwB1ScVeg==
9 1000 eNrjsnMrLc5PSkwuSS3KLM1VyEtNLspPT81LLbbKLI4vSk2Lz8/LqYSyi1MLrdyd3eItDQxMTA3MzUz1DAEIkBZV
10 1000 eNrjsvZIzMnPzc9LLFYoLk0qSS0qSsxLTbTKLI4vSk2Lz8/LqYSyi1MLrdyd3eItDQwMDQ2NDUz1DAG5xBT+
11 1000 eNrjs

In [146]:
# show table info: index range, column names, non-null counts, dtypes and memory usage
df_genomes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26800 entries, GCF_000196515.1 to GCA_000381765.1
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        26800 non-null  object
 1   sci_name    26800 non-null  object
 2   title       26800 non-null  object
 3   tax_id      26800 non-null  int64 
 4   category    26800 non-null  object
 5   level       26800 non-null  object
 6   seq_length  26800 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 1.6+ MB


In [147]:
# show first N=10 rows
# NOTE: in the output each assembly is listed twice, once under its GCF_ (RefSeq)
# accession and once under its GCA_ (GenBank) accession
df_genomes.head(10)

Unnamed: 0_level_0,name,sci_name,title,tax_id,category,level,seq_length
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GCF_000196515.1,ASM19651v1,'Nostoc azollae' 0708,'Nostoc azollae' 0708,551115,representative genome,Complete Genome,5486145
GCA_000196515.1,ASM19651v1,'Nostoc azollae' 0708,'Nostoc azollae' 0708,551115,representative genome,Complete Genome,5486145
GCF_002973605.1,ASM297360v1,Abditibacterium utsteinense,Abditibacterium utsteinense,1960156,representative genome,Contig,3606331
GCA_002973605.1,ASM297360v1,Abditibacterium utsteinense,Abditibacterium utsteinense,1960156,representative genome,Contig,3606331
GCF_013267415.1,ASM1326741v1,Abiotrophia defectiva,Abiotrophia defectiva,46125,representative genome,Complete Genome,2046826
GCA_013267415.1,ASM1326741v1,Abiotrophia defectiva,Abiotrophia defectiva,46125,representative genome,Complete Genome,2046826
GCF_003725415.1,YH-panp20,Absicoccus porci,Absicoccus porci,2486576,representative genome,Scaffold,2222132
GCA_003725415.1,YH-panp20,Absicoccus porci,Absicoccus porci,2486576,representative genome,Scaffold,2222132
GCF_003151135.1,ASM315113v1,Abyssibacter profundi,Abyssibacter profundi,2182787,representative genome,Contig,3741513
GCA_003151135.1,ASM315113v1,Abyssibacter profundi,Abyssibacter profundi,2182787,representative genome,Contig,3741513


In [148]:
# comparing a column with a value returns a boolean pd.Series: one True/False per row
# of the table (same index), telling whether the logical condition holds for that row
df_genomes["seq_length"] > 3333333

accession
GCF_000196515.1     True
GCA_000196515.1     True
GCF_002973605.1     True
GCA_002973605.1     True
GCF_013267415.1    False
                   ...  
GCA_003470435.1    False
GCF_001570925.1    False
GCA_001570925.1    False
GCF_000381765.1    False
GCA_000381765.1    False
Name: seq_length, Length: 26800, dtype: bool

In [149]:
# example for a combined logical condition: build one boolean mask per bound,
# then keep only the rows where both masks are True
longer_than_lower_bound = df_genomes["seq_length"] > 4500000
shorter_than_upper_bound = df_genomes["seq_length"] < 4501000
df_selected_genomes = df_genomes[longer_than_lower_bound & shorter_than_upper_bound]

In [150]:
# show the selected rows
df_selected_genomes

Unnamed: 0_level_0,name,sci_name,title,tax_id,category,level,seq_length
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GCF_004801395.1,ASM480139v1,Allorhizobium terrae,Allorhizobium terrae,1848972,representative genome,Contig,4500593
GCA_004801395.1,ASM480139v1,Allorhizobium terrae,Allorhizobium terrae,1848972,representative genome,Contig,4500593
GCF_002901445.1,ASM290144v1,Deinococcus koreensis,Deinococcus koreensis,2054903,representative genome,Contig,4500921
GCA_002901445.1,ASM290144v1,Deinococcus koreensis,Deinococcus koreensis,2054903,representative genome,Contig,4500921
GCF_900094955.1,IMG-taxon 2616645019 annotated assembly,Fictibacillus enclensis,Fictibacillus enclensis,1017270,representative genome,Scaffold,4500901
GCA_900094955.1,IMG-taxon 2616645019 annotated assembly,Fictibacillus enclensis,Fictibacillus enclensis,1017270,representative genome,Scaffold,4500901
GCF_002221525.1,ASM222152v1,Pseudoalteromonas espejiana DSM 9414,Pseudoalteromonas espejiana DSM 9414,1314869,representative genome,Complete Genome,4500451
GCA_002221525.1,ASM222152v1,Pseudoalteromonas espejiana DSM 9414,Pseudoalteromonas espejiana DSM 9414,1314869,representative genome,Complete Genome,4500451


In [41]:
# Task 1: select only genomes with the following properties:
# - category: "reference genome" or "representative genome"
# - level is "Complete Genome"
# - length (seq_length) in the range from 4 100 000 to 4 500 000
# - accession starts with "GCF_" (check https://pandas.pydata.org/docs/reference/api/pandas.Series.str.find.html
#   or Series.str.startswith); note that accession is the index of df_genomes, not a column
#
# The download cell below reads df_selected_genomes.index, so the result must be a
# DataFrame indexed by accession; the None placeholder will fail there.

df_selected_genomes = None # your code here

In [151]:
# Download selected genomes
# The accessions in the index of `df_selected_genomes` are requested in batches
# of `batch_size`; each batch is one zip package that is moved into DOWNLOAD_DIR.
DOWNLOAD_DIR = "./download"
batch_size = 200

if df_selected_genomes is None:
    raise ValueError("df_selected_genomes is None: complete Task 1 before downloading genomes")

# shutil.move fails if the destination folder does not exist yet
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

all_accessions = df_selected_genomes.index.to_list()  # built once, sliced per batch
nn = len(all_accessions)
n_batch = int(np.ceil(nn / batch_size))  # derived from batch_size, not a hard-coded 200
for n in range(n_batch):
    accessions = all_accessions[n*batch_size:(n+1)*batch_size]

    # zero-padded, 1-based batch number: 001, 002, ...
    ns = str(n+1).rjust(3, "0")

    filename = f'.ncbi_genomes_{ns}.zip'
    api_response = api_instance.download_assembly_package(accessions, filename=filename,
                                        include_annotation_type=["GENOME_GFF", "GENOME_GBFF", "RNA_FASTA", "PROT_FASTA", "GENOME_GTF"])
    # NOTE(review): assumes str(api_response) is the path of the downloaded file -- confirm
    # against the installed ncbi-datasets-pylib version
    pt_src = str(api_response)
    pt_dst = f'{DOWNLOAD_DIR}/ncbi_genomes_{ns}.zip'
    shutil.move(pt_src, pt_dst)

    print(filename)



.ncbi_genomes_001.zip


In [153]:
# Extract genomes to "./genomes" folder
# NOTE: this cell only lists the contents of "./genomes"; the extraction itself is not performed here
!ls -al ./genomes

total 4
drwxrwx---+ 1 Administrators None 0 May  4 00:41 .
drwxrwx---+ 1 X              None 0 May  4 00:41 ..
drwxrwx---+ 1 Administrators None 0 May  3 23:43 GCA_002221525.1


### Extracting genes using annotation

In [154]:
from Bio import SeqIO

In [155]:
# get annotations: parse every GenBank record of the assembly and print, for the
# first 10 features of each record, the location (start, end, strand) followed by
# the gene name and the product when those qualifiers are present
records = list(SeqIO.parse("./genomes/GCA_002221525.1/genomic.gbff", "genbank"))
for record in records:
    print(record.id, len(record.features))

    for feature in record.features[:10]:
        location = feature.location
        print(" feature:", location.start, location.end, location.strand, end="")
        # optional qualifiers, appended to the same output line in this order
        for qualifier in ("gene", "product"):
            if qualifier in feature.qualifiers:
                print("  ", feature.qualifiers[qualifier][0], end="")
        print()

CP011028.1 6935
 feature: 0 3720756 1
 feature: 159 1551 1   dnaA
 feature: 159 1551 1   dnaA   chromosomal replication initiator protein
 feature: 1564 2668 1   dnaN
 feature: 1564 2668 1   dnaN   DNA polymerase III subunit beta
 feature: 2682 3777 1   recF
 feature: 2682 3777 1   recF   DNA replication and repair protein RecF
 feature: 3903 6216 1   gyrB
 feature: 3903 6216 1   gyrB   DNA gyrase subunit B
 feature: 6356 7259 1   glyQ
CP011029.1 1449
 feature: 0 779695 1
 feature: 160 1399 1
 feature: 160 1399 1   hypothetical protein
 feature: 1414 2377 1   parB
 feature: 1414 2377 1   parB   chromosome partitioning protein, ParB family
 feature: 2814 5958 1
 feature: 2814 5958 1   hypothetical protein
 feature: 6723 8190 1   rmuC
 feature: 6723 8190 1   rmuC   DNA recombination protein RmuC
 feature: 8235 8379 1


In [157]:
# read genome sequence: load all FASTA records of the assembly and show the first one
fasta_path = "./genomes/GCA_002221525.1/GCA_002221525.1_ASM222152v1_genomic.fna"
sequences = list(SeqIO.parse(fasta_path, "fasta"))
first_sequence = sequences[0]
print(first_sequence)

ID: CP011028.1
Name: CP011028.1
Description: CP011028.1 Pseudoalteromonas espejiana strain ATCC 29659 chromosome I, complete sequence
Number of features: 0
Seq('TCAACAATTTTATAAAAATCAATCTTTTACTTGTGGATAAAGTGCCTTCATAAT...ATA')


In [159]:
# extract sub-sequence (gene): slice the first record with [start:end]
# and show the resulting nucleotides as a plain string
start, end = 0, 10
gene_record = sequences[0][start:end]
str(gene_record.seq)

'TCAACAATTT'

In [161]:
# Task 2: extract "16S ribosomal RNA" gene sequences from each organism:
# - first locate genes with product = "16S ribosomal RNA"
# - next get their coordinates (start and end)
# - finally extract substrings from full genomic sequence

def extract_gene(genome_id, gene_name):
    """Return the sequences of a given gene found in one genome (exercise stub).

    Args:
        genome_id: identifier of the genome to search; presumably the folder
            name under "./genomes" (e.g. "GCA_002221525.1") -- TODO confirm.
        gene_name: value to look for, e.g. "16S ribosomal RNA"; per the task
            description it is matched against the "product" of a feature.

    Returns:
        list: one nucleotide string per matching gene. The stub currently
        always returns an empty list.
    """
    sequences = []
    # your code
    
    return sequences

# Example output: ["AGATAGATGAT...", "GGGATATAGTTT...", ...]

In [167]:
# Task 3: collect gene-count statistics for the "16S ribosomal RNA" gene
def gene_count(gene_name):
    """Count the occurrences of a gene in each genome (exercise stub).

    Args:
        gene_name: gene to count, e.g. "16S ribosomal RNA".

    Returns:
        The `df_gene_stat` table; per the example output it is expected to be a
        DataFrame indexed by "genome-id" with a "gene-count" column.
    """
    # your code
    # NOTE: `df_gene_stat` has to be created above; as written, the stub raises
    # NameError unless a global with that name already exists
    return df_gene_stat

# Example output: a table indexed by genome id with one gene count per genome
example_rows = [
    ("GCA_000000001", 3),
    ("GCA_000000002", 4),
    ("GCA_000000003", 7),
]
df_example = pd.DataFrame(example_rows, columns=["genome-id", "gene-count"]).set_index("genome-id")
print(df_example)

               gene-count
genome-id                
GCA_000000001           3
GCA_000000002           4
GCA_000000003           7


In [None]:
# Submission cell: compute the Task 3 table and write it to "./<student_name>.csv"
student_name = "Student"  # presumably a placeholder to be replaced with your own name
df_gene_stat = gene_count("16S ribosomal RNA")
df_gene_stat.to_csv(f"./{student_name}.csv")

### Building a tree using extracted genes

In [160]:
# Task 4: build a tree from the extracted 16S rRNA sequences:
# - select ONE (!) 16S rRNA gene from each genome
# - build a tree using 16S rRNA sequences (see Week 4)

def build_tree(genes):
    """Build a tree from 16S rRNA gene sequences (exercise stub).

    Args:
        genes: the selected sequences, one per genome; the exact container
            type is left to the implementation -- TODO confirm.

    Returns:
        The constructed tree. The stub does not create `tree` yet, so calling
        it as-is raises NameError unless a global with that name exists.
    """
    # your code
    return tree