In [9]:
#%conda install jupyter notebook -U
import ast

import pandas as pd
from ncbi.datasets import GenomeApi as DatasetsGenomeApi
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from pysradb.sraweb import SRAweb

In [10]:
def get_metadata_genome_api (bioprojects) -> list:
    """Collect the assemblies of the given BioProjects that reference an SRA id.

    Sends a single ``assembly_descriptors_by_bioproject`` request for all
    ``bioprojects`` and keeps every returned assembly whose
    ``biosample.sample_ids`` holds at least one record containing the value
    ``"SRA"``.

    Args:
        bioprojects: BioProject accessions, passed unchanged to the genome API.

    Returns:
        A list with one tuple per matching assembly:
        ``(assembly_accession, bioproject_lineages, sample_ids, organism_name)``.
        A field that is absent from the response is returned as ``None``.
    """
    result = list()
    # CONNECT WITH GENOME API
    with DatasetsApiClient() as api_client:
        genome_api = DatasetsGenomeApi(api_client)
        assemblies = genome_api.assembly_descriptors_by_bioproject(bioprojects)
        assemblies_dict = assemblies.to_dict()

    # ITERATE THROUGH ASSEMBLIES; tolerate a missing or empty "assemblies" entry
    for entry in assemblies_dict.get("assemblies") or []:
        assembly = entry.get("assembly") or {}
        biosample = assembly.get("biosample") or {}
        sample_ids = biosample.get("sample_ids") or []

        # Keep the assembly once, however many of its sample ids are SRA ids
        if not any("SRA" in record.values() for record in sample_ids):
            continue

        organism = (biosample.get("description") or {}).get("organism") or {}
        result.append((assembly.get("assembly_accession"),
                       assembly.get("bioproject_lineages"),
                       sample_ids,
                       organism.get("organism_name")))
    return result


In [11]:
# PARSE CSV FILE DOWNLOADED FROM NCBI
def parse_data (file) -> list:
    """Read the BioProject accessions from a comma-separated NCBI export.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``; it
            must contain a ``BioProject`` column.

    Returns:
        The non-missing values of the ``BioProject`` column, in file order.
    """
    data = pd.read_csv(file, delimiter=",")
    # Empty BioProject cells are read as NaN; they are not accessions and
    # must not be forwarded to the API.
    return data["BioProject"].dropna().to_list()

In [22]:
# ASSOCIATE GENOME ACCESSION TO A RECORD IN THE INITIAL DATASET
def get_more_info_filtered_mito (created_file, original_file, output_file="mito_ncbi_final.xlsx"):
    """Attach the matching record of the original dataset to each filtered assembly.

    Args:
        created_file: Excel workbook read with its first column as index. Its
            column ``0`` is used as the join key and columns ``1``-``3`` are
            renamed to ``bioprojects``, ``SRA_accession`` and
            ``organism_name_bioproject``.
        original_file: Comma-separated file with an ``Assembly`` column, which
            is used as the join key on that side.
        output_file: Where the joined table is written as Excel. Defaults to
            the path that was previously hard-coded.

    Returns:
        The first rows (``DataFrame.head()``) of the joined table.
    """
    filtered = pd.read_excel(created_file, index_col=0)
    original = pd.read_csv(original_file, delimiter=",")
    # Records without an assembly accession can never match the join key
    original = original[original["Assembly"].notna()]

    joined = (
        filtered.set_index(0)
        .join(original.set_index("Assembly"))
        .rename(columns={1: "bioprojects",
                         2: "SRA_accession",
                         3: "organism_name_bioproject"})
    )
    joined.to_excel(output_file)
    return joined.head()

In [13]:
# GO THROUGH EVERY RECORD AND FILTER MITOCHONDRIAL GENOMES BY SRA
def filt_mito(source_file="organelles.csv", output_file="mito_ncbi.xlsx", batch_size=500):
    """Query the genome API for every BioProject and save the SRA-linked assemblies.

    Args:
        source_file: CSV handed to ``parse_data`` to obtain the BioProjects.
        output_file: Excel file the collected tuples are written to.
        batch_size: Number of BioProjects sent per API request. The default of
            500 is the limit stated in the original author's note.

    Returns:
        None. The result is written to ``output_file`` and a status line is
        printed.
    """
    # Drop missing entries and repeated BioProjects (first occurrence wins, so
    # order is kept): asking for the same project twice only repeats requests
    # and duplicates rows in the output.
    bioprojects = list(dict.fromkeys(
        accession for accession in parse_data(source_file) if pd.notna(accession)
    ))

    result = list()
    # ITERATE IN BATCHES SINCE THE API DOES NOT SUPPORT MORE THAN THAT PER CALL
    for start in range(0, len(bioprojects), batch_size):
        batch = bioprojects[start:start + batch_size]
        # extend() grows the list in place instead of copying it every batch
        result.extend(get_metadata_genome_api(batch))

    dataframe = pd.DataFrame(result)
    dataframe.to_excel(output_file)
    print ("MITOCHONDRIAS FILTERED BY SRA")

In [81]:
# GET SRA METADATA OF FILTERED MITOCHONDRIAL GENOMES
def _extract_sra_accession(cell):
    """Return the SRA identifier stored in one ``SRA_accession`` cell, or None.

    The cell is expected to hold the text form of a list of dicts; the first
    dict that contains the value ``"SRA"`` supplies the result through its
    ``"value"`` key.
    """
    if not isinstance(cell, str):
        # Empty spreadsheet cells are read back as NaN
        return None
    try:
        records = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return None
    if not isinstance(records, (list, tuple)):
        return None
    for record in records:
        if isinstance(record, dict) and "SRA" in record.values():
            accession = record.get("value")
            if accession:
                return accession
    return None


def get_sra_info_filtered_mito(input_file="mito_ncbi_final.xlsx", output_file="sra_metadata.xlsx"):
    """Fetch SRA metadata for the filtered genomes and store a reduced table.

    Args:
        input_file: Excel workbook with an ``SRA_accession`` column.
        output_file: Excel file that receives the ``organism_name``,
            ``instrument``, ``instrument_model`` and ``total_size`` columns of
            the metadata returned by ``SRAweb.sra_metadata``.

    Returns:
        The first rows (``DataFrame.head()``) of that table, sorted by and
        indexed on ``organism_name``.
    """
    db = SRAweb()
    sra = pd.read_excel(input_file, index_col=0)

    # One accession per row at most; rows without a usable SRA id are skipped
    sra_list = [
        accession
        for accession in map(_extract_sra_accession, sra['SRA_accession'])
        if accession is not None
    ]

    df = db.sra_metadata(sra_list)
    df = df[['organism_name', "instrument", 'instrument_model',
             'total_size']]
    df.to_excel(output_file)
    return df.sort_values(by='organism_name').set_index('organism_name').head()

In [240]:
# DESCRIPTIVE STATISTICS
from itertools import count


def analyse_dataset (file):
    """Count the rows per (organism, instrument) pair and their share of the total.

    Args:
        file: Excel workbook read with its first column as index; it needs the
            columns ``organism_name`` and ``instrument``.

    Returns:
        The first 15 rows of a frame indexed by ``organism_name`` and
        ``instrument`` with two columns: ``instrument`` (number of rows of
        that pair) and ``percentage`` (that number relative to the sum of all
        pair counts, times 100).
    """
    records = pd.read_excel(file, index_col=0)
    per_pair = records.groupby(['organism_name', 'instrument'])
    counts = per_pair.agg({'instrument': 'count'})

    # Share of every pair in the overall number of counted rows
    share = counts['instrument'] / counts['instrument'].sum() * 100
    summary = counts.assign(percentage=share)
    return summary.head(15)

In [241]:
# Earlier pipeline stages, left disabled. Re-enable them in this order to
# regenerate the Excel files each following stage reads:
#   filt_mito()                                  -> mito_ncbi.xlsx
#   get_more_info_filtered_mito("mito_ncbi.xlsx", 'organelles.csv')
#                                                -> mito_ncbi_final.xlsx
#   get_sra_info_filtered_mito()                 -> sra_metadata.xlsx
# Final stage: summarise the stored SRA metadata.
analyse_dataset("sra_metadata.xlsx")

Unnamed: 0_level_0,Unnamed: 1_level_0,instrument,percentage
organism_name,instrument,Unnamed: 2_level_1,Unnamed: 3_level_1
Abiotrophia defectiva,Illumina HiSeq 4000,1,0.02058
Abiotrophia defectiva,PacBio RS,2,0.041161
Acanthaster planci,Illumina HiSeq 2000,8,0.164643
Acanthaster planci,Illumina MiSeq,3,0.061741
Acanthochromis polyacanthus,PacBio RS II,109,2.24326
Acer yangbiense,HiSeq X Ten,2,0.041161
Acer yangbiense,Sequel,1,0.02058
Achromobacter deleyi,Illumina HiSeq 4000,1,0.02058
Achromobacter deleyi,PacBio RS,2,0.041161
Achromobacter denitrificans,Illumina HiSeq 4000,3,0.061741
