### Get fasta headers for use in drop down menu

In [124]:

from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd
import re

# SQLite path
db_path = 'sqlite:///../../../data/SQLite Database/20200527/Covid-19 Study DB.sqlite'

omics_id_dict = {
        "proteomics":1,
        "lipidomics":2,
        "metabolomics":3,
        "transcriptomics":4
    }

def get_biomolecule_names(dataset='proteomics'):

    omics_id = omics_id_dict[dataset]

    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
    engine = create_engine(db_path)

    # Establish connection
    connection = engine.connect()

    query = "SELECT * from biomolecules WHERE omics_id={} and KEEP=1".format(omics_id)
    # get biomolecule names
    biomolecules_df = pd.read_sql_query(query, connection)

    # build biomolecule name dict and drop list
    biomolecule_name_dict = {}
    for index, row in biomolecules_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])
        standardized_name = row['standardized_name']
        biomolecule_name_dict[biomolecule_id] = standardized_name

    # return dictionary with biomolecule ids and standard names

    if not dataset=="proteomics":

        # close DB connection
        connection.close()



        return biomolecule_name_dict

    # for proteomics data, return fasta headers instead

    query = "SELECT * from metadata"
    # get biomolecule names
    metadata_df = pd.read_sql_query(query, connection)

    ## NOTE: Could swap this with fasta headers once they're available
    gene_name_df = metadata_df[metadata_df['metadata_type'] == 'gene_name']
    gene_name_df = gene_name_df.astype({'biomolecule_id': 'str'})
    
    fasta_header_df = metadata_df[metadata_df['metadata_type'] == 'fasta_header']
    fasta_header_df = fasta_header_df.astype({'biomolecule_id': 'str'})

    for biomolecule_id in biomolecule_name_dict:
        # update to gene name
        #gene_name = gene_name_df[gene_name_df['biomolecule_id']==biomolecule_id]['metadata_value'].values[0]
        fasta_header = fasta_header_df[fasta_header_df['biomolecule_id']==biomolecule_id]['metadata_value'].values[0]
        fasta_header = re.search("\s(.*?)\sO[SX]=", fasta_header).group(1)
        biomolecule_name_dict[biomolecule_id] = fasta_header
        

    # close DB connection
    connection.close()

    return biomolecule_name_dict, fasta_header_df

In [125]:
biomolecule_name_dict, fasta_header_df = get_biomolecule_names()

In [126]:
biomolecule_name_dict

{'7593': 'Alpha-1-antitrypsin',
 '7596': 'Immunoglobulin lambda variable 4-69',
 '7597': 'Immunoglobulin lambda variable 8-61',
 '7599': 'Immunoglobulin lambda variable 10-54',
 '7600': 'Immunoglobulin lambda variable 7-46',
 '7601': 'Immunoglobulin lambda variable 5-37',
 '7602': 'Immunoglobulin lambda variable 2-18',
 '7605': 'Immunoglobulin lambda variable 3-10',
 '7606': 'Immunoglobulin lambda variable 3-9',
 '7607': 'Immunoglobulin kappa variable 2-28',
 '7608': 'Immunoglobulin heavy variable 3-64',
 '7610': 'Immunoglobulin kappa variable 2D-29',
 '7611': 'Immunoglobulin kappa variable 1-27',
 '7612': 'Immunoglobulin kappa variable 1-37 (non-functional) (Fragment)',
 '7613': 'Immunoglobulin heavy variable 3/OR16-12 (non-functional) (Fragment)',
 '7614': 'Immunoglobulin heavy variable 1/OR15-1 (non-functional) (Fragment)',
 '7617': 'Immunoglobulin heavy variable 4-30-2',
 '7618': 'Carboxypeptidase B2',
 '7619': 'Immunoglobulin kappa variable 3D-15',
 '7622': 'Proteasome subunit bet

In [114]:
example_fasta_header = biomolecule_name_dict['7593']
print(example_fasta_header)
print()
fasta_header_regex = re.search("(.*?)\sO[SX]=", example_fasta_header).group(1)
print(fasta_header_regex)

Alpha-1-antitrypsin OS=Homo sapiens OX=9606 GN=SERPINA1 PE=1 SV=1;tr|A0A0G2JRN3|A0A0G2JRN3_HUMAN Alpha-1-antitrypsin OS=Homo sapiens

Alpha-1-antitrypsin


In [28]:
fasta_header_df

Unnamed: 0,metadata_id,biomolecule_id,metadata_type,metadata_value
45444,45445,7593,fasta_header,tr|A0A024R6I7|A0A024R6I7_HUMAN Alpha-1-antitry...
45445,45446,7594,fasta_header,tr|A0A024RA52|A0A024RA52_HUMAN Proteasome subu...
45446,45447,7595,fasta_header,tr|A0A075B6H7|A0A075B6H7_HUMAN Immunoglobulin ...
45447,45448,7596,fasta_header,sp|A0A075B6H9|LV469_HUMAN Immunoglobulin lambd...
45448,45449,7597,fasta_header,sp|A0A075B6I0|LV861_HUMAN Immunoglobulin lambd...
...,...,...,...,...
46184,46185,8333,fasta_header,tr|U3KPS2|U3KPS2_HUMAN Myeloblastin OS=Homo sa...
46185,46186,8334,fasta_header,tr|U3KQK0|U3KQK0_HUMAN Histone H2B OS=Homo sap...
46186,46187,8335,fasta_header,tr|V9GY83|V9GY83_HUMAN Low affinity immunoglob...
46187,46188,8336,fasta_header,tr|V9GYM3|V9GYM3_HUMAN Apolipoprotein A-II OS=...


In [24]:
biomolecule_id

'7593'

In [35]:
fasta_header_df[fasta_header_df['biomolecule_id']==biomolecule_id]

Unnamed: 0,metadata_id,biomolecule_id,metadata_type,metadata_value
46188,46189,8337,fasta_header,tr|X6R8F3|X6R8F3_HUMAN Neutrophil gelatinase-a...
