# Query SRA metadata using BigQuery

This notebook uses [pandas-gbq](https://pandas-gbq.readthedocs.io/) to run the query.

Before you begin, you must create a Google Cloud Platform project. Use the BigQuery sandbox to try the service for free.

If you do not provide any credentials, this module attempts to load credentials from the environment. If no credentials are found, pandas-gbq prompts you to open a web browser, where you can grant it permissions to access your cloud resources. These credentials are only used locally.

In [1]:
import pandas_gbq
import pandas as pd

In [2]:
# Build the SQL text that is sent to BigQuery.
# The bioproject PRJNA523380 holds the CCLE cell lines; the assay_type
# filter keeps only the RNA-Seq runs.
# LIMIT 5000 caps the number of rows that can come back.
query = """
SELECT * FROM `nih-sra-datastore.sra.metadata` 
WHERE bioproject = 'PRJNA523380' AND assay_type = 'RNA-Seq'
LIMIT 5000
"""

In [3]:
# Run the query against the metadata table in BigQuery.
# This might take some time.
# The result comes back as a dataframe.
df = pandas_gbq.read_gbq(query)

Downloading: 100%|██████████|


# Goal here is to curate metadata for the results of the query
I specifically want a dataframe with the desired metadata as well as a dictionary with the same information.

Hoping to be able to use the dictionary to add the metadata to the objects in the GCS bucket.

In [76]:
# TODO: refactor these functions to be more generic
# Also improve efficiency by not iterating over the entire dataframe
# Maybe use a dictionary comprehension instead

def convert_array_to_dict(arr, desired_keys=None):
    """Flatten an array of ``{'k': ..., 'v': ...}`` attribute records into a dict.

    Only records whose ``'k'`` is listed in ``desired_keys`` are kept:

    * ``bytes`` is not stored under its own name; it becomes
      ``size_in_bytes`` (int) and ``size_in_GB`` (bytes / 10**9, rounded to
      two decimals).
    * ``run_file_create_date`` is stored as a string.
    * Any other key keeps its value unchanged. When the same key occurs more
      than once, the values are collected into a list in encounter order.

    Args:
        arr: Iterable of attribute records, or ``None``. Items that are not
            dicts are skipped.
        desired_keys: Optional collection of attribute keys to keep. When
            omitted, the default set used for the curated SRA metadata is
            applied.

    Returns:
        A new dict with the selected attributes; empty when ``arr`` is
        ``None`` or contains nothing of interest.
    """
    # Compare with `is None` rather than truthiness: arr may be an array
    # type whose truth value is ambiguous.
    if arr is None:
        return {}

    if desired_keys is None:
        desired_keys = (
            'bases',
            'bytes',
            'run_file_create_date',
            'disease_sam',
            'disease_stage_sam_s_dpl172',
            'tissue_sam',
        )

    result_dict = {}
    for item in arr:
        if not isinstance(item, dict):
            continue
        key = item.get('k')
        if key not in desired_keys:
            continue
        value = item.get('v')

        if key == 'bytes':
            result_dict['size_in_bytes'] = int(value)
            result_dict['size_in_GB'] = round(float(value) / 1000000000, 2)
        elif key == 'run_file_create_date':
            result_dict[key] = str(value)
        elif key not in result_dict:
            result_dict[key] = value
        elif isinstance(result_dict[key], list):
            # Third or later occurrence: extend the list built earlier.
            result_dict[key].append(value)
        else:
            # Second occurrence: promote the scalar to a list.
            result_dict[key] = [result_dict[key], value]
    return result_dict

def convert_row_to_dict(row):
    """Turn one dataframe row into a flat dictionary.

    The row's labels are visited in order:

    * ``attributes`` is expanded through ``convert_array_to_dict`` and the
      resulting entries are merged into the output.
    * ``releasedate`` and ``run_file_create_date`` are stored as strings
      (e.g. a Timestamp becomes its text form).
    * Every other label is copied over unchanged.
    """
    as_text = ('releasedate', 'run_file_create_date')
    flat = {}
    for label in row.index:
        cell = row[label]
        if label == 'attributes':
            flat.update(convert_array_to_dict(cell))
            continue
        flat[label] = str(cell) if label in as_text else cell
    return flat


def convert_dataframe_to_dict(df):
    """Build a ``{acc: row dict}`` mapping from a metadata dataframe.

    Every row is flattened with ``convert_row_to_dict`` and stored under that
    row's ``acc`` value, so ``df`` needs an ``acc`` column. If an ``acc``
    value occurs in several rows, the last such row wins.

    The key is read from the row itself. Looking it up as ``df['acc'][i]``
    is a label-based access, which only lines up with the positional
    ``df.iloc[i]`` when the index happens to be 0..n-1; on a filtered,
    sorted or otherwise re-indexed frame it raises KeyError or pairs the
    wrong accession with the row.

    Args:
        df: Dataframe with one run per row.

    Returns:
        Dict mapping each ``acc`` value to its flattened row dictionary;
        empty for an empty dataframe.
    """
    return {
        row['acc']: convert_row_to_dict(row)
        for _, row in df.iterrows()
    }


In [53]:
# Scalar columns to carry over into the curated metadata
columns = [
    'acc',
    'sample_name',
    'sample_acc',
    'experiment',
    'library_name',
    'sra_study',
    'center_name',
    'platform',
    'assay_type',
    'librarysource',
    'organism',
    'releasedate',
]

# Work on an independent copy restricted to those columns plus the nested
# 'attributes' column
df_ = df[[*columns, 'attributes']].copy()

# Summarise each chosen column: how many distinct values it holds, followed
# by up to five of them
for col in columns:
    print(f"{col} has {df[col].nunique()} unique values. \nExamples: {df[col].unique()[0:5]}\n")

acc has 1019 unique values. 
Examples: ['SRR8616020' 'SRR8615545' 'SRR8615484' 'SRR8615991' 'SRR8615876']

sample_name has 1019 unique values. 
Examples: ['A375_SKIN' 'LN443_CENTRAL_NERVOUS_SYSTEM' 'JHESOAD1_OESOPHAGUS'
 'COLO680N_OESOPHAGUS' 'OV56_OVARY']

sample_acc has 1019 unique values. 
Examples: ['SRS4395948' 'SRS4395810' 'SRS4395335' 'SRS4395972' 'SRS4396070']

experiment has 1019 unique values. 
Examples: ['SRX5415030' 'SRX5414855' 'SRX5414269' 'SRX5415059' 'SRX5415174']

library_name has 1019 unique values. 
Examples: ['RNASeq-A375_SKIN' 'RNASeq-LN443_CENTRAL_NERVOUS_SYSTEM'
 'RNASeq-JHESOAD1_OESOPHAGUS' 'RNASeq-COLO680N_OESOPHAGUS'
 'RNASeq-OV56_OVARY']

sra_study has 1 unique values. 
Examples: ['SRP186687']

center_name has 1 unique values. 
Examples: ['BROAD INSTITUTE']

platform has 1 unique values. 
Examples: ['ILLUMINA']

assay_type has 1 unique values. 
Examples: ['RNA-Seq']

librarysource has 1 unique values. 
Examples: ['TRANSCRIPTOMIC']

organism has 1 unique value

In [77]:
# Flatten the subsetted frame (df_, which still carries the 'attributes'
# column) into a dictionary keyed by each row's 'acc' value
dict_metadata = convert_dataframe_to_dict(df_)
# Bare expression so the notebook displays the dictionary as the cell output
dict_metadata

{'SRR8616020': {'acc': 'SRR8616020',
  'sample_name': 'A375_SKIN',
  'sample_acc': 'SRS4395948',
  'experiment': 'SRX5415030',
  'library_name': 'RNASeq-A375_SKIN',
  'sra_study': 'SRP186687',
  'center_name': 'BROAD INSTITUTE',
  'platform': 'ILLUMINA',
  'assay_type': 'RNA-Seq',
  'librarysource': 'TRANSCRIPTOMIC',
  'organism': 'Homo sapiens',
  'releasedate': '2019-03-27 00:00:00+00:00',
  'bases': '15501388292',
  'size_in_bytes': 7802508320,
  'size_in_GB': 7.8,
  'disease_sam': 'malignant_melanoma',
  'disease_stage_sam_s_dpl172': 'primary',
  'tissue_sam': 'skin'},
 'SRR8615545': {'acc': 'SRR8615545',
  'sample_name': 'LN443_CENTRAL_NERVOUS_SYSTEM',
  'sample_acc': 'SRS4395810',
  'experiment': 'SRX5414855',
  'library_name': 'RNASeq-LN443_CENTRAL_NERVOUS_SYSTEM',
  'sra_study': 'SRP186687',
  'center_name': 'BROAD INSTITUTE',
  'platform': 'ILLUMINA',
  'assay_type': 'RNA-Seq',
  'librarysource': 'TRANSCRIPTOMIC',
  'organism': 'Homo sapiens',
  'releasedate': '2019-03-27 00:0

In [80]:
# Build a table with one row per dictionary key (the keys become the index)
# and expose the 'acc' column under the name 'run_accession'
df_metadata = pd.DataFrame.from_dict(dict_metadata, orient='index').rename(
    columns={'acc': 'run_accession'}
)
df_metadata

Unnamed: 0,run_accession,sample_name,sample_acc,experiment,library_name,sra_study,center_name,platform,assay_type,librarysource,organism,releasedate,bases,size_in_bytes,size_in_GB,disease_sam,disease_stage_sam_s_dpl172,tissue_sam,run_file_create_date
SRR8616020,SRR8616020,A375_SKIN,SRS4395948,SRX5415030,RNASeq-A375_SKIN,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,15501388292,7802508320,7.80,malignant_melanoma,primary,skin,
SRR8615545,SRR8615545,LN443_CENTRAL_NERVOUS_SYSTEM,SRS4395810,SRX5414855,RNASeq-LN443_CENTRAL_NERVOUS_SYSTEM,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,16018583436,8570652102,8.57,glioma (astrocytoma_Grade_IV),,central_nervous_system,2019-02-23T23:31:00.000Z
SRR8615484,SRR8615484,JHESOAD1_OESOPHAGUS,SRS4395335,SRX5414269,RNASeq-JHESOAD1_OESOPHAGUS,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,16625606768,8972531697,8.97,carcinoma (barrett_associated_adenocarcinoma),,oesophagus,2019-02-23T23:25:00.000Z
SRR8615991,SRR8615991,COLO680N_OESOPHAGUS,SRS4395972,SRX5415059,RNASeq-COLO680N_OESOPHAGUS,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,21067870376,11211586013,11.21,carcinoma (squamous_cell_carcinoma),primary,oesophagus,2019-02-24T00:45:00.000Z
SRR8615876,SRR8615876,OV56_OVARY,SRS4396070,SRX5415174,RNASeq-OV56_OVARY,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,21288728086,11189454192,11.19,carcinoma,primary,ovary,2019-02-24T00:21:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8616113,SRR8616113,NCIH2286_LUNG,SRS4395876,SRX5414937,RNASeq-NCIH2286_LUNG,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,19132622506,9442799741,9.44,carcinoma (small_cell_carcinoma),primary,lung,2019-02-24T00:57:00.000Z
SRR8615935,SRR8615935,SNU175_LARGE_INTESTINE,SRS4395226,SRX5415115,RNASeq-SNU175_LARGE_INTESTINE,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,17110555642,8598498804,8.60,carcinoma,primary,large_intestine,2019-02-24T00:23:00.000Z
SRR8615470,SRR8615470,TE4_OESOPHAGUS,SRS4395348,SRX5414283,RNASeq-TE4_OESOPHAGUS,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,12096084008,5789228790,5.79,carcinoma (squamous_cell_carcinoma),primary,oesophagus,2019-02-23T23:10:00.000Z
SRR8615458,SRR8615458,HCC38_BREAST,SRS4395357,SRX5414295,RNASeq-HCC38_BREAST,SRP186687,BROAD INSTITUTE,ILLUMINA,RNA-Seq,TRANSCRIPTOMIC,Homo sapiens,2019-03-27 00:00:00+00:00,19523122846,9601053388,9.60,carcinoma (ductal_carcinoma),primary,breast,2019-02-23T23:20:00.000Z


In [82]:
# Persist both views of the curated metadata side by side.
import json
from pathlib import Path

metadata_dir = Path('../../metadata')
# Neither to_csv() nor open() creates missing parent directories, so make
# sure the target directory exists before writing.
metadata_dir.mkdir(parents=True, exist_ok=True)

# save the dataframe to a csv file; the index is left out (index=False)
df_metadata.to_csv(metadata_dir / 'sra_metadata.csv', index=False)

# save the dictionary to a json file (explicit encoding so the result does
# not depend on the platform default)
with open(metadata_dir / 'sra_metadata.json', 'w', encoding='utf-8') as fp:
    json.dump(dict_metadata, fp)

