# CCLE RAW DATA EXPLORATION
### This notebook reads a set of metadata files from several sources and combines them into a database of metadata and file locations for the raw CCLE data

In [1]:
import pandas as pd
import requests
import os

# Data from the TERRA platform
https://app.terra.bio/#workspaces/fccredits-silver-tan-7621/CCLE_v2

Each cell of the exported table contains a link (`gs://...`) to the Google bucket location of the corresponding file for that sample.

Obtained: August 2023

The following raw sequencing data is available for download:

- RNAseq (1025) [HG38]
- WES (478) [HG38 + HG19]
- WGS (329) [HG38 + HG19]
- RRBS (unfiltered) (927) [HG19]
- HC (976) [HG19]
- Raindance (782) [HG19]



In [2]:
# Load the tab-separated sample table exported from the Terra workspace.
# pd.read_table uses a tab as its default separator.
file_path = './Terra_ExportedList.tsv'
terra_df = pd.read_table(file_path)
terra_df.head()

Unnamed: 0,entity:sample_id,hg19_RRBS_bai,hg19_RRBS_bam,hg19_hybrid_capture_bai,hg19_hybrid_capture_bam,hg19_raindance_bai,hg19_raindance_bam,hg19_targeted_bai,hg19_targeted_bam,hg38_rna_bai,...,hg38_wes_bai,hg38_wes_bam,hg38_wgs_bai,hg38_wgs_bam,mutect2_parquet_wes,mutect2_parquet_wgs,mutect2_vcf_wes,mutect2_vcf_wgs,participant,stripped_cell_line_name
0,ACH-000001,gs://cclebams/RRBS/G29750/NIH_OVCAR-3/v1/NIH_O...,gs://cclebams/RRBS/G29750/NIH_OVCAR-3/v1/NIH_O...,gs://cclebams/hybrid_capture/NIHOVCAR3_OVARY.bai,gs://cclebams/hybrid_capture/NIHOVCAR3_OVARY.bam,gs://cclebams/raindance/G16640/NIH_OVCAR-3/cur...,gs://cclebams/raindance/G16640/NIH_OVCAR-3/cur...,,,gs://cclebams/rnasq_hg38/CDS-VqxBGH.Aligned.so...,...,,,gs://cclebams/wgs_hg38/CDS-1pLoxn.wgs_ccle.bai,gs://cclebams/wgs_hg38/CDS-1pLoxn.wgs_ccle.bam,,"[""gs://ccle-mutation/mutect2_parquet/0f5356e34...",,gs://ccle-mutation/mutect2_vcf/CDS-1pLoxn_fixe...,ACH-000001,NIHOVCAR3
1,ACH-000002,gs://cclebams/RRBS/G29750/HL-60/v2/HL-60.bai,gs://cclebams/RRBS/G29750/HL-60/v2/HL-60.bam,gs://cclebams/hybrid_capture/HL-60.bai,gs://cclebams/hybrid_capture/HL-60.bam,gs://cclebams/raindance/HL-60.bai,gs://cclebams/raindance/HL-60.bam,,,gs://cclebams/rnasq_hg38/CDS-AExPJi.Aligned.so...,...,,,,,,,,,ACH-000002,HL60
2,ACH-000003,,,,,,,,,gs://cclebams/rnasq_hg38/CDS-TyidZO.Aligned.so...,...,,,,,,,,,ACH-000003,CACO2
3,ACH-000004,gs://cclebams/RRBS/G48942/HEL/current/HEL.bai,gs://cclebams/RRBS/G48942/HEL/current/HEL.bam,gs://cclebams/hybrid_capture/HEL_HAEMATOPOIETI...,gs://cclebams/hybrid_capture/HEL_HAEMATOPOIETI...,,,,,gs://cclebams/rnasq_hg38/CDS-Sl0JI6.Aligned.so...,...,gs://cclebams/hg38_wes/CDS-7ffA1d.hg38.bam.bai,gs://cclebams/hg38_wes/CDS-7ffA1d.hg38.bam,,,"[""gs://ccle-mutation/mutect2_parquet/d7afb11fb...",,gs://ccle-mutation/mutect2_vcf/CDS-7ffA1d_fixe...,,ACH-000004,HEL
4,ACH-000005,gs://cclebams/RRBS/G29750/HEL_92.1.7/v2/HEL_92...,gs://cclebams/RRBS/G29750/HEL_92.1.7/v2/HEL_92...,gs://cclebams/hybrid_capture/HEL9217_HAEMATOPO...,gs://cclebams/hybrid_capture/HEL9217_HAEMATOPO...,gs://cclebams/raindance/G16640/HEL_92.1.7/curr...,gs://cclebams/raindance/G16640/HEL_92.1.7/curr...,,,gs://cclebams/rnasq_hg38/CDS-wsWPxR.Aligned.so...,...,gs://cclebams/hg38_wes/CDS-bweujf.hg38.bam.bai,gs://cclebams/hg38_wes/CDS-bweujf.hg38.bam,gs://cclebams/wgs_hg38/CDS-17moNf.wgs_ccle.bai,gs://cclebams/wgs_hg38/CDS-17moNf.wgs_ccle.bam,"[""gs://ccle-mutation/mutect2_parquet/04a2bf1d8...",,gs://ccle-mutation/mutect2_vcf/CDS-bweujf_fixe...,,ACH-000005,HEL9217


In [3]:
# Summarize terra_df: column names, non-null counts per column, and dtypes
terra_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060 entries, 0 to 1059
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   entity:sample_id         1060 non-null   object
 1   hg19_RRBS_bai            927 non-null    object
 2   hg19_RRBS_bam            927 non-null    object
 3   hg19_hybrid_capture_bai  973 non-null    object
 4   hg19_hybrid_capture_bam  973 non-null    object
 5   hg19_raindance_bai       782 non-null    object
 6   hg19_raindance_bam       782 non-null    object
 7   hg19_targeted_bai        187 non-null    object
 8   hg19_targeted_bam        187 non-null    object
 9   hg38_rna_bai             1025 non-null   object
 10  hg38_rna_bam             1025 non-null   object
 11  hg38_wes_bai             474 non-null    object
 12  hg38_wes_bam             474 non-null    object
 13  hg38_wgs_bai             356 non-null    object
 14  hg38_wgs_bam             356 non-null   

# Data from ENA Web Browser
https://www.ebi.ac.uk/ena/browser/view/PRJNA523380

Obtained: August 2023

In [4]:
# Load the ENA browser export for study PRJNA523380 (tab-separated) into ena_df.
# pd.read_table uses a tab as its default separator.
ena_df = pd.read_table('./ENA_browser_PRJNA523380.tsv')
ena_df.head()

Unnamed: 0,run_accession,sample_accession,secondary_sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,library_layout,library_source,...,study_title,run_alias,fastq_bytes,fastq_md5,fastq_ftp,sra_bytes,sra_md5,sra_ftp,sample_alias,sample_title
0,SRR8615033,SAMN10988141,SRS4395287,SRX5414207,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,Cancer Cell Line Encyclopedia (CCLE),HCC-56.bam,7773624;10209268,ec71a964d91c08dd4161a5c51e8077ec;f9a75f6d1eb4b...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR861...,18104962,be2409df4f8c038b478bdef7d40683bc,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/003/SRR8615033,HCC56_LARGE_INTESTINE,HCC56_LARGE_INTESTINE
1,SRR8615034,SAMN10988287,SRS4395286,SRX5414206,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,Cancer Cell Line Encyclopedia (CCLE),HCC-78.bam,7530079;9978815,d4fd3f3baaefd30dd1b2ec8157737401;5110277bed499...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/004/SRR861...,17593943,caa4df8f8b40e416f0fa6b7f2c9328ae,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/004/SRR8615034,HCC78_LUNG,HCC78_LUNG
2,SRR8615035,SAMN10989587,SRS4395285,SRX5414205,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,Cancer Cell Line Encyclopedia (CCLE),HOP-62.bam,8058229;12501315,0912b268ba684b093da75cec18eea6d0;1112820fb4070...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/005/SRR861...,21029304,7d4124ffdd0d7dc14f3ce2a22ff4c1d2,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/005/SRR8615035,HOP62_LUNG,HOP62_LUNG
3,SRR8615042,SAMN10987804,SRS4395278,SRX5414198,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,Cancer Cell Line Encyclopedia (CCLE),NCI-H810.bam,13081808;17856519,89cd15c12b0e3f6d4aa8fa4d930c3768;ec17e81d32c65...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/002/SRR861...,30894571,984e7fe6fbcbdbf30c5e3ae1dd57cd59,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/002/SRR8615042,NCIH810_LUNG,NCIH810_LUNG
4,SRR8615047,SAMN10987678,SRS4395273,SRX5414193,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,Cancer Cell Line Encyclopedia (CCLE),NCI-H322.bam,10400525;13057832,40188cfc8cb992e831837517b08b7216;1d9cda042e114...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/007/SRR861...,23711511,648268ec2fe15e87a64f0c0478bf825c,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/007/SRR8615047,NCIH322_LUNG,NCIH322_LUNG


In [5]:
# Summarize ena_df: column names, non-null counts per column, and dtypes

ena_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   run_accession               4550 non-null   object
 1   sample_accession            4550 non-null   object
 2   secondary_sample_accession  4550 non-null   object
 3   experiment_accession        4550 non-null   object
 4   study_accession             4550 non-null   object
 5   tax_id                      4550 non-null   int64 
 6   scientific_name             4550 non-null   object
 7   instrument_platform         4550 non-null   object
 8   library_layout              4550 non-null   object
 9   library_source              4550 non-null   object
 10  read_count                  4550 non-null   int64 
 11  base_count                  4550 non-null   int64 
 12  first_public                4550 non-null   object
 13  last_updated                4550 non-null   obje

# Data from SRA RUN SELECTOR
https://www.ncbi.nlm.nih.gov/Traces/study/?query_key=2&WebEnv=MCID_64deb5fe5542df760f04c853&o=acc_s%3Aa > Next to 4550 Runs, click "Metadata" 

Obtained: August 2023 

In [6]:
# Load the NCBI SRA Run Selector metadata table (comma-separated) into sra_df.
# A comma is read_csv's default separator, so it is not passed explicitly.
sra_df = pd.read_csv('./NCBI_SRA_runTable.csv')
sra_df.head()

Unnamed: 0,Run,Age,Assay Type,AssemblyName,AvgSpotLen,Bases,BIOMATERIAL_PROVIDER,BioProject,BioSample,BioSampleModel,...,Organism,Platform,ReleaseDate,Sample Name,Sample_type,sex,SRA Study,Tissue,create_date,version
0,SRR8633202,66.0,Bisulfite-Seq,GCA_000001405.13,29,1556265367,DSMZ:HUP-T3,PRJNA523380,SAMN10987811,Human,...,Homo sapiens,ILLUMINA,2019-03-27T00:00:00Z,HUPT3_PANCREAS,cell culture,male,SRP186687,pancreas,2019-02-27T12:24:00Z,1.0
1,SRR8633203,74.0,Bisulfite-Seq,GCA_000001405.13,29,1727947890,ATCC:Hs 578T,PRJNA523380,SAMN10987893,Human,...,Homo sapiens,ILLUMINA,2019-03-27T00:00:00Z,HS578T_BREAST,cell culture,female,SRP186687,breast,2019-02-27T12:25:00Z,1.0
2,SRR8633204,21.0,Bisulfite-Seq,GCA_000001405.13,29,3430374736,ATCC:Hs 604.T,PRJNA523380,SAMN10987929,Human,...,Homo sapiens,ILLUMINA,2019-03-27T00:00:00Z,HS604T_FIBROBLAST,cell culture,male,SRP186687,haematopoietic_and_lymphoid_tissue,2019-02-27T12:31:00Z,1.0
3,SRR8633205,,Bisulfite-Seq,GCA_000001405.13,29,1359691158,RIKEN:HuH-6,PRJNA523380,SAMN10987713,Human,...,Homo sapiens,ILLUMINA,2019-03-27T00:00:00Z,HUH6_LIVER,cell culture,male,SRP186687,liver,2019-02-27T12:23:00Z,1.0
4,SRR8633206,,Bisulfite-Seq,GCA_000001405.13,29,2005765425,HSSRB,PRJNA523380,SAMN10987843,Human,...,Homo sapiens,ILLUMINA,2019-03-27T00:00:00Z,HUH28_BILIARY_TRACT,cell culture,,SRP186687,biliary_tract,2019-02-27T12:26:00Z,1.0


In [7]:
# Summarize sra_df: column names, non-null counts per column, and dtypes
sra_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Run                   4550 non-null   object 
 1   Age                   3573 non-null   float64
 2   Assay Type            4550 non-null   object 
 3   AssemblyName          4550 non-null   object 
 4   AvgSpotLen            4550 non-null   int64  
 5   Bases                 4550 non-null   int64  
 6   BIOMATERIAL_PROVIDER  4183 non-null   object 
 7   BioProject            4550 non-null   object 
 8   BioSample             4550 non-null   object 
 9   BioSampleModel        4550 non-null   object 
 10  Bytes                 4550 non-null   int64  
 11  Cell_line             4545 non-null   object 
 12  Center Name           4550 non-null   object 
 13  Consent               4550 non-null   object 
 14  DATASTORE filetype    4550 non-null   object 
 15  DATASTORE provider   

# Getting RNA

In [8]:

# Show the cell-line identifiers used by the Terra table (single-column DataFrame)
terra_df[['stripped_cell_line_name']]

Unnamed: 0,stripped_cell_line_name
0,NIHOVCAR3
1,HL60
2,CACO2
3,HEL
4,HEL9217
...,...
1055,8305C
1056,8505C
1057,PLCPRF5
1058,TT


In [9]:
# Each value in ena_df['sample_alias'] has the form {CELLLINE}_{TISSUE},
# e.g. HCC56_LARGE_INTESTINE. The tissue part can itself contain underscores,
# so split on the FIRST underscore only (n=1) and store the two pieces in the
# new columns 'CELLLINE' and 'TISSUE'.
#
# `n` and `expand` are passed by keyword: passing `n` positionally triggers a
# FutureWarning on pandas 1.x and raises a TypeError on pandas >= 2.0, where
# every argument of Series.str.split except `pat` is keyword-only.
ena_df[['CELLLINE', 'TISSUE']] = ena_df['sample_alias'].str.split('_', n=1, expand=True)
ena_df.head()

  ena_df[['CELLLINE', 'TISSUE']] = ena_df['sample_alias'].str.split('_', 1, expand=True)


Unnamed: 0,run_accession,sample_accession,secondary_sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,library_layout,library_source,...,fastq_bytes,fastq_md5,fastq_ftp,sra_bytes,sra_md5,sra_ftp,sample_alias,sample_title,CELLLINE,TISSUE
0,SRR8615033,SAMN10988141,SRS4395287,SRX5414207,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,7773624;10209268,ec71a964d91c08dd4161a5c51e8077ec;f9a75f6d1eb4b...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/003/SRR861...,18104962,be2409df4f8c038b478bdef7d40683bc,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/003/SRR8615033,HCC56_LARGE_INTESTINE,HCC56_LARGE_INTESTINE,HCC56,LARGE_INTESTINE
1,SRR8615034,SAMN10988287,SRS4395286,SRX5414206,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,7530079;9978815,d4fd3f3baaefd30dd1b2ec8157737401;5110277bed499...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/004/SRR861...,17593943,caa4df8f8b40e416f0fa6b7f2c9328ae,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/004/SRR8615034,HCC78_LUNG,HCC78_LUNG,HCC78,LUNG
2,SRR8615035,SAMN10989587,SRS4395285,SRX5414205,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,8058229;12501315,0912b268ba684b093da75cec18eea6d0;1112820fb4070...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/005/SRR861...,21029304,7d4124ffdd0d7dc14f3ce2a22ff4c1d2,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/005/SRR8615035,HOP62_LUNG,HOP62_LUNG,HOP62,LUNG
3,SRR8615042,SAMN10987804,SRS4395278,SRX5414198,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,13081808;17856519,89cd15c12b0e3f6d4aa8fa4d930c3768;ec17e81d32c65...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/002/SRR861...,30894571,984e7fe6fbcbdbf30c5e3ae1dd57cd59,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/002/SRR8615042,NCIH810_LUNG,NCIH810_LUNG,NCIH810,LUNG
4,SRR8615047,SAMN10987678,SRS4395273,SRX5414193,PRJNA523380,9606,Homo sapiens,ILLUMINA,PAIRED,GENOMIC,...,10400525;13057832,40188cfc8cb992e831837517b08b7216;1d9cda042e114...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR861/007/SRR861...,23711511,648268ec2fe15e87a64f0c0478bf825c,ftp.sra.ebi.ac.uk/vol1/srr/SRR861/007/SRR8615047,NCIH322_LUNG,NCIH322_LUNG,NCIH322,LUNG


In [10]:
# sra_df already has a 'Cell_line' column, but its formatting (e.g. 'HUP-T3',
# 'Hs 578T') differs from the stripped form used in 'Sample Name' (e.g.
# 'HUPT3_PANCREAS'), so derive the cell line by splitting 'Sample Name' on the
# FIRST underscore only (n=1).
#
# `n` and `expand` are passed by keyword: passing `n` positionally triggers a
# FutureWarning on pandas 1.x and raises a TypeError on pandas >= 2.0.
#
# NOTE(review): the column name 'TISSE' looks like a typo for 'TISSUE' (ena_df
# uses 'TISSUE'). It is kept unchanged here because other cells may reference
# it -- confirm before renaming.
sra_df[['CELLLINE', 'TISSE']] = sra_df['Sample Name'].str.split('_', n=1, expand=True)
# Compare the original 'Cell_line' values with the derived columns side by side
sra_df[['Cell_line', 'CELLLINE', 'TISSE']]

  sra_df[['CELLLINE', 'TISSE']] = sra_df['Sample Name'].str.split('_', 1, expand=True)


Unnamed: 0,Cell_line,CELLLINE,TISSE
0,HUP-T3,HUPT3,PANCREAS
1,Hs 578T,HS578T,BREAST
2,Hs 604.T,HS604T,FIBROBLAST
3,HuH-6,HUH6,LIVER
4,HuH28,HUH28,BILIARY_TRACT
...,...,...,...
4545,DAN-G,DANG,PANCREAS
4546,DMS 114,DMS114,LUNG
4547,DMS 273,DMS273,LUNG
4548,QGP-1,QGP1,PANCREAS


In [15]:
# Preview the first rows of sra_df
sra_df.head()

Unnamed: 0,Run,Age,Assay Type,AssemblyName,AvgSpotLen,Bases,BIOMATERIAL_PROVIDER,BioProject,BioSample,BioSampleModel,...,ReleaseDate,Sample Name,Sample_type,sex,SRA Study,Tissue,create_date,version,CELLLINE,TISSE
0,SRR8633202,66.0,Bisulfite-Seq,GCA_000001405.13,29,1556265367,DSMZ:HUP-T3,PRJNA523380,SAMN10987811,Human,...,2019-03-27T00:00:00Z,HUPT3_PANCREAS,cell culture,male,SRP186687,pancreas,2019-02-27T12:24:00Z,1.0,HUPT3,PANCREAS
1,SRR8633203,74.0,Bisulfite-Seq,GCA_000001405.13,29,1727947890,ATCC:Hs 578T,PRJNA523380,SAMN10987893,Human,...,2019-03-27T00:00:00Z,HS578T_BREAST,cell culture,female,SRP186687,breast,2019-02-27T12:25:00Z,1.0,HS578T,BREAST
2,SRR8633204,21.0,Bisulfite-Seq,GCA_000001405.13,29,3430374736,ATCC:Hs 604.T,PRJNA523380,SAMN10987929,Human,...,2019-03-27T00:00:00Z,HS604T_FIBROBLAST,cell culture,male,SRP186687,haematopoietic_and_lymphoid_tissue,2019-02-27T12:31:00Z,1.0,HS604T,FIBROBLAST
3,SRR8633205,,Bisulfite-Seq,GCA_000001405.13,29,1359691158,RIKEN:HuH-6,PRJNA523380,SAMN10987713,Human,...,2019-03-27T00:00:00Z,HUH6_LIVER,cell culture,male,SRP186687,liver,2019-02-27T12:23:00Z,1.0,HUH6,LIVER
4,SRR8633206,,Bisulfite-Seq,GCA_000001405.13,29,2005765425,HSSRB,PRJNA523380,SAMN10987843,Human,...,2019-03-27T00:00:00Z,HUH28_BILIARY_TRACT,cell culture,,SRP186687,biliary_tract,2019-02-27T12:26:00Z,1.0,HUH28,BILIARY_TRACT


In [16]:
# Find the index label of the smallest 'sra_bytes' value in ena_df,
# then display that run's full metadata row.
min_sra_bytes_idx = ena_df['sra_bytes'].idxmin()
ena_df.loc[min_sra_bytes_idx]


run_accession                                                        SRR8615081
sample_accession                                                   SAMN10987705
secondary_sample_accession                                           SRS4395239
experiment_accession                                                 SRX5414159
study_accession                                                     PRJNA523380
tax_id                                                                     9606
scientific_name                                                    Homo sapiens
instrument_platform                                                    ILLUMINA
library_layout                                                           PAIRED
library_source                                                          GENOMIC
read_count                                                                10818
base_count                                                              3245400
first_public                            

In [18]:
# Look up 'sra_bytes' for run SRR8615788, selecting rows and column
# in a single .loc call instead of chained indexing.
is_target_run = ena_df['run_accession'] == 'SRR8615788'
ena_df.loc[is_target_run, 'sra_bytes']


314    8680933841
Name: sra_bytes, dtype: int64