## Load and explore the DREAM7 datasets

In [7]:
import pandas as pd


# Drug response matrices. Values are -log10(GI50), where GI50 is the drug
# concentration needed to inhibit growth of the cell line by 50%. Negating the
# log makes a higher value a better response (less drug needed): 6 means a
# 1 micromolar concentration is needed, 5 means 10 micromolar, and so on.
drug_response_full = pd.read_csv(
    'drug_response_full_matrix.csv',
    index_col=0,
)
drug_response_train = pd.read_csv(
    'drug_response_train_matrix.csv',
    index_col=0,
)
drug_response_test = pd.read_csv(
    'drug_response_test_matrix.csv',
    index_col=0,
)
# The classification table is tab-separated, unlike the CSV matrices above.
drug_response_classification = pd.read_csv(
    'drug_response_classification.txt',
    sep='\t',
    index_col=0,
)

# Report (rows, columns) of every table as a quick sanity check after loading.
print("Data shapes:")
for shape_label, shape_df in (
    ('Full drug response data shape:', drug_response_full),
    ('Training drug response data shape:', drug_response_train),
    ('Test drug response data shape:', drug_response_test),
    ('Classification drug response data shape:', drug_response_classification),
):
    print(shape_label, shape_df.shape)


# Molecular data. Per the dataset notes, every gene-level table has gene names
# as rows and cell lines as columns. All of these files are tab-separated with
# the row labels in the first column.

# --- Gene expression and other gene-level data ---

# Gene expression comes in two flavours: RNA-seq (a quantification table plus
# a table of "expressed" calls) and microarrays.
RNA_seq_quant = pd.read_csv(
    'RNAseq_quantification.txt',
    sep='\t',
    index_col=0,
)
RNA_seq_calls = pd.read_csv(
    'RNAseq_expressed_calls.txt',
    sep='\t',
    index_col=0,
)
mRNA = pd.read_csv(
    'GeneExpression.txt',
    sep='\t',
    index_col=0,
)

# Gene-level copy number variation.
copy_number = pd.read_csv(
    'SNP6_gene_level.txt',
    sep='\t',
    index_col=0,
)

# Methylation.
methylation = pd.read_csv(
    'Methylation.txt',
    sep='\t',
    index_col=0,
)

# Mutations in genes, from exome sequencing.
exome_seq = pd.read_csv(
    'ExomeSeq.txt',
    sep='\t',
    index_col=0,
)

# --- Protein-level data ---

# Protein expression measured with Reverse Phase Protein Arrays (RPPA).
rppa = pd.read_csv(
    'RPPA.txt',
    sep='\t',
    index_col=0,
)

# Report (rows, columns) of every molecular table loaded above.
print("Data shapes:")
for shape_label, shape_df in (
    ('RNA-seq quantification data shape:', RNA_seq_quant),
    ('RNA-seq expressed calls data shape:', RNA_seq_calls),
    ('mRNA gene expression data shape:', mRNA),
    ('Copy number variation data shape:', copy_number),
    ('Methylation data shape:', methylation),
    ('Exome sequencing data shape:', exome_seq),
    ('RPPA protein expression data shape:', rppa),
):
    print(shape_label, shape_df.shape)

# Annotation table, created in 2025 by annotating the MSigDB hallmark gene
# sets onto the genes present in this data. Unlike the files above it is a
# comma-separated file.
msig_hallmarks = pd.read_csv(
    'genes_msig_hallmarks_annotation.csv',
    index_col=0,
)
print('MSigDB hallmarks annotation data shape:', msig_hallmarks.shape)

Data shapes:
Full drug response data shape: (53, 28)
Training drug response data shape: (35, 28)
Test drug response data shape: (18, 28)
Classification drug response data shape: (53, 28)
Data shapes:
RNA-seq quantification data shape: (36953, 45)
RNA-seq expressed calls data shape: (36953, 45)
mRNA gene expression data shape: (18632, 46)
Copy number variation data shape: (27234, 48)
Methylation data shape: (27551, 44)
Exome sequencing data shape: (33160, 26)
RPPA protein expression data shape: (131, 43)
MSigDB hallmarks annotation data shape: (42995, 50)
