Do not use the tree house version https://github.com/rcurrie/tumornormal/blob/master/ingest.ipynb

The only file you need to run other notesbooks is ../data/tcga_target_gtex.h5

you can delete the following files to save a little disk space

```
cd data
rm TcgaTargetGTEX_phenotype.txt.gz
rm TcgaTargetGtex_rsem_gene_tpm.gz 
rm TcgaTargetGtex_rsem_gene_tpm.h5
```

remove sample if category or target is missing

you might want drop additional gene expression features if there are a large number of log2(0.001) values. It means the original data had zero TPM

In [1]:
import pathlib as pl

import logging
from   setupLogging import setupLogging
configFilePath = setupLogging( default_path=pl.Path('../src/logging.ini.json'))
configFilePath = setupLogging( default_path=pl.Path('../src/logging.test.ini.json'))
logger = logging.getLogger("notebook")
logger.info("using logging configuration file:{}".format(configFilePath))

[INFO <ipython-input-1-27500482ad0e>:8 - <module>()] using logging configuration file:../src/logging.test.ini.json


In [2]:
import os
import requests
import numpy as np
import pandas as pd
import pathlib as pl
import h5py
import sys
from IPython.display import display, HTML


dataDir = pl.Path("../data")
if not dataDir.exists() :
    dataDir.mkdir()

In [None]:
see src/dataUtilities for python script version

## Down load TCGA-Target-GTex expression data

In [21]:
%%time
def downLoadGeneExpressionFromXenaHub(dataDir):
    '''
    will down load data if it is not on local disk . This can take a while
    
    arguments:
        dataDir:
            path to local data directory. type pathlib
            example: '../data'
            
    returns:
        rawDataPath, path to file on local system. type pathlib
    '''
    rawDataPath = dataDir.joinpath("TcgaTargetGtex_rsem_gene_tpm.gz")
    if not rawDataPath.exists():
        print("Downloading TCGA, TARGET and GTEX expression data from UCSC Xena")
        url = "https://toil.xenahubs.net/download/TcgaTargetGtex_rsem_gene_tpm.gz"
        print("url:{}".format(url))
        response = requests.get(
                url, 
                stream=True,
                # ERROR: cannot verify toil.xenahubs.net's certificate, issued by `/C=US/O=Amazon/OU=Server CA 1B/CN=Amazon':
                #verify=False 
                ) 
        response.raise_for_status()
        #totalLengthStr = response.headers.get('content-length')
        #totalLength = int(totalLengthStr)

        with open(rawDataPath, "wb") as f:
            dataLength = 0
            for chunk in response.iter_content(chunk_size=32768):
                dataLength += len(chunk)
                f.write(chunk)
                
    return rawDataPath

rawDataPath = downLoadGeneExpressionFromXenaHub(dataDir)
print(rawDataPath)

Downloading TCGA, TARGET and GTEX expression data from UCSC Xena
url:https://toil.xenahubs.net/download/TcgaTargetGtex_rsem_gene_tpm.gz
../data/TcgaTargetGtex_rsem_gene_tpm.gz
CPU times: user 3.25 s, sys: 2.99 s, total: 6.24 s
Wall time: 2min


In [4]:
%%time
def loadRawExpressionDF(dataDir, rawDataPath):
    '''
    This can be slow if h5 verision does not exist locally, will create data from gz
    
    arguments:
        dataDir:
            path to local data directory. type pathlib
            example: ../data/TcgaTargetGtex_rsem_gene_tpm.gz
            
    returns
        expressionDF: type pandas data frame
        The shape of the data frame matches the xena hub data
        E.G. geneExpression x samples
    '''    
    # AEDWIP load gene expresion pandas data frame
    pandasDataSourcePath = dataDir.joinpath("TcgaTargetGtex_rsem_gene_tpm.h5")
    print("sourceFile:{}".format(pandasDataSourcePath))
    if not pandasDataSourcePath.exists():
        print("Converting expression to dataframe and storing in hdf5 file")
        expressionDF = pd.read_csv(rawDataPath, sep="\t", index_col=0) \
                                    .astype(np.float32)
        
        expressionDF.to_hdf(pandasDataSourcePath, 
                                       "expression", mode="w", format="fixed")
            
    else : 
        expressionDF = pd.read_hdf(pandasDataSourcePath, "expression") 
    
    return expressionDF

expressionDF = loadRawExpressionDF(dataDir, rawDataPath)

sourceFile:../data/TcgaTargetGtex_rsem_gene_tpm.h5
[INFO utils.py:129 - _init_num_threads()] Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO utils.py:141 - _init_num_threads()] NumExpr defaulting to 8 threads.
CPU times: user 14.1 s, sys: 3.61 s, total: 17.7 s
Wall time: 17.7 s


In [5]:
# sanity check all values are supposed to be log2(TMP + 0.001)
result = (expressionDF.isnull().sum().sum()) 
assert (result == 0)

In [6]:
numGeneExpr, numSamples,  = expressionDF.shape
print("numGeneExpr:{}, numSamples:{} ".format(numGeneExpr, numSamples))

numGeneExpr:60498, numSamples:19131 


In [7]:
expressionDF.head(3)

Unnamed: 0_level_0,GTEX-S4Q7-0003-SM-3NM8M,TCGA-19-1787-01,TCGA-S9-A7J2-01,GTEX-QV31-1626-SM-2S1QC,TCGA-G3-A3CH-11,TCGA-B5-A5OE-01,GTEX-13QIC-0011-R1a-SM-5O9CJ,TCGA-B2-5641-11,GTEX-ZPCL-0126-SM-4WWC8,TARGET-20-PANGDN-09,...,TCGA-FI-A2EY-01,TCGA-55-6985-11,TCGA-EJ-5527-01,TCGA-G3-A25X-01,TCGA-24-2254-01,GTEX-11ZTS-3326-SM-5LU9Y,GTEX-VJYA-0726-SM-4KL1T,GTEX-ZA64-2126-SM-5Q5A8,GTEX-Q2AG-2826-SM-2HMJQ,GTEX-XV7Q-0426-SM-4BRVN
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000242268.2,-3.458,-9.9658,0.2998,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-4.035,-2.0529,-9.9658,-9.9658,-1.9379,1.5165,-9.9658,-2.3884,0.044,-3.3076
ENSG00000259041.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
ENSG00000270112.3,-3.6259,-3.816,-3.0469,-2.1779,-9.9658,-9.9658,-1.8314,-2.4659,-9.9658,-9.9658,...,-3.1714,-4.2934,-9.9658,-9.9658,-5.5735,-2.9324,-9.9658,-9.9658,2.5852,-3.0469


## Down load meta data

In [8]:
%%time
def downLoadPhenotypeFromXenaHub(dataDir):
    '''
    will down load data if it is not on local disk . This can take a while
    
    arguments:
        dataDir:
            path to local data directory. type pathlib
            example: '../data'
            
    returns:
        rawDataPath, path to file on local system. type pathlib
    '''
    rawDataPath = dataDir.joinpath("TcgaTargetGTEX_phenotype.txt.gz")
    if not rawDataPath.exists():
        url = "https://toil.xenahubs.net/download/TcgaTargetGTEX_phenotype.txt.gz"
        print("Downloading {}".format(url))
        with open(rawDataPath, "wb") as f:
            f.write(requests.get(url).content)
                
    return rawDataPath

phenoTypePath = downLoadPhenotypeFromXenaHub(dataDir)
print(phenoTypePath)

../data/TcgaTargetGTEX_phenotype.txt.gz
CPU times: user 206 µs, sys: 90 µs, total: 296 µs
Wall time: 238 µs


In [9]:
%%time
def loadRawPhenotypeDF(dataDir, rawDataPath):
    '''
    This can be slow if h5 verision does not exist locally, will create data from gz
    
    arguments:
        dataDir:
            path to local data directory. type pathlib
            example: .../data/TcgaTargetGTEX_phenotype.txt.gz
    '''   
    
    # rename original column names
    # Index(['detailed_category', 'primary disease or tissue', '_primary_site',
    #  '_sample_type', '_gender', '_study'],
    # dtype='object'
    
    colNames = ["id", "category", "disease", "primary_site", "sample_type", "gender", "study"]
    phenotypeDF = pd.read_table(
                        rawDataPath, compression="gzip", 
                        header=0, 
                        names=colNames,
                        sep="\t", encoding="ISO-8859-1", index_col=0, 
                        dtype="str").sort_index(axis="index")
    
    # Compute and add a tumor/normal column - TCGA and TARGET have some normal samples, GTEX is all normal.
    phenotypeDF["tumor_normal"] = phenotypeDF.apply(
        lambda row: "Normal" if row["sample_type"] in ["Cell Line", "Normal Tissue", "Solid Tissue Normal"]
    else "Tumor", axis=1)  
    
    # debug
    # for colName in phenoTypeDF.columns:
        # print("\n:{}".format(colName))
        # print( phenoTypeDF.loc[:,[colName]].isnull().sum() )
    
    return phenotypeDF

  
phenoTypeDF = loadRawPhenotypeDF(dataDir, phenoTypePath)

CPU times: user 244 ms, sys: 5.21 ms, total: 249 ms
Wall time: 250 ms


In [10]:
print(phenoTypeDF.columns)

Index(['category', 'disease', 'primary_site', 'sample_type', 'gender', 'study',
       'tumor_normal'],
      dtype='object')


In [11]:
phenoTypeDF.head()

Unnamed: 0_level_0,category,disease,primary_site,sample_type,gender,study,tumor_normal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose Tissue,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0426-SM-5EGHI,Muscle - Skeletal,Muscle - Skeletal,Muscle,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0526-SM-5EGHJ,Artery - Tibial,Artery - Tibial,Blood Vessel,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0626-SM-5N9CS,Artery - Coronary,Artery - Coronary,Blood Vessel,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0726-SM-5GIEN,Heart - Atrial Appendage,Heart - Atrial Appendage,Heart,Normal Tissue,Female,GTEX,Normal


In [12]:
for colName in phenoTypeDF.columns:
    print("\n:{}".format(colName))
    print( phenoTypeDF.loc[:,[colName]].isnull().sum() )
    #print(phenoTypeDF.isnull().sum().sum())


:category
category    1
dtype: int64

:disease
disease    1
dtype: int64

:primary_site
primary_site    5
dtype: int64

:sample_type
sample_type    0
dtype: int64

:gender
gender    159
dtype: int64

:study
study    0
dtype: int64

:tumor_normal
tumor_normal    0
dtype: int64


In [13]:
def dropRowsIfMissingMetadata(geneExprDF, phenoDF): 
    '''
    drops rows from the data frames where 'category','disease' is null
    Note, drop is not done 'inplace'
    
    arguments
        geneExprDF, phenoDF:
            pandas data frames. Assume the rows are samples
            geneExprDF has gene expresion data
            phenoDF has meta data. E.G. labels
            
    returns (cleanGeneExprDF, cleanPhenoDF [null row idx])
        bad row idx are integer
    '''

    tmpDF = phenoDF.loc[:, ['category','disease']]
    # https://stackoverflow.com/a/14033137
    nullRowIdx = pd.isnull(tmpDF).any(1).to_numpy().nonzero()[0]
    
    cleanGeneExprDF = geneExprDF.drop( geneExprDF.index[ nullRowIdx ], inplace=False )
    cleanPhenoDF    =    phenoDF.drop( phenoDF.index[ nullRowIdx ],    inplace=False )
    
    return (cleanGeneExprDF, cleanPhenoDF, nullRowIdx)

In [14]:
def testDropRowsIfMissingMetadata():
    v = np.reshape( np.array( np.arange(1,18 + 1)), (6,3) )
    testExpDF = pd.DataFrame(v
                       ,columns=['a', 'b', 'c' ],
                       index=['m', 'n', 'o', 'p', 'q', 'r' ]
                      )
    #print(testExpDF)
    display(HTML(testExpDF.to_html()))


    # testPheDF = pd.DataFrame(aedwp,
    #                    columns=['category', 'disease', 'foo' ],
    #                    index=['m', 'n', 'o', 'p', ]
    #                   )

    testPheDF = phenoTypeDF.iloc[0:6, [0, 1, 2] ].copy(deep=True)
    # display(HTML(testPheDF.to_html()))

    testPheDF.iloc[[1,3], 0] = np.NaN
    testPheDF.iloc[[2], 1] = np.NaN
    display(HTML(testPheDF.to_html()))

    print('\n\n\n\n ********** BEGIN test ******')
    cleanGeneExprDF, cleanPhenoDF, nullRowIdx = dropRowsIfMissingMetadata(testExpDF, testPheDF)
    display(HTML(cleanGeneExprDF.to_html()))
    display(HTML(cleanPhenoDF.to_html()))
    print('nullRowIdx:{}'.format(nullRowIdx))
    
# testDropRowsIfMissingMetadata()    

In [15]:
# we transpose the expression data so that it is in a machine learning
# friendly shape. 
numGeneExpr, numSamples,= expressionDF.shape
print("before transpose numGeneExpr:{}, numSamples:{} ".format(numGeneExpr, numSamples))

expressionDF = expressionDF.transpose()
numGeneExpr, numSamples, = expressionDF.shape
print("after  transpose numGeneExpr:{}, numSamples:{} ".format(numGeneExpr, numSamples))

print()
print("phenoTypeDF.shape before drop:{}".format(phenoTypeDF.shape))
expressionDF, phenoTypeDF, nullRowIdx = dropRowsIfMissingMetadata(expressionDF, phenoTypeDF)
numGeneExpr, numSamples, = expressionDF.shape
print("expressionDF after drop numGeneExpr:{}, numSamples:{} ".format(numGeneExpr, numSamples))
print("phenoTypeDF.shape after   drop:{}".format(phenoTypeDF.shape))
print("nullRowIdx:{}".format(nullRowIdx))

before transpose numGeneExpr:60498, numSamples:19131 
after  transpose numGeneExpr:19131, numSamples:60498 

phenoTypeDF.shape before drop:(19131, 7)
expressionDF after drop numGeneExpr:19130, numSamples:60498 
phenoTypeDF.shape after   drop:(19130, 7)
nullRowIdx:[8727]


# Export

In [16]:
def saveTCGA_Target_GtextDataFrames(dataDir, exprDF, labelDF):
    '''
    export data in h5 file format. exprDF will be transposed before write
    arguments:
        dataDir:
            directory on local system to write file to
            type: Path
            example "../data"
            
        exprDF, labelDF
            type: pandas data frames
            
    returns:
        tcgaTargetGtexPath:
            type: Path
    '''
    tcgaTargetGtexPath = dataDir.joinpath("tcga_target_gtex.h5")

    with pd.HDFStore(tcgaTargetGtexPath, "w") as store:
        store["expression"] = exprDF.sort_index(axis="columns")
        store["labels"] = labelDF.astype(str)

    return tcgaTargetGtexPath

tcgaTargetGtexPath = saveTCGA_Target_GtextDataFrames(dataDir, expressionDF, phenoTypeDF)
print(tcgaTargetGtexPath)

../data/tcga_target_gtex.h5


## Test end to end load

In [25]:
def loadTCGA_Target_GTex(dataDir):
    '''
    Download gene expression and clinical data from the UCSC Xena Toil re-compute 
    dataset wrangle, and store in an hdf5 file for quick loading machine learning. 
    This dataset comprises gene expression data for twenty thousand tumor and 
    normal samples processed using the exact same genomics pipeline and therefore 
    can be compared to each other.

    Each of the source data set consists of a float vector, log2(TPM+0.001) 

    samples missing either the 'category' or 'disease'labels are dropped. 
    There should not be an NaN values

    The first time you run this function will be slow. On a fast machine it may take
    20 min. It will down load data files from xena and run pipe like 
    
    The only file you need locally is tcga_target_gtex.h5
    
    arguments:
        dataDir: a Path object. E.G ../data
        
    returns:XDF, YDF objects of type pandas DataFrame
        XDF: 
            numberOfSamples = 19,130
            number of gene expression features = 60,498
            
        YDF:
            numberOfSamples = 19,130
            number of label features = 7
        

    you can delete the following files to save a little disk space

    ```
    cd data
    rm TcgaTargetGTEX_phenotype.txt.gz
    rm TcgaTargetGtex_rsem_gene_tpm.gz 
    rm TcgaTargetGtex_rsem_gene_tpm.h5
    ```

    TODO: 
    explore gene expression values. 
    if a features has a large number of log2(0.001) values it means the original 
    data had zero TPM
    
   refferences:
        - https://xenabrowser.net/datapages/?host=https://toil.xenahubs.net
        - https://github.com/rcurrie/tumornormal/blob/master/ingest.ipynb
    '''
    
    tcgaTargetGtexPath = dataDir.joinpath("tcga_target_gtex.h5")
    if tcgaTargetGtexPath.exists() :
        XDF = pd.read_hdf(tcgaTargetGtexPath, "expression")
        YDF = pd.read_hdf(tcgaTargetGtexPath, "labels")
        
    else:
        rawDataPath = downLoadGeneExpressionFromXenaHub(dataDir)
        print(rawDataPath)

        XDF= loadRawExpressionDF(dataDir, rawDataPath)

        phenoTypePath = downLoadPhenotypeFromXenaHub(dataDir)
        print(phenoTypePath)

        YDF = loadRawPhenotypeDF(dataDir, phenoTypePath)
        
        # shape must match Y, we also want in a ML friend format
        # i.e. rows are samples
        XDF = XDF.transpose()
        
        XDF, YDF, nullRowIdx = dropRowsIfMissingMetadata(XDF, YDF)
        print("nullRowIdx:{}".format(nullRowIdx))

        tcgaTargetGtexPath = saveTCGA_Target_GtextDataFrames(dataDir, XDF, YDF)
        print(tcgaTargetGtexPath)
        
    return(XDF, YDF)

In [26]:
%%time
XDF, YDF = loadTCGA_Target_GTex(dataDir)
numGeneExpr, numSamples, = XDF.shape
print("XDF after drop numGeneExpr:{}, numSamples:{} ".format(numGeneExpr, numSamples))
print("YDF.shape after   drop:{}".format(YDF.shape))

Downloading TCGA, TARGET and GTEX expression data from UCSC Xena
url:https://toil.xenahubs.net/download/TcgaTargetGtex_rsem_gene_tpm.gz
../data/TcgaTargetGtex_rsem_gene_tpm.gz
sourceFile:../data/TcgaTargetGtex_rsem_gene_tpm.h5
Converting expression to dataframe and storing in hdf5 file
Downloading https://toil.xenahubs.net/download/TcgaTargetGTEX_phenotype.txt.gz
../data/TcgaTargetGTEX_phenotype.txt.gz
nullRowIdx:[8727]
../data/tcga_target_gtex.h5
XDF after drop numGeneExpr:19130, numSamples:60498 
YDF.shape after   drop:(19130, 7)
CPU times: user 11min 50s, sys: 50.8 s, total: 12min 41s
Wall time: 14min 35s
