In [9]:
import pandas as pd
import json
import gzip
import os
import numpy as np
import sys
# np.set_printoptions(threshold=sys.maxsize)

# Getting the mapping dataframe

In [10]:
df = pd.read_json('data/files-cases.json')
df.head()

Unnamed: 0,access,annotations,cases,data_category,data_format,file_name,file_size
0,open,,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",Transcriptome Profiling,TXT,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,257725
1,open,,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",Transcriptome Profiling,TXT,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,256496
2,open,,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",Transcriptome Profiling,TXT,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,252976
3,open,,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",Transcriptome Profiling,TXT,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,250749
4,open,,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",Transcriptome Profiling,TXT,aa623193-428e-41b8-b051-2d9693d852f8.htseq.cou...,259222


In [11]:
def extract_case_id(row):
    cases = row['cases']
    return cases[0]['case_id']
    

df['case_id'] = df.apply(extract_case_id, axis=1)

In [12]:
mapping_df = df[['file_name', 'case_id']]

### Here we have the mapping between patients and its files

In [13]:
mapping_df.head()

Unnamed: 0,file_name,case_id
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948
3,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0
4,aa623193-428e-41b8-b051-2d9693d852f8.htseq.cou...,6e126b73-d3e8-4641-a128-306f3b313e40


In [14]:
mapping_df.shape

(1164, 2)

In [15]:
len(set(mapping_df['case_id']))

1036

Although it has 1164 lines, it only says about 1036 patients, because some patients have multiple files

# Getting the clinical info


In [16]:
clinical = pd.read_json('data/clinical.json')
clinical.head()

Unnamed: 0,case_id,demographic,diagnoses,exposures
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,{'updated_datetime': '2019-04-28T14:06:27.1878...,"[{'year_of_diagnosis': 2010, 'classification_o...","[{'cigarettes_per_day': None, 'weight': None, ..."
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,{'updated_datetime': '2019-04-28T13:48:46.4035...,"[{'year_of_diagnosis': 1995, 'classification_o...","[{'cigarettes_per_day': None, 'weight': None, ..."
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,{'updated_datetime': '2019-04-28T13:49:33.3043...,"[{'year_of_diagnosis': 1994, 'classification_o...","[{'cigarettes_per_day': None, 'weight': None, ..."
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,{'updated_datetime': '2019-04-28T13:44:29.1433...,"[{'year_of_diagnosis': 1994, 'classification_o...","[{'cigarettes_per_day': None, 'weight': None, ..."
4,dddd8e2f-e540-418a-b02e-698d18a12c14,{'updated_datetime': '2019-04-28T13:41:37.6619...,"[{'year_of_diagnosis': 2011, 'classification_o...","[{'cigarettes_per_day': None, 'weight': None, ..."


In [17]:
len(clinical)

1036

In [18]:
len(clinical[clinical.isnull()])

1036

In [19]:
def get_nested(e, key):
    e_dict = e
    result = None
    if type(e) is dict:
        e_dict = np.array([e])
        
    try:
        result = e_dict[0][key]
    except KeyError:
        result = np.nan
        
    return result

columns = ['diagnoses', 'demographic', 'exposures']
for c in columns:
    column_dict = clinical[c][0]
    if type(column_dict) is dict:
        column_dict = np.array([column_dict])

    keys = list(column_dict[0].keys())
    for key in keys:
        clinical[key] = clinical[c].apply(get_nested, args=(key,))

    clinical = clinical.drop(columns=[c])

In [20]:
clinical.head()

Unnamed: 0,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,morphology,...,age_at_index,year_of_death,cigarettes_per_day,weight,alcohol_history,alcohol_intensity,bmi,years_smoked,exposure_id,height
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,2010.0,not reported,not reported,2019-04-28T14:06:27.187807-05:00,"Infiltrating duct carcinoma, NOS",TCGA-D8-A1JB_exposure,stage iib,19822.0,8500/3,...,54,,,,Not Reported,,,,7b540d88-61fe-57f9-9ef7-3a6f7cf16b03,
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,1995.0,not reported,not reported,2019-04-28T13:48:46.403564-05:00,"Lobular carcinoma, NOS",TCGA-B6-A0IE_exposure,stage iiia,13982.0,8520/3,...,38,2000.0,,,Not Reported,,,,eb7d7364-9c30-5eff-ac7f-7a892a09a697,
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,1994.0,not reported,not reported,2019-04-28T13:49:33.304314-05:00,Infiltrating duct and lobular carcinoma,TCGA-B6-A0RP_exposure,not reported,26941.0,8522/3,...,73,2002.0,,,Not Reported,,,,80d88a13-2338-5602-a74a-f4feba2d59c4,
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,1994.0,not reported,not reported,2019-04-28T13:44:29.143389-05:00,"Infiltrating duct carcinoma, NOS",TCGA-B6-A0X1_exposure,not reported,17624.0,8500/3,...,48,,,,Not Reported,,,,8bba7491-f249-50ec-9f98-8b654a270889,
4,dddd8e2f-e540-418a-b02e-698d18a12c14,2011.0,not reported,not reported,2019-04-28T13:41:37.661995-05:00,"Infiltrating duct carcinoma, NOS",TCGA-A7-A26H_exposure,stage iia,26423.0,8500/3,...,72,,,,Not Reported,,,,5e46a41a-dd57-59e0-89f8-c8dd97f23f43,


In [21]:
print(mapping_df.shape)
print(clinical.shape)

(1164, 2)
(1036, 43)


In [22]:
joined = pd.merge(mapping_df, clinical, how='inner', on="case_id")
joined.head()

Unnamed: 0,file_name,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,...,age_at_index,year_of_death,cigarettes_per_day,weight,alcohol_history,alcohol_intensity,bmi,years_smoked,exposure_id,height
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,2007.0,not reported,not reported,2019-04-28T13:50:16.849041-05:00,"Infiltrating duct carcinoma, NOS",TCGA-AR-A252_exposure,stage i,18611.0,...,50,,,,Not Reported,,,,f6e4e643-657e-57a3-9c66-4c8560d10395,
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,2010.0,not reported,not reported,2019-04-28T13:39:49.010685-05:00,"Lobular carcinoma, NOS",TCGA-AO-A1KS_exposure,stage iia,25230.0,...,69,,,,Not Reported,,,,9543b2f1-c629-530f-a639-720d6eca159f,
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,2011.0,not reported,not reported,2019-04-28T14:03:35.530385-05:00,"Infiltrating duct carcinoma, NOS",TCGA-E9-A1RF_exposure,stage iiia,25119.0,...,68,,,,Not Reported,,,,07f30986-78af-5047-bbec-b3c3fd62aab0,
3,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,2011.0,not reported,not reported,2019-04-28T14:03:35.530385-05:00,"Infiltrating duct carcinoma, NOS",TCGA-E9-A1RF_exposure,stage iiia,25119.0,...,68,,,,Not Reported,,,,07f30986-78af-5047-bbec-b3c3fd62aab0,
4,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0,2009.0,not reported,not reported,2019-04-28T13:44:01.412721-05:00,"Infiltrating duct carcinoma, NOS",TCGA-V7-A7HQ_exposure,stage iiia,27684.0,...,75,,,,Not Reported,,,,38ea3492-d404-5921-8616-8db392fc5cea,


In [23]:
joined.shape

(1164, 44)

Perfect, we got all the files

# Getting the actual files

In [24]:
rootdir = 'data/files'
extract_files = False
if extract_files:
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if not file.startswith('.') and file != 'MANIFEST.txt':
                os.rename(os.path.join(subdir, file), os.path.join(rootdir, file))

In [25]:
test = pd.read_csv('data/files/0a2c1866-d6e1-4add-bcb0-b297ba1394ac.htseq.counts', sep='\t', header=None, names=['gen', 'count'])
test['count'] = test['count'].astype('float')
gens = pd.pivot_table(test, values='count', columns=['gen'])


In [26]:
gens_dataframe = pd.DataFrame(columns=gens.columns)
for row in joined.iterrows():
    row = row[1]
    name = row['file_name']
    path_to_zip_file = os.path.join('files', name)
    with gzip.open(path_to_zip_file) as f:
        features = pd.read_csv(f, sep='\t', header=None, names=['gen', 'count'])
        features['count'] = features['count'].astype('float')
        pivot_df = pd.pivot_table(features, values='count', columns=['gen'])
        gens_dataframe = pd.concat([gens_dataframe, pivot_df], ignore_index=True)
gens_dataframe['file_name'] = joined['file_name']

In [27]:
gens_dataframe.head()

gen,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,...,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual,file_name
0,2608.0,289.0,1661.0,2081.0,684.0,1261.0,8544.0,4815.0,2879.0,4088.0,...,0.0,0.0,0.0,0.0,33323894.0,3232629.0,5808920.0,0.0,0.0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...
1,5789.0,34.0,2538.0,2572.0,1292.0,284.0,2934.0,5797.0,3401.0,5931.0,...,0.0,0.0,0.0,0.0,19183901.0,2673271.0,6146344.0,0.0,0.0,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...
2,4544.0,1881.0,1565.0,1356.0,294.0,1006.0,24121.0,3695.0,5097.0,2025.0,...,0.0,0.0,0.0,0.0,23874394.0,2400693.0,5278313.0,0.0,0.0,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...
3,676.0,41.0,3732.0,3155.0,1010.0,910.0,5382.0,3559.0,2968.0,3704.0,...,0.0,0.0,0.0,0.0,21737695.0,3123055.0,5182286.0,0.0,0.0,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...
4,2299.0,5.0,708.0,388.0,93.0,113.0,674.0,2219.0,871.0,923.0,...,0.0,0.0,0.0,0.0,20631886.0,2996831.0,2104393.0,0.0,0.0,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...


In [28]:
gens_dataframe.shape

(1164, 60489)

# Putting everything together

In [29]:
joined = pd.merge(joined, gens_dataframe, how='inner', on="file_name")
joined.head()

Unnamed: 0,file_name,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,2007.0,not reported,not reported,2019-04-28T13:50:16.849041-05:00,"Infiltrating duct carcinoma, NOS",TCGA-AR-A252_exposure,stage i,18611.0,...,0.0,0.0,0.0,0.0,0.0,33323894.0,3232629.0,5808920.0,0.0,0.0
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,2010.0,not reported,not reported,2019-04-28T13:39:49.010685-05:00,"Lobular carcinoma, NOS",TCGA-AO-A1KS_exposure,stage iia,25230.0,...,0.0,0.0,0.0,0.0,0.0,19183901.0,2673271.0,6146344.0,0.0,0.0
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,2011.0,not reported,not reported,2019-04-28T14:03:35.530385-05:00,"Infiltrating duct carcinoma, NOS",TCGA-E9-A1RF_exposure,stage iiia,25119.0,...,0.0,0.0,0.0,0.0,0.0,23874394.0,2400693.0,5278313.0,0.0,0.0
3,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,2011.0,not reported,not reported,2019-04-28T14:03:35.530385-05:00,"Infiltrating duct carcinoma, NOS",TCGA-E9-A1RF_exposure,stage iiia,25119.0,...,0.0,0.0,0.0,0.0,0.0,21737695.0,3123055.0,5182286.0,0.0,0.0
4,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0,2009.0,not reported,not reported,2019-04-28T13:44:01.412721-05:00,"Infiltrating duct carcinoma, NOS",TCGA-V7-A7HQ_exposure,stage iiia,27684.0,...,0.0,0.0,0.0,0.0,0.0,20631886.0,2996831.0,2104393.0,0.0,0.0


In [30]:
joined.shape

(1164, 60532)

# Persist it

In [31]:
joined.to_csv("data/gathered.csv")