In [143]:
import pandas as pd
import json
import gzip
import os
import numpy 
import sys
# Raise numpy's summarisation threshold to the largest possible value so that
# arrays are printed in full instead of being abbreviated with "...".
numpy.set_printoptions(threshold=sys.maxsize)

# Getting the mapping json

In [51]:
# Load the file-level metadata from 'files-cases.json' and preview the first rows.
df = pd.read_json('files-cases.json')
df.head()

Unnamed: 0,file_name,data_format,access,data_category,file_size,cases,annotations
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,TXT,open,Transcriptome Profiling,257725,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,TXT,open,Transcriptome Profiling,256496,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,TXT,open,Transcriptome Profiling,252976,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",
3,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,TXT,open,Transcriptome Profiling,250749,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",
4,aa623193-428e-41b8-b051-2d9693d852f8.htseq.cou...,TXT,open,Transcriptome Profiling,259222,"[{'project': {'project_id': 'TCGA-BRCA'}, 'cas...",


In [52]:
def extract_case_id(row):
    """Return the ``case_id`` of the first entry in ``row['cases']``.

    ``row`` is any object indexable by ``'cases'`` whose value is a sequence
    of mappings. Only the first entry is consulted; further entries are
    ignored.
    """
    first_case = row['cases'][0]
    return first_case['case_id']
    

# Run the extractor once per row (axis=1) and store the result as a new 'case_id' column.
df['case_id'] = df.apply(extract_case_id, axis=1)

In [54]:
# Keep only the two columns that link a file name to its case.
mapping_df = df[['file_name', 'case_id']]

In [55]:
# Preview the file-to-case mapping.
mapping_df.head()

Unnamed: 0,file_name,case_id
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948
3,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0
4,aa623193-428e-41b8-b051-2d9693d852f8.htseq.cou...,6e126b73-d3e8-4641-a128-306f3b313e40


In [56]:
# (rows, columns) of the mapping: one row per file.
mapping_df.shape

(1164, 2)

In [57]:
# Number of distinct case ids referenced by the mapping.
len(set(mapping_df['case_id']))

1036

Although the mapping has 1164 rows, it covers only 1036 distinct patients, because some patients have more than one file.

# Getting the clinical info


In [58]:
# Read the tab-separated clinical table from 'clinical.tsv' and preview the first rows.
clinical = pd.read_csv('clinical.tsv', sep='\t')
clinical.head()

Unnamed: 0,case_id,submitter_id,project_id,gender,year_of_birth,race,days_to_birth,ethnicity,vital_status,days_to_death,...,treatment_effect,initial_disease_status,treatment_type,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,178b2c48-c07d-422e-ae17-8bcfd996ad51,TCGA-B6-A0X1,TCGA-BRCA,female,1946,white,-17624,not hispanic or latino,Dead,7455,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,yes
1,d071c16b-7cee-45ed-8ec9-612418143815,TCGA-A2-A0CS,TCGA-BRCA,female,1931,white,-26817,not hispanic or latino,Dead,2348,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,yes
2,9ddc3e7b-8b54-4a83-8335-8053940f56c1,TCGA-BH-A18L,TCGA-BRCA,female,1953,white,-18519,not hispanic or latino,Dead,811,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,not reported
3,30ec8b1f-28c4-4f46-8a1b-a8d51e558c7d,TCGA-BH-A18T,TCGA-BRCA,female,1933,white,-25674,not hispanic or latino,Dead,224,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,not reported
4,7dcf550c-90ce-4f63-aecd-0e46897e2a3e,TCGA-AC-A23H,TCGA-BRCA,female,1919,white,-32872,not hispanic or latino,Dead,0,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,no


In [59]:
# Inspect the Python type of the first case_id value; case_id is the key used
# for the merge with the mapping further down.
type(clinical['case_id'][0])

str

In [60]:
# Row/column counts of the mapping and the clinical table, one per line.
print(mapping_df.shape, clinical.shape, sep='\n')

(1164, 2)
(1036, 42)


In [144]:
# List every column available in the clinical table.
clinical.columns

Index(['case_id', 'submitter_id', 'project_id', 'gender', 'year_of_birth',
       'race', 'days_to_birth', 'ethnicity', 'vital_status', 'days_to_death',
       'age_at_index', 'year_of_death', 'year_of_diagnosis',
       'classification_of_tumor', 'last_known_disease_status',
       'primary_diagnosis', 'tumor_stage', 'age_at_diagnosis', 'morphology',
       'days_to_last_known_disease_status', 'prior_treatment',
       'days_to_recurrence', 'tumor_grade', 'icd_10_code', 'days_to_diagnosis',
       'tissue_or_organ_of_origin', 'progression_or_recurrence',
       'prior_malignancy', 'synchronous_malignancy',
       'site_of_resection_or_biopsy', 'days_to_last_follow_up',
       'days_to_treatment_start', 'treatment_effect', 'initial_disease_status',
       'treatment_type', 'therapeutic_agents', 'regimen_or_line_of_therapy',
       'treatment_intent_type', 'treatment_anatomic_site', 'treatment_outcome',
       'days_to_treatment_end', 'treatment_or_therapy'],
      dtype='object')

In [61]:
# Attach the clinical record of each case to every file that belongs to it
# (inner join on case_id), then preview the result.
joined = mapping_df.merge(clinical, on="case_id", how='inner')
joined.head()

Unnamed: 0,file_name,case_id,submitter_id,project_id,gender,year_of_birth,race,days_to_birth,ethnicity,vital_status,...,treatment_effect,initial_disease_status,treatment_type,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,TCGA-AR-A252,TCGA-BRCA,female,1957,white,-18611,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,TCGA-AO-A1KS,TCGA-BRCA,female,1941,white,-25230,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
3,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
4,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0,TCGA-V7-A7HQ,TCGA-BRCA,female,1934,black or african american,-27684,not hispanic or latino,Alive,...,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,yes,--


In [62]:
# The row count should equal the number of rows in the mapping if every file matched a case.
joined.shape

(1164, 43)

Perfect: every file name was matched to a clinical record.

# Getting the actual files

In [76]:
def flatten_downloads(root):
    """Move every data file found in a sub-folder of ``root`` up into ``root``.

    Hidden files (names starting with '.') and 'MANIFEST.txt' are left where
    they are. Files that already sit directly in ``root`` are not touched.
    A file is never moved onto an existing target: files with the same name
    in several sub-folders (for example 'annotations.txt') would otherwise
    silently overwrite one another, so later duplicates stay in place and a
    notice is printed instead.

    For every file that is moved, the source path and the target path are
    printed on two consecutive lines.
    """
    root_abs = os.path.abspath(root)
    for subdir, dirs, files in os.walk(root):
        if os.path.abspath(subdir) == root_abs:
            # Top-level files are already where they should be.
            continue
        for file in files:
            if file.startswith('.') or file == 'MANIFEST.txt':
                continue
            source = os.path.join(subdir, file)
            target = os.path.join(root, file)
            if os.path.exists(target):
                print('skipped, target already exists:', source)
                continue
            print(source)
            print(target)
            os.rename(source, target)


rootdir = 'files'
# One-off step: set to True to flatten the per-download folders under `rootdir`.
extract_files = False
if extract_files:
    flatten_downloads(rootdir)

files/00156c51-2686-4736-88c1-c30c81ec0a12/2807d45f-9e15-4e06-8336-de7295fc7509.htseq.counts.gz
files/2807d45f-9e15-4e06-8336-de7295fc7509.htseq.counts.gz
files/001fe63b-06c8-4ca1-861a-3d1dd0fdf5fc/b4474f7c-f371-4ecb-8c2f-cd7b5d5d243b.htseq.counts.gz
files/b4474f7c-f371-4ecb-8c2f-cd7b5d5d243b.htseq.counts.gz
files/002c6f93-606c-4d4a-855a-aaf52b035c29/6d27be54-6210-431e-b460-247587c5ab8d.htseq.counts.gz
files/6d27be54-6210-431e-b460-247587c5ab8d.htseq.counts.gz
files/002c7ead-0e16-4a3f-ae29-5ffcfdcd2e3a/c45fe186-f0ba-4017-a8f6-a511c1b000eb.htseq.counts.gz
files/c45fe186-f0ba-4017-a8f6-a511c1b000eb.htseq.counts.gz
files/00331c0a-23f0-4724-9c0f-5e37f971f26f/b2a6c9e3-65eb-43bd-849a-5829007379ab.htseq.counts.gz
files/b2a6c9e3-65eb-43bd-849a-5829007379ab.htseq.counts.gz
files/00af6745-18ba-4c18-9b9e-95e79d5915b8/02759563-fa2b-4d96-ab78-dd7fee7513d4.htseq.counts.gz
files/02759563-fa2b-4d96-ab78-dd7fee7513d4.htseq.counts.gz
files/00c05634-5eb9-4555-885c-4b24cfc0c8fa/d8fea32d-3ef0-4d7b-8b3e-761

files/1129869a-70ba-476c-a162-96a13f94df3e/0827bd64-de4e-45bc-9e75-a9bdce41f4a5.htseq.counts.gz
files/0827bd64-de4e-45bc-9e75-a9bdce41f4a5.htseq.counts.gz
files/1145f14e-29f4-4bc5-a830-818a45260f01/9ad28472-32f0-4cb5-ba51-17d3e75fe85d.htseq.counts.gz
files/9ad28472-32f0-4cb5-ba51-17d3e75fe85d.htseq.counts.gz
files/114f9d06-c1f3-43ca-9188-d64269898250/1b70d7d9-1d4f-4912-820a-e7ccaa0c9038.htseq.counts.gz
files/1b70d7d9-1d4f-4912-820a-e7ccaa0c9038.htseq.counts.gz
files/11858517-5057-4a31-aea5-ac65be074443/01a1c77a-3f93-4561-a079-c771ac054f56.htseq.counts.gz
files/01a1c77a-3f93-4561-a079-c771ac054f56.htseq.counts.gz
files/11ca0a93-72ae-487f-aec5-3d56194e2851/04c61105-b954-48fe-bfea-3018dd304295.htseq.counts.gz
files/04c61105-b954-48fe-bfea-3018dd304295.htseq.counts.gz
files/1216fdbd-76aa-46a8-b5ee-5d1954677cd9/7487d66f-3ca5-4df9-ac9e-88f0af7838c3.htseq.counts.gz
files/7487d66f-3ca5-4df9-ac9e-88f0af7838c3.htseq.counts.gz
files/12f2970e-36d3-483d-9cd0-cc34df10701f/aab22bf4-8c7f-46b5-8170-844

files/3fe00aee-e683-4fd3-ab13-83a8a9d1cf0f/61d9afeb-0da7-4f13-973a-2c98e03038e4.htseq.counts.gz
files/61d9afeb-0da7-4f13-973a-2c98e03038e4.htseq.counts.gz
files/3ff20106-2954-4e42-9d50-480b78abf5ce/70089c01-2fbe-4501-bccf-747b40872d6a.htseq.counts.gz
files/70089c01-2fbe-4501-bccf-747b40872d6a.htseq.counts.gz
files/4004a263-47ce-47e7-bbe7-451e7dc2d8e5/f4bb4f4b-2e42-4098-9edf-042f1fffea0f.htseq.counts.gz
files/f4bb4f4b-2e42-4098-9edf-042f1fffea0f.htseq.counts.gz
files/404fe012-11ca-415c-8e6a-178adcbc0358/220c7593-ae84-4db4-9ce5-c6372fd696e3.htseq.counts.gz
files/220c7593-ae84-4db4-9ce5-c6372fd696e3.htseq.counts.gz
files/4055e254-f848-4439-a6fe-95160cbd3fa7/ff850c31-bb15-47a2-bd3d-2590ca013e68.htseq.counts.gz
files/ff850c31-bb15-47a2-bd3d-2590ca013e68.htseq.counts.gz
files/40b6f6f4-05f8-489d-bec7-2e567bd5776f/4c05f1ea-cd90-4a8c-b445-fb60850144d1.htseq.counts.gz
files/4c05f1ea-cd90-4a8c-b445-fb60850144d1.htseq.counts.gz
files/40bb5f8c-33ba-4ad2-871f-b5b1fb6f08f9/5d4c3e4c-0f1b-4e23-af3a-764

files/5a37624c-ffba-479d-a52c-495afeba642f/9350b57b-9f7b-43c6-a42b-9ddae857af0a.htseq.counts.gz
files/9350b57b-9f7b-43c6-a42b-9ddae857af0a.htseq.counts.gz
files/5a37bb9e-c242-465f-8331-b2d30fc10a22/8c573441-151e-40bc-bb2a-18d6ec2b4ee5.htseq.counts.gz
files/8c573441-151e-40bc-bb2a-18d6ec2b4ee5.htseq.counts.gz
files/5a440e74-1b65-4e07-8ab3-2ecc194e6d63/1be91780-e668-4261-bd65-ee03f2a5a1e7.htseq.counts.gz
files/1be91780-e668-4261-bd65-ee03f2a5a1e7.htseq.counts.gz
files/5a846035-824d-4fe1-a5dc-80554ae642b3/0404b823-fd3c-47dd-9861-0f9f01a02d2e.htseq.counts.gz
files/0404b823-fd3c-47dd-9861-0f9f01a02d2e.htseq.counts.gz
files/5ad374d9-b544-4ea7-81b7-289b5e0c9a43/e9dd15eb-0841-4c8a-a6f1-3285cb1703dc.htseq.counts.gz
files/e9dd15eb-0841-4c8a-a6f1-3285cb1703dc.htseq.counts.gz
files/5bb779ce-ad42-4e24-956c-7dc46bb03b74/87e68da8-743f-40f7-8894-fb66e474bb61.htseq.counts.gz
files/87e68da8-743f-40f7-8894-fb66e474bb61.htseq.counts.gz
files/5c38a2c0-e614-401d-b434-b9c627d4185c/9410727e-c872-4d2b-bba9-1f6

files/7d9ca5c9-ab27-4d36-8980-b2d803f23e62/0733fd68-9cd5-440a-a849-fb14ef76c00d.htseq.counts.gz
files/0733fd68-9cd5-440a-a849-fb14ef76c00d.htseq.counts.gz
files/7da031d1-04f3-407b-af7b-61bf4edd38e7/9f0897a4-db0c-43f8-9d75-8debe9b6c847.htseq.counts.gz
files/9f0897a4-db0c-43f8-9d75-8debe9b6c847.htseq.counts.gz
files/7da031d1-04f3-407b-af7b-61bf4edd38e7/annotations.txt
files/annotations.txt
files/7da0998c-2cd9-4c22-bf95-0f78f4a0c6dc/dfed7532-df6b-4ae6-8a7c-68bd89affa78.htseq.counts.gz
files/dfed7532-df6b-4ae6-8a7c-68bd89affa78.htseq.counts.gz
files/7ded636b-0e6a-4119-8c65-fe0bd61c9a3f/b576af04-e163-4b7d-be4a-eff959d214dd.htseq.counts.gz
files/b576af04-e163-4b7d-be4a-eff959d214dd.htseq.counts.gz
files/7e143fc6-2e77-412a-80f5-0a85fa1e19d9/20ff57b5-eeb2-4312-a5e0-11ac8c8d573a.htseq.counts.gz
files/20ff57b5-eeb2-4312-a5e0-11ac8c8d573a.htseq.counts.gz
files/7e3320d7-92f3-492a-80fd-29410cc2d018/981c01e0-2914-46db-b3b1-906bda5d7382.htseq.counts.gz
files/981c01e0-2914-46db-b3b1-906bda5d7382.htseq

files/9712e8de-eee1-4495-b00e-c2bef627abd0/bdceb6c5-77d1-43e6-9046-1340502c2a60.htseq.counts.gz
files/bdceb6c5-77d1-43e6-9046-1340502c2a60.htseq.counts.gz
files/9817b10e-80a7-4137-a205-d119df0870f6/036e2fc1-c001-40d5-98c4-28bd121b9554.htseq.counts.gz
files/036e2fc1-c001-40d5-98c4-28bd121b9554.htseq.counts.gz
files/987bf9eb-5619-405e-a326-6de9a2554fcc/7133f91c-4128-4578-a1cd-2c1ee81b3f16.htseq.counts.gz
files/7133f91c-4128-4578-a1cd-2c1ee81b3f16.htseq.counts.gz
files/98c51e97-f7bb-4874-91c4-e59e47200c02/3ca8233e-434b-477c-8683-b1f770cedc63.htseq.counts.gz
files/3ca8233e-434b-477c-8683-b1f770cedc63.htseq.counts.gz
files/99131e7b-2028-4048-86df-65491768abb0/6c60605b-a647-481e-976d-f2945740faf8.htseq.counts.gz
files/6c60605b-a647-481e-976d-f2945740faf8.htseq.counts.gz
files/9947ce54-fc42-4f49-878d-00c1de395cd5/ecbe1717-b6b7-4359-b723-3bfc81c82f2d.htseq.counts.gz
files/ecbe1717-b6b7-4359-b723-3bfc81c82f2d.htseq.counts.gz
files/994eebf4-fbf8-4e42-86e7-850bf4bd8363/de704076-e915-4749-9729-e6e

files/a71e6799-233c-4fba-b4dd-0e98c9b0a3dc/cf7e72eb-b82f-44c8-894c-3995571ffe81.htseq.counts.gz
files/cf7e72eb-b82f-44c8-894c-3995571ffe81.htseq.counts.gz
files/a7667d1e-b3f1-48d3-bf37-7f34770b0110/34dd0e22-1014-4fe5-8392-faddf79b0ad2.htseq.counts.gz
files/34dd0e22-1014-4fe5-8392-faddf79b0ad2.htseq.counts.gz
files/a775df54-a60e-4fb9-b05d-19cde289c0fd/c0a8b0f3-7a09-4be4-ae22-a00df281b783.htseq.counts.gz
files/c0a8b0f3-7a09-4be4-ae22-a00df281b783.htseq.counts.gz
files/a7fd8e17-239e-4ec3-8467-3e39b932d313/0abe1897-c3a9-47e5-8a59-e5a2232b23db.htseq.counts.gz
files/0abe1897-c3a9-47e5-8a59-e5a2232b23db.htseq.counts.gz
files/a835fa65-af49-43d9-9282-6c8cb922a5cd/d578e27f-537c-4aaa-8903-6ffe68346276.htseq.counts.gz
files/d578e27f-537c-4aaa-8903-6ffe68346276.htseq.counts.gz
files/a850150c-8cf8-4a56-b8ef-f3598527923e/793d9283-e8c4-4548-8f2f-b57f94308b5c.htseq.counts.gz
files/793d9283-e8c4-4548-8f2f-b57f94308b5c.htseq.counts.gz
files/a85171d0-71e4-42b2-9d09-38eb164e9169/9c67b11d-1bac-446f-8bd7-ca3

files/bbc16d75-f438-4035-b2ab-bd8b16ba8e74/85a57663-89b0-4cc6-b118-98fce8640eb2.htseq.counts.gz
files/85a57663-89b0-4cc6-b118-98fce8640eb2.htseq.counts.gz
files/bbc16d75-f438-4035-b2ab-bd8b16ba8e74/annotations.txt
files/annotations.txt
files/bc03e189-ee0d-4fd7-b15c-aa66eb1e8bd3/f02c6de4-67f9-465e-bbc1-3d48ae377313.htseq.counts.gz
files/f02c6de4-67f9-465e-bbc1-3d48ae377313.htseq.counts.gz
files/bc3a905f-de97-4112-b7d1-45efdf6346a4/71c9e9d3-9276-468c-a3a0-053880a75511.htseq.counts.gz
files/71c9e9d3-9276-468c-a3a0-053880a75511.htseq.counts.gz
files/bc571e05-b9b7-4ef1-98db-c05fdb94aa80/6e0e0989-0b95-4f1f-8853-2da52fd6a6fd.htseq.counts.gz
files/6e0e0989-0b95-4f1f-8853-2da52fd6a6fd.htseq.counts.gz
files/bc68952d-15cd-4f60-a425-b795ca5a9de8/57b2e9d8-ddae-4457-9959-0d911288e8dc.htseq.counts.gz
files/57b2e9d8-ddae-4457-9959-0d911288e8dc.htseq.counts.gz
files/bc84e10d-7cbe-49e1-b89a-74c3c4ecbac7/9b6df35c-cc33-4935-b321-537e0fd29f37.htseq.counts.gz
files/9b6df35c-cc33-4935-b321-537e0fd29f37.htseq

files/c8ac0300-4ea5-4315-a8e8-aa69552ce03c/bd556315-a050-465f-9a41-bdebe7e3eb61.htseq.counts.gz
files/bd556315-a050-465f-9a41-bdebe7e3eb61.htseq.counts.gz
files/c8ed8155-7625-47fc-84d2-69ff79501fbb/cbfa8434-6acf-4122-ab58-991efcc6aa6b.htseq.counts.gz
files/cbfa8434-6acf-4122-ab58-991efcc6aa6b.htseq.counts.gz
files/c97b954e-b066-4983-bd3a-3b45955bce7e/3846d74d-423d-43c2-a33d-168456d67478.htseq.counts.gz
files/3846d74d-423d-43c2-a33d-168456d67478.htseq.counts.gz
files/ca17d4a2-2e8a-4f60-9883-7ad7196f2878/f8e707b4-a897-4f17-a605-30d5be78ae93.htseq.counts.gz
files/f8e707b4-a897-4f17-a605-30d5be78ae93.htseq.counts.gz
files/ca2da18b-7519-457a-9e3b-4e4732744669/7a494c60-48a3-486a-83c2-aefb4c160a2c.htseq.counts.gz
files/7a494c60-48a3-486a-83c2-aefb4c160a2c.htseq.counts.gz
files/ca628427-168b-4091-bee5-ae54ac79108a/a3b9677e-a154-41f1-857c-4cd5244b77c3.htseq.counts.gz
files/a3b9677e-a154-41f1-857c-4cd5244b77c3.htseq.counts.gz
files/caaf288e-7672-46af-bc68-302afa0c6478/774b7b7e-f349-495e-8972-cbc

files/dcdc821f-33dd-41fe-9c3b-50e4e66690df/b493310b-77e6-43fb-96cd-ab56d12de273.htseq.counts.gz
files/b493310b-77e6-43fb-96cd-ab56d12de273.htseq.counts.gz
files/dce06602-a4bc-4103-915a-c9475f157006/a8a58442-78f5-4876-b25e-c04339eb6f26.htseq.counts.gz
files/a8a58442-78f5-4876-b25e-c04339eb6f26.htseq.counts.gz
files/dce7b9c0-dd83-4d7f-868c-e4fe77d933e3/5d6adefb-ceda-4ef2-a6b9-8077fd618ae6.htseq.counts.gz
files/5d6adefb-ceda-4ef2-a6b9-8077fd618ae6.htseq.counts.gz
files/dcff2072-c1f4-4b0f-9cda-8a57ef192499/71c3f9c6-9226-4a8f-910a-3cee62dbeb3c.htseq.counts.gz
files/71c3f9c6-9226-4a8f-910a-3cee62dbeb3c.htseq.counts.gz
files/dd573821-1915-469c-ab02-16d50363771e/41263b0e-e2db-4ecf-8152-136364e1ad66.htseq.counts.gz
files/41263b0e-e2db-4ecf-8152-136364e1ad66.htseq.counts.gz
files/ddbd9626-593f-40ac-9246-f5291ebe1fac/79b3f166-d05a-4a27-ac51-31a6e36bd349.htseq.counts.gz
files/79b3f166-d05a-4a27-ac51-31a6e36bd349.htseq.counts.gz
files/ddc86ba3-edaa-40ad-8417-a513674bab77/dea7fd8b-f6c6-4208-861f-0da

files/e9522de5-8c32-45f4-b7ab-9ce895dd94bf/88ac1ce3-6538-4bf2-960c-80aebb70a148.htseq.counts.gz
files/88ac1ce3-6538-4bf2-960c-80aebb70a148.htseq.counts.gz
files/e972ee01-935f-4fff-b9cc-57acbe53876a/a60ed4b8-a041-4b38-b961-fc976b254702.htseq.counts.gz
files/a60ed4b8-a041-4b38-b961-fc976b254702.htseq.counts.gz
files/e986b467-e64a-4ce2-8b18-3ac29fc09278/efab89c4-921c-4b89-9bff-78c1cf10417d.htseq.counts.gz
files/efab89c4-921c-4b89-9bff-78c1cf10417d.htseq.counts.gz
files/e98c8870-af34-413d-8e20-1f81beed5e22/55ef3047-ebc1-4b33-be74-b94dcf38988f.htseq.counts.gz
files/55ef3047-ebc1-4b33-be74-b94dcf38988f.htseq.counts.gz
files/ea18cd7c-d67b-423e-a145-75522d686fe6/bf9e1674-9f2c-4014-808a-1f3f7544e9df.htseq.counts.gz
files/bf9e1674-9f2c-4014-808a-1f3f7544e9df.htseq.counts.gz
files/ea1cda6a-d54d-43d9-b0b9-e7b707f9b611/0e397288-82e4-48c7-9332-2ce733befdae.htseq.counts.gz
files/0e397288-82e4-48c7-9332-2ce733befdae.htseq.counts.gz
files/ea3aa827-70f9-4455-bed4-d699ed97517a/edc3bfaa-aad7-44d5-8aa0-bb5

In [88]:
# Load a single count file as a sample: two tab-separated columns without a
# header row, named 'gen' and 'count' here; counts are cast to float.
# NOTE(review): this path has no '.gz' suffix although the files moved into
# 'files/' are gzip archives - presumably this one was decompressed by hand;
# confirm it exists on a fresh checkout.
# NOTE(review): the recorded output shows an extra 'index' column that this
# cell does not create, which suggests it was produced by a different kernel state.
test = pd.read_csv('files/0a2c1866-d6e1-4add-bcb0-b297ba1394ac.htseq.counts', sep='\t', header=None, names=['gen', 'count'])
test['count'] = test['count'].astype('float')
test.head()

Unnamed: 0,gen,count,index
0,ENSG00000000003.13,2569.0,0
1,ENSG00000000005.5,1.0,1
2,ENSG00000000419.11,3180.0,2
3,ENSG00000000457.12,3332.0,3
4,ENSG00000000460.15,1621.0,4


In [90]:
# Reshape the long (gen, count) sample into a single wide row with one column per gene id.
gens = test.pivot_table(columns=['gen'], values='count')
gens.head()

gen,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual
count,2569.0,1.0,3180.0,3332.0,1621.0,530.0,7282.0,3312.0,2642.0,3322.0,...,0.0,0.0,0.0,0.0,0.0,23748640.0,3368739.0,3069305.0,0.0,0.0


In [100]:
# Column labels of the pivoted sample; these serve as the template for the full table below.
gens.columns

Index(['ENSG00000000003.13', 'ENSG00000000005.5', 'ENSG00000000419.11',
       'ENSG00000000457.12', 'ENSG00000000460.15', 'ENSG00000000938.11',
       'ENSG00000000971.14', 'ENSG00000001036.12', 'ENSG00000001084.9',
       'ENSG00000001167.13',
       ...
       'ENSGR0000275287.3', 'ENSGR0000276543.3', 'ENSGR0000277120.3',
       'ENSGR0000280767.1', 'ENSGR0000281849.1', '__alignment_not_unique',
       '__ambiguous', '__no_feature', '__not_aligned', '__too_low_aQual'],
      dtype='object', name='gen', length=60488)

In [131]:
# Build one row of gene counts per file listed in `joined`, in the same order
# as `joined`. Rows are collected first and concatenated once at the end:
# concatenating inside the loop re-copies the whole growing table on every
# iteration.
reference_columns = gens.columns
per_file_rows = []
for name in joined['file_name']:
    path_to_zip_file = os.path.join('files', name)
    with gzip.open(path_to_zip_file) as f:
        features = pd.read_csv(f, sep='\t', header=None, names=['gen', 'count'])
    features['count'] = features['count'].astype('float')
    # Long (gen, count) table -> one wide row with a column per gene id.
    pivot_df = pd.pivot_table(features, values='count', columns=['gen'])
    if pivot_df.columns.equals(reference_columns):
        # Share a single column Index between rows so the per-file copies of
        # the gene-id labels can be garbage-collected.
        pivot_df.columns = reference_columns
    per_file_rows.append(pivot_df)

# The empty, gene-labelled frame goes first so the column order follows the
# sample pivot and the result is still a valid (empty) table when no files are
# listed. It is created as float so it cannot influence the dtype of the result.
gens_dataframe = pd.concat(
    [pd.DataFrame(columns=reference_columns, dtype='float')] + per_file_rows,
    ignore_index=True,
)
# Rows were produced in the order of `joined`, so attach the names by position
# rather than relying on the two indexes happening to line up.
gens_dataframe['file_name'] = joined['file_name'].values

In [132]:
# Display the assembled gene-count table (one row per file).
gens_dataframe

gen,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,...,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual,file_name
0,2608.0,289.0,1661.0,2081.0,684.0,1261.0,8544.0,4815.0,2879.0,4088.0,...,0.0,0.0,0.0,0.0,33323894.0,3232629.0,5808920.0,0.0,0.0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...
1,5789.0,34.0,2538.0,2572.0,1292.0,284.0,2934.0,5797.0,3401.0,5931.0,...,0.0,0.0,0.0,0.0,19183901.0,2673271.0,6146344.0,0.0,0.0,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...
2,4544.0,1881.0,1565.0,1356.0,294.0,1006.0,24121.0,3695.0,5097.0,2025.0,...,0.0,0.0,0.0,0.0,23874394.0,2400693.0,5278313.0,0.0,0.0,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...
3,676.0,41.0,3732.0,3155.0,1010.0,910.0,5382.0,3559.0,2968.0,3704.0,...,0.0,0.0,0.0,0.0,21737695.0,3123055.0,5182286.0,0.0,0.0,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...
4,2299.0,5.0,708.0,388.0,93.0,113.0,674.0,2219.0,871.0,923.0,...,0.0,0.0,0.0,0.0,20631886.0,2996831.0,2104393.0,0.0,0.0,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...
5,2944.0,183.0,2030.0,2840.0,1122.0,1305.0,7630.0,4291.0,3811.0,3248.0,...,0.0,0.0,0.0,0.0,23302643.0,3570401.0,6205707.0,0.0,0.0,aa623193-428e-41b8-b051-2d9693d852f8.htseq.cou...
6,1586.0,4.0,2467.0,2239.0,1332.0,331.0,2633.0,4770.0,3756.0,2637.0,...,0.0,0.0,0.0,0.0,13243714.0,2591421.0,3949353.0,0.0,0.0,6fdbbe35-6392-4100-8f96-0e0d02257b73.htseq.cou...
7,3113.0,27.0,1608.0,2371.0,501.0,1242.0,3113.0,2617.0,1347.0,2523.0,...,0.0,0.0,0.0,0.0,14861668.0,2448076.0,3203807.0,0.0,0.0,e023c283-afcf-4334-b741-3dce8e98b0f6.htseq.cou...
8,1314.0,412.0,1845.0,2111.0,710.0,1143.0,1562.0,2433.0,1408.0,2194.0,...,0.0,0.0,0.0,0.0,15248373.0,3228695.0,3913335.0,0.0,0.0,3c631c4d-ec26-4f29-abcb-bed2221f3da5.htseq.cou...
9,1784.0,23.0,3378.0,2040.0,569.0,460.0,3357.0,2069.0,2986.0,1558.0,...,0.0,0.0,0.0,0.0,13945258.0,2470019.0,2746243.0,0.0,0.0,4e2d8faa-1d4a-41d6-ad45-a8206cfd9c90.htseq.cou...


In [133]:
# (files, gene-count columns + file_name).
gens_dataframe.shape

(1164, 60489)

In [134]:
# Re-check the clinical/file table before the gene counts are attached.
joined.head()

Unnamed: 0,file_name,case_id,submitter_id,project_id,gender,year_of_birth,race,days_to_birth,ethnicity,vital_status,...,treatment_effect,initial_disease_status,treatment_type,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,TCGA-AR-A252,TCGA-BRCA,female,1957,white,-18611,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,TCGA-AO-A1KS,TCGA-BRCA,female,1941,white,-25230,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
3,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,--,yes,--,--,--,--,--,--,--,--
4,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0,TCGA-V7-A7HQ,TCGA-BRCA,female,1934,black or african american,-27684,not hispanic or latino,Alive,...,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,yes,--


# Putting everything together

In [135]:
# Attach the gene counts to the clinical/file table.
# `joined` is overwritten here, so only the columns it does not contain yet are
# merged in: re-running this cell then leaves `joined` unchanged instead of
# appending suffixed (_x/_y) duplicates of every gene column.
missing_columns = [col for col in gens_dataframe.columns if col not in joined.columns]
joined = pd.merge(
    joined,
    gens_dataframe[['file_name'] + missing_columns],
    how='inner',
    on="file_name",
    # Exactly one count row per file is expected; fail loudly rather than
    # silently multiplying rows if a file name is ever duplicated.
    validate='one_to_one',
)
joined.head()

Unnamed: 0,file_name,case_id,submitter_id,project_id,gender,year_of_birth,race,days_to_birth,ethnicity,vital_status,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,TCGA-AR-A252,TCGA-BRCA,female,1957,white,-18611,not hispanic or latino,Alive,...,0.0,0.0,0.0,0.0,0.0,33323894.0,3232629.0,5808920.0,0.0,0.0
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,TCGA-AO-A1KS,TCGA-BRCA,female,1941,white,-25230,not hispanic or latino,Alive,...,0.0,0.0,0.0,0.0,0.0,19183901.0,2673271.0,6146344.0,0.0,0.0
2,f5d3e683-9177-45fc-93e3-357bf7366ac4.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,0.0,0.0,0.0,0.0,0.0,23874394.0,2400693.0,5278313.0,0.0,0.0
3,94bedc10-62c2-4bd2-bad9-0ec08c9bf5e9.htseq.cou...,c694615c-b1c6-499c-8058-995633ebf948,TCGA-E9-A1RF,TCGA-BRCA,female,1943,white,-25119,not hispanic or latino,Alive,...,0.0,0.0,0.0,0.0,0.0,21737695.0,3123055.0,5182286.0,0.0,0.0
4,6999d309-8502-49ee-8d80-9d0bde00081f.htseq.cou...,1285eb55-415c-494a-aa58-936f0427cdd0,TCGA-V7-A7HQ,TCGA-BRCA,female,1934,black or african american,-27684,not hispanic or latino,Alive,...,0.0,0.0,0.0,0.0,0.0,20631886.0,2996831.0,2104393.0,0.0,0.0


In [136]:
# Final size: one row per file, clinical columns plus one column per gene id.
joined.shape

(1164, 60531)

In [146]:
# Frequency of each tumor_stage value across the merged rows.
# NOTE(review): the recorded output lists numeric values (e.g. 24647) next to
# the stage labels and reports 865 distinct values, which looks like
# misaligned columns in the clinical data - verify how clinical.tsv is parsed.
joined['tumor_stage'].value_counts()

stage iib     46
stage iia     46
stage iiia    29
stage i       20
stage iv      16
              ..
24647          1
14454          1
23011          1
19723          1
14638          1
Name: tumor_stage, Length: 865, dtype: int64

In [137]:
# Persist the merged table to 'output.csv'; with to_csv's defaults the
# DataFrame index is written as the first, unnamed column.
joined.to_csv("output.csv")