In [1]:
import os
import pandas as pd
import gzip
import numpy as np
import re
import math
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root folder of the project data, relative to the notebook.
data_dir = "../Data"

# Per-tissue containers; later cells fill them with tables keyed by name
# (e.g. "Samples", "FPKM").
normal = {}
tumor = {}

# Cancer-type codes matched against the "Tumor Type" column when the
# merged table is split per cancer type.
cancertypes = (
    "BLCA BRCA CESC CHOL COAD ESCA GBM HNSC KICH "
    "KIRC KIRP LIHC LUAD LUSC PAAD PCPG PRAD READ "
    "SARC SKCM STAD THCA THYM UCEC"
).split()

In [3]:
# The raw downloads are read from <data_dir>/Original.
original_dir = "/".join([data_dir, "Original"])

# Directory listing without any entry whose name contains "DS_Store".
filenames = [name for name in os.listdir(original_dir) if "DS_Store" not in name]

# Bucket the files by the marker substring found in their names.
samples = [name for name in filenames if "Samples" in name]
FPKM = [name for name in filenames if "FPKM" in name]
TPM = [name for name in filenames if "TPM" in name]
feature_counts = [name for name in filenames if "FeatureCounts" in name]

In [4]:
# Load the sample-annotation files.
#   * A file whose name contains "Clinical" becomes `clinical_df`: it is indexed
#     by its "Unnamed: 1" column (renamed "Category"), the first two remaining
#     columns are dropped, and the table is transposed.
#   * Every other file is read as a header-less two-column table
#     (sample id, tumor type) and stored under "Samples" in `normal` or
#     `tumor`, depending on whether the file name mentions "Normal"/"normal".
for file in samples:
    is_clinical = re.search("Clinical", file) is not None

    # The context manager closes the gzip handle as soon as pandas has read
    # the table; a bare gzip.open() call would leave the handle open.
    with gzip.open("/".join((original_dir, file)), "rb") as unzipped:
        df = pd.read_table(unzipped, header=0 if is_clinical else None)

    if is_clinical:
        clinical_df = (
            df.set_index("Unnamed: 1")
              .rename_axis("Category")
              .iloc[:, 2:]
              .T
        )
        continue

    df.columns = ["Sample IDs", "Tumor Type"]
    df = df.set_index("Sample IDs")
    if re.search("Normal|normal", file):
        print("Normal")
        normal["Samples"] = df
    else:
        print("Tumor")
        tumor["Samples"] = df
print("Finished")

# Drop clinical columns in which at least half of the values are missing.
# Iterate over a snapshot of the labels because columns are removed in the loop.
for col in list(clinical_df.columns):
    null_count = clinical_df.loc[:, col].isnull().sum()
    # A duplicated column label makes `.loc` return a DataFrame, so the count
    # is a Series instead of a scalar; such columns are skipped.
    if isinstance(null_count, np.integer) and null_count / len(clinical_df) >= 0.5:
        clinical_df.pop(col)

# Remove bookkeeping columns; a missing column raises KeyError.
for col in ("form_completion_date", "bcr_patient_barcode", "bcr_patient_uuid"):
    clinical_df.pop(col)
print(clinical_df.shape)

Tumor
Normal
Finished
(9264, 83)


In [6]:
# Display the tumor FPKM table.
# NOTE(review): tumor["FPKM"] is only assigned by the FPKM-loading cell that
# appears further down in this notebook, so on a fresh kernel that cell has to
# be run before this one (otherwise this lookup raises KeyError).
tumor["FPKM"]

Unnamed: 0,TCGA-02-0047-01A-01R-1849-01,TCGA-02-0055-01A-01R-1849-01,TCGA-02-2483-01A-01R-1849-01,TCGA-02-2485-01A-01R-1849-01,TCGA-02-2486-01A-01R-1849-01,TCGA-04-1331-01A-01R-1569-13,TCGA-04-1332-01A-01R-1564-13,TCGA-04-1337-01A-01R-1564-13,TCGA-04-1338-01A-01R-1564-13,TCGA-04-1341-01A-01R-1564-13,...,TCGA-ZP-A9D1-01A-11R-A38B-07,TCGA-ZP-A9D2-01A-11R-A38B-07,TCGA-ZP-A9D4-01A-11R-A37K-07,TCGA-ZQ-A9CR-01A-11R-A39E-31,TCGA-ZS-A9CD-01A-11R-A37K-07,TCGA-ZS-A9CE-01A-11R-A37K-07,TCGA-ZS-A9CF-01A-11R-A38B-07,TCGA-ZS-A9CF-02A-11R-A38B-07,TCGA-ZS-A9CG-01A-11R-A37K-07,TCGA-ZX-AA5X-01A-11R-A42T-07
1/2-SBSRNA4,0.846503,0.761133,1.109231,0.828669,0.671029,1.764617,0.791659,1.857101,0.844635,0.591414,...,0.165196,0.301100,0.403439,0.365219,0.128858,0.251469,0.897534,1.914650,0.252794,0.643081
A1BG,4.235534,11.458650,9.240365,2.078497,2.839130,9.716383,2.056253,4.145549,3.685768,8.824124,...,1104.475468,18.050358,847.959659,2.177044,1107.493240,2152.696424,244.418320,106.745160,288.189396,2.509107
A1BG-AS1,0.481752,0.502274,0.498373,0.287216,0.289895,1.421635,0.306629,0.858092,0.339544,0.271625,...,3.339989,0.186014,4.451023,0.174347,4.643205,5.006610,1.814663,0.894201,1.486858,0.654916
A1CF,0.002834,0.007166,0.002184,0.002378,0.002592,0.010185,0.013708,0.008478,0.017858,0.007919,...,23.108476,15.382501,28.292449,0.939900,14.929011,22.279771,30.719047,44.234010,29.884221,0.004306
A2LD1,1.097625,1.434916,0.652466,2.489825,2.884248,1.524512,1.430877,1.593137,1.846506,1.083138,...,2.598445,4.280606,1.825018,1.053480,3.211426,5.062030,4.690572,5.913174,3.515261,1.314383
A2M,382.452985,444.093808,260.616058,93.564245,418.461006,73.967894,139.932829,59.930344,40.065133,11.912447,...,398.501003,894.680242,67.395588,238.643352,556.031883,124.611245,249.572748,333.045422,79.218668,116.895631
A2ML1,0.493443,0.033955,0.065979,2.298351,1.105308,2.500998,1.846221,0.028243,1.721390,0.098499,...,0.012634,0.020149,0.000000,0.024439,0.018395,0.000000,0.004711,0.022478,0.008202,43.048220
A2MP1,0.089937,0.037906,0.138647,0.113197,0.102827,0.080811,0.195773,0.067264,0.106269,0.041890,...,0.169244,0.179946,0.107158,0.000000,0.082143,0.025047,0.525873,0.160596,0.109874,0.102486
A4GALT,1.122992,11.631572,2.407783,4.245703,0.991739,12.414737,17.957652,9.562194,5.328033,5.585281,...,3.076785,2.005846,1.025311,22.669573,10.846241,0.524854,7.607867,1.544297,4.015950,25.378075
A4GNT,0.060990,0.038559,0.023506,0.012794,0.013946,0.131524,0.103260,0.068422,0.048044,0.007102,...,0.012753,0.101692,0.012112,0.172685,0.000000,0.000000,0.028530,0.040840,0.000000,0.023167


In [5]:
# Expression tables to load. Each key is the name the table is stored under in
# the `normal` / `tumor` dicts; each value is the list of files to read.
# Uncomment an entry to load that data type as well, each under its own key
# so that it does not overwrite the FPKM table.
expression_files = {
    "FPKM": FPKM,
    # "TPM": TPM,
    # "FeatureCounts": feature_counts,
}

for datatype_name, files in expression_files.items():
    for file in files:
        # Close the gzip handle as soon as pandas has finished reading.
        with gzip.open("/".join((original_dir, file)), "rb") as unzipped:
            df = pd.read_table(unzipped, index_col=0)
        # Files whose name mentions "Normal"/"normal" go to `normal`,
        # everything else to `tumor`.
        if re.search("Normal|normal", file):
            print("Normal")
            normal[datatype_name] = df
        else:
            print("Tumor")
            tumor[datatype_name] = df
    print("Finished %s" % datatype_name)

Tumor
Normal
Finished FPKM


In [None]:
# Build one merged table per tissue (sample annotation + clinical data for
# tumors + expression values) and split it into one table per cancer type.
normal_or_tumor = ["Tumor", "Normal"]

# To process only a subset of cancer types, override `cancertypes` here, e.g.
# cancertypes = ["COAD", "LIHC", "STAD"]

# Must match a key that has been loaded into the `normal` / `tumor` dicts
# ("FPKM", "FeatureCounts" or "TPM").
datatype = "FPKM"
dataframe_dict = {"Normal": normal, "Tumor": tumor}

# Writing to disk is off by default. Set to True to write one CSV per
# (tissue, cancer type) into `data_dir`.
write_csv = False

for tissue in normal_or_tumor:
    print("For %s" % tissue)
    df_dict = dataframe_dict[tissue]

    # Start from the sample table (indexed by sample id, one "Tumor Type" column).
    new_df = df_dict["Samples"]
    print("Samples: %i" % len(new_df.columns))

    # Clinical data is only joined onto the tumor samples.
    if tissue == "Tumor":
        print("Clinical: %i" % len(clinical_df.columns))
        new_df = new_df.join(clinical_df)

    # Transpose the expression table so that its original column labels become
    # the row index used by the join below.
    df_fpkm = df_dict[datatype].T
    print("%s: %i" % (datatype, len(df_fpkm.columns)))

    new_df = new_df.join(df_fpkm)
    for cancertype in cancertypes:
        # Boolean mask instead of appending rows one by one: same rows, but
        # column dtypes and the index name are kept.
        specific_df = new_df.loc[new_df["Tumor Type"] == cancertype]

        # `filename` must be assigned before the status messages use it;
        # leaving it undefined raises NameError on a fresh kernel.
        filename = "%s_%s_%s.csv" % (tissue, cancertype, datatype)
        if write_csv:
            print("Saving %s" % filename)
            specific_df.to_csv("/".join((data_dir, filename)))
            print("Finished %s\n" % filename)
        else:
            print("Prepared %s (%i rows); not written because write_csv is False\n"
                  % (filename, len(specific_df)))

print("Completely finished")