In [1]:
import pandas as pd
import os
import csv
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Map Case ID to Primary Diagnosis and Gender
case_info = {}
label_df = pd.read_csv("processed_labels/clinical.tsv", sep='\t')[["case_id", "gender", "primary_diagnosis"]]
for i, row in label_df.iterrows(): 
    case_info[row.case_id] = {"gender": row.gender, "primary_diagnosis": row.primary_diagnosis}


In [49]:
# View Data Example
datapath = "processed_data/c31900a4-5dcd-4022-97ac-638e86e889e4/TCGA-BRCA.52dbb525-a0ad-4dce-bb51-650f97510c6e.absolute_liftover.gene_level_copy_number.v36.tsv" 
df = pd.read_csv(datapath, sep='\t')
df


Unnamed: 0,gene_id,gene_name,chromosome,start,end,copy_number,min_copy_number,max_copy_number
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,,,
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,,,
2,ENSG00000278267.1,MIR6859-1,chr1,17369,17436,,,
3,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,,,
4,ENSG00000284332.1,MIR1302-2,chr1,30366,30503,,,
...,...,...,...,...,...,...,...,...
60618,ENSG00000124334.17_PAR_Y,IL9R,chrY,57184101,57197337,,,
60619,ENSG00000270726.6_PAR_Y,AJ271736.1,chrY,57190738,57208756,,,
60620,ENSG00000185203.12_PAR_Y,WASIR1,chrY,57201143,57203357,,,
60621,ENSG00000182484.15_PAR_Y,WASH6P,chrY,57207346,57212230,,,


In [50]:
# FUNCTION: Encode Input Data into Numeric Array Data 
def encode_data(datapath): 
    # Load DF
    df = pd.read_csv(datapath, sep='\t') 

    # Additional Processing
    df = df.drop(columns=['gene_name', 'gene_id', 'max_copy_number', 'min_copy_number'], axis=1)
    df = df.fillna(0)
    df["gene_length"] = abs(df.start - df.end)

    # Perform one-hot encoding for categorical variables (gene_name and chromosome)
    df = pd.get_dummies(df, columns=['chromosome']).astype(int)

    # Combine Data 
    df = df.drop(['start', 'end'], axis=1)
    arr_data = df.to_numpy()

    return arr_data


# Example ----------------------------------------------------------------------------------
case_id = "c31900a4-5dcd-4022-97ac-638e86e889e4"
file_name = "TCGA-BRCA.52dbb525-a0ad-4dce-bb51-650f97510c6e.ascat3.gene_level_copy_number.v36.tsv"
datapath = "processed_data/" + case_id + "/" + file_name

# Set up Labels, x = matrix(df) and y = prognosis
gender = case_info[case_id]["gender"]
x = encode_data(datapath)
y = case_info[case_id]["primary_diagnosis"]

print(y) 
print(gender)
x 

Infiltrating duct carcinoma, NOS
female


array([[    0,  2540,     1, ...,     0,     0,     0],
       [    0, 15166,     1, ...,     0,     0,     0],
       [    0,    67,     1, ...,     0,     0,     0],
       ...,
       [    0,  2214,     0, ...,     0,     0,     1],
       [    0,  4884,     0, ...,     0,     0,     1],
       [    0,  2213,     0, ...,     0,     0,     1]])

In [54]:
# Encode All Processed Data - Takes about 1 min. per file
gender_list = []
x_list = []
y_list = [] 

for case_id in tqdm(os.listdir("processed_data")): 
    # Get Diagnosis
    primary_diagnosis = case_info[case_id]["primary_diagnosis"]
    gender = case_info[case_id]["gender"]

    # Encode Data 
    for file_name in os.listdir("processed_data/" + case_id): 
        datapath = "processed_data/" + case_id + "/" + file_name 
        x_list.append(encode_data(datapath))

        y_list.append(primary_diagnosis)
        gender_list.append(gender)

# Write Out to CSV File 
data_list = zip(gender_list, x_list, y_list)
model_data_df = pd.DataFrame(data_list, columns=["gender", "patient_data", "label"])
model_data_df.to_csv("model_data.tsv", sep='\t', index=False)
model_data_df

 33%|███▎      | 6/18 [00:01<00:03,  3.39it/s]


KeyError: 'fe2cd610-aa52-4789-ac62-7683281bb22f'

In [48]:
# Test Loading Data
model_data_df = pd.read_csv("model_data.tsv", sep='\t')
model_data_df

Unnamed: 0,patient_data,label
0,[[ 11869 14409 0 ... 0 ...,"Infiltrating duct carcinoma, NOS"
