In [1]:
import pandas as pd
import os
import csv
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Map Case ID to Primary Diagnosis and Gender
case_info = {}
label_df = pd.read_csv("processed_labels/clinical.tsv", sep='\t')[["case_id", "gender", "primary_diagnosis"]]
for i, row in label_df.iterrows(): 
    case_info[row.case_id] = {"gender": row.gender, "primary_diagnosis": row.primary_diagnosis}


In [10]:
# View Data 

# Example ----------------------------------------------------------------------------------
case_id = "c31900a4-5dcd-4022-97ac-638e86e889e4"
file_name = "TCGA-BRCA.52dbb525-a0ad-4dce-bb51-650f97510c6e.ascat3.gene_level_copy_number.v36.tsv"
datapath = "processed_data/" + case_id + "/" + file_name

df = pd.read_csv(datapath, sep='\t')
# Additional Processing
df = df.drop('gene_id', axis=1)
df = df.fillna(0)
df


Unnamed: 0,gene_name,chromosome,start,end,copy_number,min_copy_number,max_copy_number
0,DDX11L1,chr1,11869,14409,0.0,0.0,0.0
1,WASH7P,chr1,14404,29570,0.0,0.0,0.0
2,MIR6859-1,chr1,17369,17436,0.0,0.0,0.0
3,MIR1302-2HG,chr1,29554,31109,0.0,0.0,0.0
4,MIR1302-2,chr1,30366,30503,0.0,0.0,0.0
...,...,...,...,...,...,...,...
60618,IL9R,chrY,57184101,57197337,0.0,0.0,0.0
60619,AJ271736.1,chrY,57190738,57208756,0.0,0.0,0.0
60620,WASIR1,chrY,57201143,57203357,0.0,0.0,0.0
60621,WASH6P,chrY,57207346,57212230,0.0,0.0,0.0


In [8]:
# FUNCTION: Encode Input Data into Numeric Array Data 
def encode_data(datapath): 
    # Load DF
    df = pd.read_csv(datapath, sep='\t') 

    # Additional Processing
    df = df.drop('gene_id', axis=1)
    df = df.fillna(0)
    df["gene_length"] = abs(df.start - df.end)

    # Perform one-hot encoding for categorical variables (gene_name and chromosome)
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(df[["gene_name", "chromosome"]])

    # Combine Data 
    df = df.drop(['gene_name', 'chromosome'], axis=1)
    df_arr = df.to_numpy()
    arr_data = np.concatenate([df_arr, encoded_data], axis=1)

    return arr_data


# Example ----------------------------------------------------------------------------------
case_id = "c31900a4-5dcd-4022-97ac-638e86e889e4"
file_name = "TCGA-BRCA.52dbb525-a0ad-4dce-bb51-650f97510c6e.ascat3.gene_level_copy_number.v36.tsv"
datapath = "processed_data/" + case_id + "/" + file_name

# Set up Labels, x = matrix(df) and y = prognosis
gender = case_info[case_id]["gender"]
x = encode_data(datapath)
y = case_info[case_id]["primary_diagnosis"]

print(y) 
print(gender)
x 

Infiltrating duct carcinoma, NOS
female


array([[1.1869000e+04, 1.4409000e+04, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.4404000e+04, 2.9570000e+04, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.7369000e+04, 1.7436000e+04, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [5.7201143e+07, 5.7203357e+07, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [5.7207346e+07, 5.7212230e+07, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [5.7212184e+07, 5.7214397e+07, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00]])

In [9]:
# Encode All Processed Data - Takes about 1 min. per file
gender_list = []
x_list = []
y_list = [] 

for case_id in tqdm(os.listdir("processed_data")): 
    # Get Diagnosis
    primary_diagnosis = case_info[case_id]["primary_diagnosis"]
    gender = case_info[case_id]["gender"]

    # Encode Data 
    for file_name in os.listdir("processed_data/" + case_id): 
        datapath = "processed_data/" + case_id + "/" + file_name 
        x_list.append(encode_data(datapath))

        y_list.append(primary_diagnosis)
        gender_list.append(gender)

# Write Out to CSV File 
data_list = zip(gender_list, x_list, y_list)
model_data_df = pd.DataFrame(data_list, columns=["gender", "patient_data", "label"])
model_data_df.to_csv("model_data.tsv", sep='\t', index=False)
model_data_df

  0%|          | 0/18 [00:00<?, ?it/s]

: 

In [48]:
# Test Loading Data
model_data_df = pd.read_csv("model_data.tsv", sep='\t')
model_data_df

Unnamed: 0,patient_data,label
0,[[ 11869 14409 0 ... 0 ...,"Infiltrating duct carcinoma, NOS"
