In [1]:
# Data Processing Libraries
import pandas as pd
import os
import csv
import numpy as np
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt

# Model Libraries 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import regularizers

2023-12-06 21:19:24.320001: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Peek at a single sample's gene-level copy-number table (example)
datapath = "../data/processed_data/0045349c-69d9-4306-a403-c9c1fa836644/4ebb1ec1-1b17-49f8-b5ca-79893b22049b.wgs.ASCAT.gene_level.copy_number_variation.tsv"
# The file is tab-delimited; `delimiter` is pandas' alias for `sep`.
df = pd.read_csv(datapath, delimiter='\t')
# Bare last expression -> rich (truncated) DataFrame display in the notebook
df


Unnamed: 0,gene_id,gene_name,chromosome,start,end,copy_number,min_copy_number,max_copy_number
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,2.0,2.0,2.0
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,2.0,2.0,2.0
2,ENSG00000278267.1,MIR6859-1,chr1,17369,17436,2.0,2.0,2.0
3,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,2.0,2.0,2.0
4,ENSG00000284332.1,MIR1302-2,chr1,30366,30503,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...
60618,ENSG00000124334.17_PAR_Y,IL9R,chrY,57184101,57197337,,,
60619,ENSG00000270726.6_PAR_Y,AJ271736.1,chrY,57190738,57208756,,,
60620,ENSG00000185203.12_PAR_Y,WASIR1,chrY,57201143,57203357,,,
60621,ENSG00000182484.15_PAR_Y,WASH6P,chrY,57207346,57212230,,,


### Load, Clean, and Process Data

In [3]:
# Map Case ID to Race and Primary Diagnosis
# Only these three columns of the clinical table are used, so only they are parsed.
label_cols = ["case_id", "race", "primary_diagnosis"]
label_df = pd.read_csv(
    "../data/processed_labels/clinical.tsv", sep='\t', usecols=label_cols
)[label_cols]  # re-select to pin the column order regardless of file order

# case_id -> {"race": ..., "primary_diagnosis": ...}
# If a case_id appears on several rows, the last row wins (dict overwrite).
case_info = {
    row.case_id: {"race": row.race, "primary_diagnosis": row.primary_diagnosis}
    for row in label_df.itertuples(index=False)
}


In [4]:
# FUNCTION: Encode Input Data into Numeric Array Data
#          [note - final array only includes CNV, gene length, and one-hot chromosome]
def encode_data(datapath, chromosomes=None):
    """Encode one gene-level copy-number TSV as a 2-D numeric array.

    Every row of the file becomes one row of the result. The columns kept
    are ``copy_number``, the min-max scaled ``gene_length`` (``|start - end|``),
    and one 0/1 indicator column per chromosome, in that order.

    Parameters
    ----------
    datapath : str or path-like
        Tab-separated file that contains at least the columns ``gene_id``,
        ``gene_name``, ``chromosome``, ``start``, ``end``, ``copy_number``,
        ``min_copy_number`` and ``max_copy_number``.
    chromosomes : sequence of str, optional
        Fixed chromosome vocabulary, e.g. ``["chr1", ..., "chrX", "chrY"]``.
        When given, exactly one indicator column is emitted per entry, in
        the given order, even for chromosomes missing from this file, so
        every sample gets the same feature width. Chromosome values not in
        the vocabulary get all-zero indicators. When omitted (default), the
        indicator columns are whatever chromosomes occur in this file, so
        the width may differ between files.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, 2 + n_chromosome_columns)``.

    Notes
    -----
    Missing values are replaced with 0, so a missing copy number cannot be
    told apart from a true copy number of 0 in the result.
    """
    # Load DF
    df = pd.read_csv(datapath, sep='\t')

    # Additional Processing
    df = df.drop(columns=['gene_name', 'gene_id', 'max_copy_number', 'min_copy_number'])
    df = df.fillna(0)
    df["gene_length"] = (df.start - df.end).abs()

    # Perform one-hot encoding for the categorical chromosome column
    if chromosomes is not None:
        # A categorical dtype makes get_dummies emit a column for every
        # listed category, including those absent from this file.
        df["chromosome"] = pd.Categorical(df["chromosome"], categories=list(chromosomes))
    # astype(int) turns the indicators into 0/1; it also truncates any
    # fractional copy_number toward zero.
    df = pd.get_dummies(df, columns=['chromosome']).astype(int)

    # Clean and Normalize
    df = df.drop(['start', 'end'], axis=1)
    length_min = df['gene_length'].min()
    length_span = df['gene_length'].max() - length_min
    if length_span > 0:
        df['gene_length'] = (df['gene_length'] - length_min) / length_span
    else:
        # All lengths identical (or a single row): min-max scaling would be
        # 0/0 and fill the column with NaN, so use a constant 0.0 instead.
        df['gene_length'] = 0.0

    # Convert to Array
    arr_data = df.to_numpy()

    return arr_data


# Example ----------------------------------------------------------------------------------
# Encode one known sample end-to-end as a quick sanity check.
case_id = "0045349c-69d9-4306-a403-c9c1fa836644"
datapath = "../data/processed_data/0045349c-69d9-4306-a403-c9c1fa836644/4ebb1ec1-1b17-49f8-b5ca-79893b22049b.wgs.ASCAT.gene_level.copy_number_variation.tsv"


# Features and labels: x = encoded matrix, y = primary diagnosis
sample_info = case_info[case_id]
race = sample_info["race"]
x = encode_data(datapath)
y = sample_info["primary_diagnosis"]

print(y)
print(race)

# (rows, features) of the encoded sample
x.shape


Adenoid cystic carcinoma
white


(60623, 26)

In [6]:
# Encode All Processed Data [Takes 5 minutes]
# Layout on disk: <PROCESSED_DATA_DIR>/<case_id>/<one or more sample files>
PROCESSED_DATA_DIR = "../data/processed_data"

race_list = []
x_list = []
y_list = []

# data cleaning tracker
cases_discarded = []

# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# keeps the row order of the saved dataset reproducible between runs.
for case_id in tqdm(sorted(os.listdir(PROCESSED_DATA_DIR))):
    # CLEAN DATA: Discard data pt.'s with no label
    if case_id not in case_info:
        cases_discarded.append(case_id)
        continue

    # Get Diagnosis
    primary_diagnosis = case_info[case_id]["primary_diagnosis"]
    race = case_info[case_id]["race"]

    # Encode Data (every file of a case shares that case's labels)
    case_dir = os.path.join(PROCESSED_DATA_DIR, case_id)
    for file_name in sorted(os.listdir(case_dir)):
        datapath = os.path.join(case_dir, file_name)
        x_list.append(encode_data(datapath))

        y_list.append(primary_diagnosis)
        race_list.append(race)

print("\n----- CLEANED DATA ------")
print("Cases Discarded:", cases_discarded)
print("\n-------------------------")

# Write Out to Pickle File (each sample_data cell holds a 2-D array)
# Materialize the rows so data_list stays usable after building the frame.
data_list = list(zip(race_list, x_list, y_list))
model_data_df = pd.DataFrame(data_list, columns=["race", "sample_data", "label"])
model_data_df.to_pickle("../data/model_data.pkl")
model_data_df

100%|██████████| 902/902 [04:02<00:00,  3.72it/s]



----- CLEANED DATA ------
Cases Discarded: []

-------------------------


Unnamed: 0,race,sample_data,label
0,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Metaplastic carcinoma, NOS"
1,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Metaplastic carcinoma, NOS"
2,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
3,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
4,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
...,...,...,...
1495,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1496,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1497,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1498,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
