In [1]:
# Data Processing Libraries
import pandas as pd
import os
import csv
import numpy as np
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt

# Model Libraries 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras import regularizers

# # Tabnet Classifier
# from tabnet_package.tabnet import TabNetClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
import math 

2023-12-11 22:56:50.318897: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Model Data 
Checkpoint: Load cached data here

In [2]:
# Load the cached model data frame from the pickle checkpoint.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that this project produced itself.
with open('../data/model_data.pkl', 'rb') as model_data_file:
    # The context manager closes the handle even if unpickling fails.
    model_data_df = pickle.load(model_data_file)
model_data_df

Unnamed: 0,race,sample_data,label
0,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Metaplastic carcinoma, NOS"
1,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Metaplastic carcinoma, NOS"
2,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
3,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
4,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
...,...,...,...
1495,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1496,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1497,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"
1498,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...","Infiltrating duct carcinoma, NOS"


### Preprocess Data

In [3]:
# Integer-encode the diagnosis labels (label encoding, not one-hot):
# pd.factorize maps each distinct label string to an integer code.
# .assign builds a new frame, so the label column is never written into a
# column-subset of model_data_df (avoids SettingWithCopyWarning).
encoded_df = model_data_df[["race", "sample_data"]].assign(
    label=pd.factorize(model_data_df.label)[0]
)
encoded_df


Unnamed: 0,race,sample_data,label
0,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",0
1,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",0
2,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
3,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
4,white,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
...,...,...,...
1495,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
1496,not reported,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
1497,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1
1498,black or african american,"[[0.0, 0.0010240421486530794, 1.0, 0.0, 0.0, 0...",1


In [4]:
# One sub-frame per race value used in the experiments below.
white_df = encoded_df.loc[encoded_df["race"].eq("white")]
black_df = encoded_df.loc[encoded_df["race"].eq("black or african american")]
asian_df = encoded_df.loc[encoded_df["race"].eq("asian")]
# Distinct values found in the label column of encoded_df.
prognosis_names = encoded_df["label"].unique()

# Train Test Split
def df_split(df, train_frac=0.7):
    """Split a data frame by row position into a leading and a trailing part.

    The split is purely positional: nothing is shuffled or stratified, so the
    result depends on the row order of ``df``.

    Args:
        df: Object supporting ``len()`` and ``.iloc`` slicing (a DataFrame).
        train_frac: Fraction of rows, within [0, 1], placed in the first
            part. Defaults to 0.7.

    Returns:
        Tuple ``(train, test)``: the first ``int(train_frac * len(df))`` rows
        and all remaining rows.

    Raises:
        ValueError: If ``train_frac`` lies outside [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac}")
    split_index = int(train_frac * len(df))
    return df.iloc[:split_index], df.iloc[split_index:]

# Positional train/test split of every race cohort, in one pass.
(
    (train_white_df, test_white_df),
    (train_black_df, test_black_df),
    (train_asian_df, test_asian_df),
) = (df_split(cohort_df) for cohort_df in (white_df, black_df, asian_df))

# Final training set: the white cohort only.
# Alternative that pools all three cohorts:
# train_df = pd.concat([train_white_df, train_black_df, train_asian_df])
train_df = train_white_df

### SVM Classifier Model
Train on White patients; test on Black and Asian patients.

In [5]:
# Prepare Model Data --> [takes 2 min.]
# Fixed seed: PCA's default 'auto' solver may select the randomized SVD for
# large inputs, which would otherwise let the features vary between runs.
# 42 matches the seed used for the train/validation split.
pca = PCA(n_components=1, random_state=42)

def get_x_y(df):
    """Build the feature matrix and label vector for one data frame.

    Each entry of ``df.sample_data`` is transposed and reduced to a single
    principal component by the module-level ``pca``, which is refit on every
    sample. Each per-sample result is then flattened into one feature row.

    NOTE(review): assumes every ``sample_data`` entry is a 2-D array-like of
    the same shape, so the per-sample results stack into a regular array --
    TODO confirm.

    Args:
        df: Data frame with ``sample_data`` and ``label`` columns.

    Returns:
        Tuple ``(x, y)``: ``x`` holds one flattened row per row of ``df``;
        ``y`` is ``df.label.values``.
    """
    # One PCA score per row of the transposed sample (original note: "shrink to 26 values").
    x_init = np.array([pca.fit_transform(np.transpose(data)) for data in df.sample_data.values])
    # Collapse the trailing component axis so each sample becomes a flat vector.
    x = x_init.reshape(x_init.shape[0], -1)
    y = df.label.values
    return x, y

def train_val_split(data):
    """Divide an ``(x, y)`` pair into training and validation subsets.

    Holds out 15% of the rows for validation, using a fixed seed of 42.

    Returns:
        ``x_train, x_val, y_train, y_val`` as produced by ``train_test_split``.
    """
    features, targets = data
    return train_test_split(features, targets, random_state=42, test_size=0.15)


##### Main Code #########
# Training and validation arrays come from the training frame only.
x_train, x_val, y_train, y_val = train_val_split(get_x_y(train_df))

# Held-out feature/label arrays for each cohort (white, black, asian, in that order).
(
    (x_test_white, y_test_white),
    (x_test_black, y_test_black),
    (x_test_asian, y_test_asian),
) = (get_x_y(test_df) for test_df in (test_white_df, test_black_df, test_asian_df))

In [6]:
# Build a linear-kernel SVM and fit it on the training split in one step
# (fit returns the estimator). Other kernels: 'rbf', 'poly', etc.
svm = SVC(kernel='linear').fit(x_train, y_train)

# Score the fitted model on the held-out Black cohort and report it.
accuracy = svm.score(x_test_black, y_test_black)
print("Black Patients", "-------------------------", sep="\n")
print("Accuracy:", accuracy)


Black Patients
-------------------------
Accuracy: 0.7564102564102564


In [7]:
# Score the already-fitted SVM on the held-out Asian cohort and report it.
accuracy = svm.score(x_test_asian, y_test_asian)
print("Asian Patients", "-------------------------", sep="\n")
print("Accuracy:", accuracy)

Asian Patients
-------------------------
Accuracy: 0.6785714285714286
