In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [7]:
# Load the raw table and discard the two columns that are not used as features.
og_df = pd.read_csv("datasets/heart_disease_uci.csv").drop(columns=["id", "dataset"])
# Randomly remove 30% of the rows whose target "num" equals 0.
# The seed is fixed so that a fresh kernel + Run All removes the same rows every
# time; without it every run produces a different dataset.
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3, random_state=42).index)
og_df  # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [8]:
# Feature columns handled as categorical (label-encoded by the preprocessing cells).
# The spelling of the name is kept as-is because other cells reference it.
categorial_columns = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "thal",
]
# Feature columns handled as numeric values.
cont_columns = [
    "age",
    "trestbps",
    "chol",
    "thalch",
    "oldpeak",
    "ca",
]

In [9]:
# Feature frame with every column rescaled by MinMaxScaler.
# Start from a fresh frame without the target so og_df itself is left untouched.
df_min_max = og_df.drop(columns="num")
# Categoricals: integer-encode each column, then min-max scale the codes.
# NOTE(review): in the displayed output, rows whose categorical value is missing
# end up at the highest encoded level instead of being mean-imputed below —
# confirm this is the intended treatment of missing categories.
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(
    df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
)
# Numeric columns: min-max scale with a separate scaler.
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
# Whatever is still NaN after scaling is replaced by that column's mean.
column_means = df_min_max.mean()
df_min_max = df_min_max.fillna(column_means)
del column_means  # keep the notebook namespace identical to before
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.708333,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.666667,0.0,0.556818,0.000000,0.000000,0.000000
1,0.791667,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.355556,0.5,0.465909,0.333333,1.000000,0.333333
2,0.791667,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.511111,0.5,0.590909,0.333333,0.666667,0.666667
4,0.250000,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.829630,0.0,0.454545,0.666667,0.000000,0.333333
5,0.562500,1.0,0.333333,0.600000,0.391376,0.0,0.333333,0.874074,0.0,0.386364,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,0.354167,1.0,0.000000,0.670000,0.514096,0.0,0.333333,0.488889,0.0,0.295455,1.000000,0.256313,0.333333
915,0.520833,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.696296,0.0,0.295455,1.000000,0.256313,1.000000
917,0.541667,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.296296,0.0,0.295455,1.000000,0.256313,0.000000
918,0.604167,1.0,0.000000,0.663524,0.638474,0.5,0.000000,0.561841,1.0,0.403916,1.000000,0.256313,1.000000


In [10]:
# Feature frame WITHOUT min-max scaling: categoricals are integer-encoded and the
# numeric columns keep their original units.
df_no_min_max = og_df.drop(columns="num")
# Encode each categorical column independently with its own LabelEncoder.
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(
    lambda column: LabelEncoder().fit_transform(column)
)
# Replace the remaining NaNs with the mean of their column.
df_no_min_max = df_no_min_max.fillna(value=df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.000000,233.0,1,0,150.000000,0,2.300000,0,0.000000,0
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3.000000,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2.000000,2
4,41,0,1,130.000000,204.0,0,0,172.000000,0,1.400000,2,0.000000,1
5,56,1,1,120.000000,236.0,0,1,178.000000,0,0.800000,2,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,1,0,134.000000,310.0,0,1,126.000000,0,0.000000,3,0.768939,1
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,0.768939,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,0.768939,0
918,58,1,0,132.704852,385.0,1,0,135.848525,2,0.954459,3,0.768939,3


In [11]:
class HeartDiseaseDataset(Dataset):
    """Torch dataset that serves each row as a (categorical, continuous) tensor pair.

    The columns are selected through the notebook-level lists
    ``categorial_columns`` and ``cont_columns``; both groups are stored as
    float tensors with one row per row of the input frame.
    """

    def __init__(self, df):
        """Split ``df`` into its categorical and continuous parts.

        Args:
            df: DataFrame containing every column named in
                ``categorial_columns`` and ``cont_columns``.
        """
        categorical_values = df[categorial_columns].to_numpy()
        continuous_values = df[cont_columns].to_numpy()
        self.cat = torch.tensor(categorical_values, dtype=torch.float32)
        self.cont = torch.tensor(continuous_values, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the ``(categorical, continuous)`` tensors at position ``idx``."""
        categorical_row = self.cat[idx]
        continuous_row = self.cont[idx]
        return categorical_row, continuous_row

    def __len__(self):
        """Return the number of rows held by the dataset."""
        return self.cat.size(0)

In [12]:
# Wrap the min-max-scaled frame in the torch dataset and serve it in shuffled
# mini-batches of 100 rows.
dataset = HeartDiseaseDataset(df=df_min_max)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=100,
    shuffle=True,
)
len(dataset)

797