In [6]:
"""Reported (reproduced) results of the TabNet model in the original paper
https://arxiv.org/abs/1908.07442.

Forest Cover Type: 96.99 (96.53)
KDD Census Income: 95.5 (95.41)
"""

import numpy as np
import pandas as pd 

import argparse
import os.path as osp

import torch
import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR
from tqdm import tqdm

from torch_frame import stype
from torch_frame.data import Dataset, DataLoader
from torch_frame.datasets import ForestCoverType, KDDCensusIncome
from torch_frame.nn import TabNet
from torch_frame.nn import (
    EmbeddingEncoder,
    FTTransformer,
    LinearBucketEncoder,
    LinearEncoder,
    LinearPeriodicEncoder,
    ResNet,
)

In [3]:
# Command-line style hyperparameters for the TabNet run. In this notebook they
# are parsed from an explicit argument list rather than from sys.argv.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default="ForestCoverType",
                    choices=["ForestCoverType", "KDDCensusIncome"])
parser.add_argument('--channels', type=int, default=128)
# gamma is a fractional coefficient (default 1.2), so it must be parsed as a
# float; with type=int a value such as "1.5" is rejected by argparse.
parser.add_argument('--gamma', type=float, default=1.2)
parser.add_argument('--num_layers', type=int, default=6)
parser.add_argument('--batch_size', type=int, default=4096)
parser.add_argument('--lr', type=float, default=0.005)
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--compile', action='store_true')

_StoreTrueAction(option_strings=['--compile'], dest='compile', nargs=0, const=True, default=False, type=None, choices=None, required=False, help=None, metavar=None)

하이퍼 파라미터 조절

In [5]:
# Hyperparameter overrides for this notebook run, expressed as flag -> value
# pairs and flattened into the argv-style list that argparse expects.
cli_overrides = {
    "--channels": "256",
    "--num_layers": "4",
    "--batch_size": "256",
    "--lr": "0.0001",
    "--epochs": "15",
}
argv = [token for flag_and_value in cli_overrides.items()
        for token in flag_and_value]
args = parser.parse_args(argv)

# Seed PyTorch's RNG and pick the GPU when one is available.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## 데이터 불러오기 ( => 이 부분만 조작)

- object => categorical
- int / float => numerical

### Alzheimers_Prediction

In [39]:
# Load the Alzheimer's prediction CSV (path is relative to the working
# directory) and print its column / dtype summary.
df = pd.read_csv("alzheimers_prediction_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  object 
 11  Family History 

In [40]:
df["Alzheimer’s Diagnosis"].unique()    # whether the person was diagnosed

array(['No', 'Yes'], dtype=object)

- object => categorical
- int / float => numerical

In [41]:
from torch_frame import numerical, categorical, text_embedded, embedding


## Semantic type (stype) per column.
## Rule used in this notebook: object dtype -> categorical, int / float -> numerical.
numerical_columns = {"Age", "Education Level", "BMI", "Cognitive Test Score"}

# Columns are listed explicitly so the mapping keeps a fixed insertion order.
ordered_columns = [
    "Country",
    "Age",
    "Gender",
    "Education Level",
    "BMI",
    "Physical Activity Level",
    "Smoking Status",
    "Alcohol Consumption",
    "Diabetes",
    "Hypertension",
    "Cholesterol Level",
    "Family History of Alzheimer’s",
    "Cognitive Test Score",
    "Depression Level",
    "Sleep Quality",
    "Dietary Habits",
    "Air Pollution Exposure",
    "Employment Status",
    "Marital Status",
    "Genetic Risk Factor (APOE-ε4 allele)",
    "Social Engagement Level",
    "Income Level",
    "Stress Levels",
    "Urban vs Rural Living",
    "Alzheimer’s Diagnosis",  # prediction target
]

col_to_stype = {
    column: numerical if column in numerical_columns else categorical
    for column in ordered_columns
}

dataset = Dataset(
    df=df,
    col_to_stype=col_to_stype,
    target_col="Alzheimer’s Diagnosis",
)

dataset.materialize()

Dataset()

이 코드로 하니까 되네??? <br>
DataFrame 형식이 아니라 이제 materialize가 됨. <br>
나중에 더 자세히 알아보쟈 

In [42]:
## Split by fractional position: [0, 0.6) train, [0.6, 0.7) validation,
## [0.7, 1] test.
## NOTE(review): no shuffle is applied before slicing, so the split presumably
## follows the CSV row order - confirm this is intended.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

In [43]:
# Classification task
is_classification = True

# Set manually: the target "Alzheimer’s Diagnosis" has two classes ('No' / 'Yes').
# Every other dataset section defines out_channels; defining it here as well
# keeps the model cell from raising NameError (or silently reusing a stale
# value) when only this section is run.
out_channels = 2

In [45]:
# Pull the materialized TensorFrame out of each split, then wrap each one in a
# DataLoader. Only the training loader shuffles.
train_tensor_frame, val_tensor_frame, test_tensor_frame = (
    split.tensor_frame
    for split in (train_dataset, val_dataset, test_dataset)
)
loader_kwargs = dict(batch_size=args.batch_size)
train_loader = DataLoader(train_tensor_frame, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_tensor_frame, **loader_kwargs)
test_loader = DataLoader(test_tensor_frame, **loader_kwargs)

In [None]:
# Show which column names ended up under each stype in the training TensorFrame.
print(train_tensor_frame.col_names_dict)

{<stype.categorical: 'categorical'>: ['Air Pollution Exposure', 'Alcohol Consumption', 'Cholesterol Level', 'Country', 'Depression Level', 'Diabetes', 'Dietary Habits', 'Employment Status', 'Family History of Alzheimer’s', 'Gender', 'Genetic Risk Factor (APOE-ε4 allele)', 'Hypertension', 'Income Level', 'Marital Status', 'Physical Activity Level', 'Sleep Quality', 'Smoking Status', 'Social Engagement Level', 'Stress Levels', 'Urban vs Rural Living'], <stype.numerical: 'numerical'>: ['Age', 'BMI', 'Cognitive Test Score', 'Education Level']}


### Heart Disease Risk

In [13]:
# Classification task
is_classification = True

In [14]:
# Load the heart-disease-risk CSV (path is relative to the working directory)
# and print its column / dtype summary.
df = pd.read_csv("heart_disease_risk_dataset_earlymed.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Chest_Pain           70000 non-null  float64
 1   Shortness_of_Breath  70000 non-null  float64
 2   Fatigue              70000 non-null  float64
 3   Palpitations         70000 non-null  float64
 4   Dizziness            70000 non-null  float64
 5   Swelling             70000 non-null  float64
 6   Pain_Arms_Jaw_Back   70000 non-null  float64
 7   Cold_Sweats_Nausea   70000 non-null  float64
 8   High_BP              70000 non-null  float64
 9   High_Cholesterol     70000 non-null  float64
 10  Diabetes             70000 non-null  float64
 11  Smoking              70000 non-null  float64
 12  Obesity              70000 non-null  float64
 13  Sedentary_Lifestyle  70000 non-null  float64
 14  Family_History       70000 non-null  float64
 15  Chronic_Stress       70000 non-null 

In [15]:
# Inspect the distinct values of the target column.
df["Heart_Risk"].unique()

array([0., 1.])

In [16]:
# Cast the float 0./1. target to int (in place on df), then display the
# distinct values again to confirm the conversion.
df["Heart_Risk"] = df["Heart_Risk"].astype(int)
df["Heart_Risk"].unique()

array([0, 1])

In [23]:
## Set manually: number of model outputs (Heart_Risk takes two values, 0 and 1)
out_channels = 2

In [20]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

## Semantic type (stype) per column: every column, including the
## 'Heart_Risk' target, is declared numerical.
all_columns = [
    "Chest_Pain",
    "Shortness_of_Breath",
    "Fatigue",
    "Palpitations",
    "Dizziness",
    "Swelling",
    "Pain_Arms_Jaw_Back",
    "Cold_Sweats_Nausea",
    "High_BP",
    "High_Cholesterol",
    "Diabetes",
    "Smoking",
    "Obesity",
    "Sedentary_Lifestyle",
    "Family_History",
    "Chronic_Stress",
    "Gender",
    "Age",
    "Heart_Risk",  # prediction target
]
col_to_stype = dict.fromkeys(all_columns, numerical)

dataset = Dataset(
    df=df,
    col_to_stype=col_to_stype,
    target_col='Heart_Risk',
)

dataset.materialize()

## Split by fractional position: [0, 0.6) train, [0.6, 0.7) validation,
## [0.7, 1] test.
## NOTE(review): no shuffle is applied before slicing - confirm this is intended.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

In [21]:
# Pull the materialized TensorFrame out of each split, then wrap each one in a
# DataLoader. Only the training loader shuffles.
train_tensor_frame, val_tensor_frame, test_tensor_frame = (
    split.tensor_frame
    for split in (train_dataset, val_dataset, test_dataset)
)
loader_kwargs = dict(batch_size=args.batch_size)
train_loader = DataLoader(train_tensor_frame, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_tensor_frame, **loader_kwargs)
test_loader = DataLoader(test_tensor_frame, **loader_kwargs)

### Thyroid Cancer Risk

In [29]:
# Classification task
is_classification = True

In [30]:
# Load the thyroid-cancer-risk CSV (path is relative to the working directory)
# and print its column / dtype summary.
df = pd.read_csv("thyroid_cancer_risk_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212691 entries, 0 to 212690
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Patient_ID           212691 non-null  int64  
 1   Age                  212691 non-null  int64  
 2   Gender               212691 non-null  object 
 3   Country              212691 non-null  object 
 4   Ethnicity            212691 non-null  object 
 5   Family_History       212691 non-null  object 
 6   Radiation_Exposure   212691 non-null  object 
 7   Iodine_Deficiency    212691 non-null  object 
 8   Smoking              212691 non-null  object 
 9   Obesity              212691 non-null  object 
 10  Diabetes             212691 non-null  object 
 11  TSH_Level            212691 non-null  float64
 12  T3_Level             212691 non-null  float64
 13  T4_Level             212691 non-null  float64
 14  Nodule_Size          212691 non-null  float64
 15  Thyroid_Cancer_Ri

In [31]:
df["Diagnosis"].unique()    # diagnosis outcome (the column to predict)

array(['Benign', 'Malignant'], dtype=object)

In [32]:
## Set manually: number of model outputs (Diagnosis has two classes)
out_channels = 2

In [33]:
from torch_frame import numerical, categorical, text_embedded, embedding

## Semantic type (stype) per column.
## Rule used in this notebook: object dtype -> categorical, int / float -> numerical.
## "Patient_ID" is deliberately left out of the mapping (presumably so the
## identifier is not used as a feature - confirm against torch_frame's handling
## of unmapped columns).
col_to_stype={#"Patient_ID" : numerical,
              "Age" : numerical,
              "Gender" : categorical,
              "Country" : categorical,
              "Ethnicity" : categorical,
              "Family_History" : categorical,
              "Radiation_Exposure" : categorical,
              "Iodine_Deficiency" : categorical,
              "Smoking" : categorical,
              "Obesity" : categorical,
              "Diabetes" : categorical,
              "TSH_Level" : numerical,
              "T3_Level" : numerical,
              "T4_Level" : numerical,
              # Nodule_Size is a float64 measurement, so it is numerical like
              # the other lab values; treating it as categorical would turn
              # each distinct size into its own category.
              "Nodule_Size" : numerical,
              "Thyroid_Cancer_Risk" : categorical,
              "Diagnosis" : categorical}

dataset = Dataset(df=df,
                  col_to_stype=col_to_stype,
                  target_col='Diagnosis')

dataset.materialize()

Dataset()

In [34]:
## Split by fractional position: [0, 0.6) train, [0.6, 0.7) validation,
## [0.7, 1] test.
## NOTE(review): no shuffle is applied before slicing, so the split presumably
## follows the CSV row order - confirm this is intended.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

In [35]:
# Pull the materialized TensorFrame out of each split, then wrap each one in a
# DataLoader. Only the training loader shuffles.
train_tensor_frame, val_tensor_frame, test_tensor_frame = (
    split.tensor_frame
    for split in (train_dataset, val_dataset, test_dataset)
)
loader_kwargs = dict(batch_size=args.batch_size)
train_loader = DataLoader(train_tensor_frame, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_tensor_frame, **loader_kwargs)
test_loader = DataLoader(test_tensor_frame, **loader_kwargs)

### Breast Cancer

https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset

In [58]:
## Classification task
is_classification = True

In [59]:
# Load the breast-cancer CSV (path is relative to the working directory) and
# print its column / dtype summary.
df = pd.read_csv("breast-cancer.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

특이점 : 예측 칼럼 제외 모두 수치형 변수

In [60]:
df["diagnosis"].unique()    # this is the column to predict

array(['M', 'B'], dtype=object)

In [61]:
## Set manually: number of model outputs (diagnosis has two classes, 'M' and 'B')
out_channels = 2

- M : Malignant
- B : Benign

In [62]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

## Semantic type (stype) per column.
## Each of the ten measurements appears with three suffixes (_mean, _se,
## _worst); all thirty are numerical. Only the 'diagnosis' target is categorical.
measurements = [
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave points",
    "symmetry",
    "fractal_dimension",
]
# Outer loop over the suffix keeps the order: all *_mean, then *_se, then *_worst.
col_to_stype = {
    f"{measurement}_{statistic}": numerical
    for statistic in ("mean", "se", "worst")
    for measurement in measurements
}
col_to_stype["diagnosis"] = categorical

dataset = Dataset(
    df=df,
    col_to_stype=col_to_stype,
    target_col='diagnosis',
)

dataset.materialize()

## Split by fractional position: [0, 0.6) train, [0.6, 0.7) validation,
## [0.7, 1] test.
## NOTE(review): no shuffle is applied before slicing - confirm this is intended.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

In [63]:
# Pull the materialized TensorFrame out of each split, then wrap each one in a
# DataLoader. Only the training loader shuffles.
train_tensor_frame, val_tensor_frame, test_tensor_frame = (
    split.tensor_frame
    for split in (train_dataset, val_dataset, test_dataset)
)
loader_kwargs = dict(batch_size=args.batch_size)
train_loader = DataLoader(train_tensor_frame, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_tensor_frame, **loader_kwargs)
test_loader = DataLoader(test_tensor_frame, **loader_kwargs)

### Heart

https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

In [49]:
## Classification task
is_classification = True

In [50]:
# Load the heart-failure CSV (path is relative to the working directory) and
# print its column / dtype summary.
df = pd.read_csv("heart.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [51]:
# Inspect the distinct values of the target column.
df["HeartDisease"].unique()

array([0, 1])

In [52]:
## Set manually: number of model outputs (HeartDisease takes two values, 0 and 1)
out_channels = 2

In [53]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

## Semantic type (stype) per column.
## The object-dtype columns are categorical; every int / float column,
## including the 'HeartDisease' target, is numerical.
categorical_columns = {
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
}
# Columns are listed explicitly so the mapping keeps a fixed insertion order.
ordered_columns = [
    "Age",
    "Sex",
    "ChestPainType",
    "RestingBP",
    "Cholesterol",
    "FastingBS",
    "RestingECG",
    "MaxHR",
    "ExerciseAngina",
    "Oldpeak",
    "ST_Slope",
    "HeartDisease",  # prediction target
]
col_to_stype = {
    column: categorical if column in categorical_columns else numerical
    for column in ordered_columns
}

dataset = Dataset(
    df=df,
    col_to_stype=col_to_stype,
    target_col='HeartDisease',
)

dataset.materialize()

## Split by fractional position: [0, 0.6) train, [0.6, 0.7) validation,
## [0.7, 1] test.
## NOTE(review): no shuffle is applied before slicing - confirm this is intended.
train_dataset = dataset[:0.6]
val_dataset = dataset[0.6:0.7]
test_dataset = dataset[0.7:]

In [54]:
# Pull the materialized TensorFrame out of each split, then wrap each one in a
# DataLoader. Only the training loader shuffles.
train_tensor_frame, val_tensor_frame, test_tensor_frame = (
    split.tensor_frame
    for split in (train_dataset, val_dataset, test_dataset)
)
loader_kwargs = dict(batch_size=args.batch_size)
train_loader = DataLoader(train_tensor_frame, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_tensor_frame, **loader_kwargs)
test_loader = DataLoader(test_tensor_frame, **loader_kwargs)

## Model

In [64]:
# Build the TabNet model from the parsed hyperparameters and from whichever
# dataset section was run last (out_channels, dataset, train_tensor_frame).
tabnet_kwargs = dict(
    num_layers=args.num_layers,
    split_attn_channels=args.channels,
    split_feat_channels=args.channels,
    gamma=args.gamma,
    col_stats=dataset.col_stats,
    col_names_dict=train_tensor_frame.col_names_dict,
)
model = TabNet(out_channels, **tabnet_kwargs).to(device)

# Optionally wrap the model with torch.compile (only when --compile was given).
if args.compile:
    model = torch.compile(model, dynamic=True)

# Adam optimizer with an exponential learning-rate schedule (decay factor 0.95).
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
lr_scheduler = ExponentialLR(optimizer, gamma=0.95)

In [65]:
def train(epoch: int) -> float:
    """Run one optimization pass over the global ``train_loader``.

    Args:
        epoch: Epoch number; used only for the progress-bar label.

    Returns:
        The cross-entropy loss averaged over all training samples seen
        (each batch loss is weighted by its number of targets).
    """
    model.train()
    running_loss = 0
    num_seen = 0

    progress = tqdm(train_loader, desc=f'Epoch: {epoch}')
    for batch in progress:
        batch = batch.to(device)
        logits = model(batch)
        # Targets are cast to long because cross_entropy takes class indices.
        loss = F.cross_entropy(logits, batch.y.long())
        optimizer.zero_grad()
        loss.backward()
        num_targets = len(batch.y)
        running_loss += float(loss) * num_targets
        num_seen += num_targets
        optimizer.step()

    return running_loss / num_seen


@torch.no_grad()
def test(loader: DataLoader) -> float:
    """Compute classification accuracy of the global ``model`` on ``loader``.

    Args:
        loader: DataLoader yielding batches that expose a ``y`` target.

    Returns:
        Fraction of samples whose argmax prediction equals the target.
    """
    model.eval()
    num_correct = 0
    num_seen = 0

    for batch in loader:
        batch = batch.to(device)
        # Predicted class = index of the largest output along the last dim.
        predicted = model(batch).argmax(dim=-1)
        num_correct += float((batch.y == predicted).sum())
        num_seen += len(batch.y)

    return num_correct / num_seen

In [66]:
# Train for args.epochs epochs. After each epoch, evaluate on all three splits
# and remember the test accuracy from the epoch with the best validation
# accuracy (the first such epoch on ties, since the comparison is strict).
best_val_acc = 0
best_test_acc = 0
for epoch in range(1, args.epochs + 1):
    train_loss = train(epoch)
    train_acc, val_acc, test_acc = (
        test(loader) for loader in (train_loader, val_loader, test_loader)
    )
    if val_acc > best_val_acc:
        best_val_acc, best_test_acc = val_acc, test_acc
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')
    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()

print(f'Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}')

Epoch: 1: 100%|██████████| 2/2 [00:00<00:00, 14.68it/s]


Train Loss: 0.6906, Train Acc: 0.5689, Val Acc: 0.7544, Test Acc: 0.7953


Epoch: 2: 100%|██████████| 2/2 [00:00<00:00, 13.15it/s]


Train Loss: 0.6845, Train Acc: 0.6422, Val Acc: 0.8070, Test Acc: 0.8304


Epoch: 3: 100%|██████████| 2/2 [00:00<00:00, 17.08it/s]


Train Loss: 0.6797, Train Acc: 0.7038, Val Acc: 0.8596, Test Acc: 0.8889


Epoch: 4: 100%|██████████| 2/2 [00:00<00:00, 13.07it/s]


Train Loss: 0.6742, Train Acc: 0.7449, Val Acc: 0.9649, Test Acc: 0.9181


Epoch: 5: 100%|██████████| 2/2 [00:00<00:00, 14.90it/s]


Train Loss: 0.6687, Train Acc: 0.7713, Val Acc: 0.9649, Test Acc: 0.9298


Epoch: 6: 100%|██████████| 2/2 [00:00<00:00, 14.27it/s]


Train Loss: 0.6639, Train Acc: 0.8065, Val Acc: 0.9649, Test Acc: 0.9415


Epoch: 7: 100%|██████████| 2/2 [00:00<00:00, 12.92it/s]


Train Loss: 0.6590, Train Acc: 0.8299, Val Acc: 0.9825, Test Acc: 0.9474


Epoch: 8: 100%|██████████| 2/2 [00:00<00:00, 13.67it/s]


Train Loss: 0.6535, Train Acc: 0.8475, Val Acc: 0.9825, Test Acc: 0.9591


Epoch: 9: 100%|██████████| 2/2 [00:00<00:00, 13.57it/s]


Train Loss: 0.6477, Train Acc: 0.8592, Val Acc: 0.9825, Test Acc: 0.9649


Epoch: 10: 100%|██████████| 2/2 [00:00<00:00, 15.92it/s]


Train Loss: 0.6433, Train Acc: 0.8710, Val Acc: 0.9825, Test Acc: 0.9649


Epoch: 11: 100%|██████████| 2/2 [00:00<00:00, 14.07it/s]


Train Loss: 0.6382, Train Acc: 0.8798, Val Acc: 0.9825, Test Acc: 0.9649


Epoch: 12: 100%|██████████| 2/2 [00:00<00:00, 16.46it/s]


Train Loss: 0.6328, Train Acc: 0.8856, Val Acc: 0.9825, Test Acc: 0.9649


Epoch: 13: 100%|██████████| 2/2 [00:00<00:00, 14.50it/s]


Train Loss: 0.6269, Train Acc: 0.8974, Val Acc: 0.9825, Test Acc: 0.9708


Epoch: 14: 100%|██████████| 2/2 [00:00<00:00, 14.52it/s]


Train Loss: 0.6235, Train Acc: 0.9003, Val Acc: 0.9825, Test Acc: 0.9649


Epoch: 15: 100%|██████████| 2/2 [00:00<00:00, 14.22it/s]


Train Loss: 0.6175, Train Acc: 0.9062, Val Acc: 0.9825, Test Acc: 0.9708
Best Val Acc: 0.9825, Best Test Acc: 0.9474
