https://github.com/pyg-team/pytorch-frame/blob/master/benchmark/data_frame_benchmark.py

In [26]:
pwd

'c:\\Users\\cleo7\\OneDrive\\바탕 화면\\LAB\\PyTorch_Frame\\pytorch-frame-0.2.4'

In [1]:
import argparse
import math
import os
import os.path as osp
import time
from typing import Any, Optional

import pandas as pd
import numpy as np
import optuna
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module, MSELoss
from torch.optim.lr_scheduler import ExponentialLR
from torchmetrics import AUROC, Accuracy, MeanSquaredError
import torch.nn.functional as F
from tqdm import tqdm

from torch_frame import stype
from torch_frame.data import Dataset, DataLoader
from torch_frame.datasets import DataFrameBenchmark
from torch_frame.gbdt import CatBoost, LightGBM, XGBoost
from torch_frame.nn import (
    EmbeddingEncoder,
    FTTransformer,
    LinearBucketEncoder,
    LinearEncoder,
    LinearPeriodicEncoder,
    ResNet,
)
from torch_frame.nn.models import (
    MLP,
    ExcelFormer,
    FTTransformer,
    ResNet,
    TabNet,
    TabTransformer,
    Trompt,
)
from torch_frame.typing import TaskType

# Use GPU for faster training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='adult')
parser.add_argument('--numerical_encoder_type', type=str, default='linear',
                    choices=['linear', 'linearbucket', 'linearperiodic'])
parser.add_argument('--model_type', type=str, default='fttransformer',
                    choices=['fttransformer', 'resnet'])
parser.add_argument('--channels', type=int, default=256)
parser.add_argument('--num_layers', type=int, default=4)
parser.add_argument('--batch_size', type=int, default=512)
parser.add_argument('--lr', type=float, default=0.0001)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--compile', action='store_true')


_StoreTrueAction(option_strings=['--compile'], dest='compile', nargs=0, const=True, default=False, type=None, choices=None, required=False, help=None, metavar=None)

In [3]:
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

class TextToEmbedding:
    def __init__(self, model: str, device: torch.device):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model).to(device)
        self.device = device

    def __call__(self, sentences: list[str]) -> Tensor:
        inputs = self.tokenizer(
            sentences,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        for key in inputs:
            if isinstance(inputs[key], Tensor):
                inputs[key] = inputs[key].to(self.device)
        out = self.model(**inputs)
        mask = inputs["attention_mask"]
        return out.last_hidden_state[:, 0, :].detach().cpu()

In [75]:
# Jupyter에서 실행될 때는 sys.argv를 조정
args = parser.parse_args([
    #'--dataset', 'adult',
    '--numerical_encoder_type', 'linear',
    '--model_type', 'resnet',       # fttransformer : FT-T / resnet : ResNet
    '--channels', '256',
    '--num_layers', '4',
    '--batch_size', '256',  # 데이터를 256개씩 한번에 
    '--lr', '0.0001',
    '--epochs', '15',
    '--seed', '0'
])

## 데이터 불러오기 ( => 이 부분만 조작)

- object => categorical
- int / float => numerical

### Alzheimers_Prediction

In [5]:
df = pd.read_csv("alzheimers_prediction_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  object 
 11  Family History 

In [6]:
df["Alzheimer’s Diagnosis"].unique()    # 진단받았는지 여부

array(['No', 'Yes'], dtype=object)

- object => categorical
- int / float => numerical

In [7]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

text_encoder = TextToEmbedding(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                                device=device)

## 칼럼 별 Dtype 지정 
col_to_stype={"Country" : categorical,
              "Age" : numerical,
              "Gender" : categorical,
              "Education Level" : numerical,
              "BMI" : numerical,
              "Physical Activity Level" : categorical,
              "Smoking Status" : categorical,
              "Alcohol Consumption" : categorical,
              "Diabetes" : categorical,
              "Hypertension" : categorical,
              "Cholesterol Level" : categorical,
              "Family History of Alzheimer’s" : categorical,
              "Cognitive Test Score" : numerical,
              "Depression Level" : categorical,
              "Sleep Quality" : categorical,
              "Dietary Habits" : categorical,
              "Air Pollution Exposure" : categorical,
              "Employment Status" : categorical,
              "Marital Status" : categorical,
              "Genetic Risk Factor (APOE-ε4 allele)" : categorical,
              "Social Engagement Level" : categorical,
              "Income Level" : categorical,
              "Stress Levels" : categorical,
              "Urban vs Rural Living" : categorical,
              "Alzheimer’s Diagnosis" : categorical
              }

dataset = Dataset(df = df, 
                  col_to_stype = col_to_stype, 
                  target_col = "Alzheimer’s Diagnosis",
                  col_to_text_embedder_cfg = TextEmbedderConfig(text_embedder=text_encoder, batch_size=32))

dataset.materialize()

Dataset()

이 코드로 하니까 되네??? <br>
DataFrame 형식이 아니라 이제 materialize가 됨. <br>
나중에 더 자세히 알아보쟈 

In [8]:
## split
train_dataset, val_dataset, test_dataset = dataset[:0.6], dataset[0.6:0.7], dataset[0.7:]

In [9]:
# 분류 task
is_classification = True

In [10]:
# Set up data loaders
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size,
                          shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

if args.numerical_encoder_type == 'linear':
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == 'linearbucket':
    numerical_encoder = LinearBucketEncoder()
elif args.numerical_encoder_type == 'linearperiodic':
    numerical_encoder = LinearPeriodicEncoder()
else:
    raise ValueError(
        f'Unsupported encoder type: {args.numerical_encoder_type}')

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    #output_channels = dataset.num_classes    ->   contains StatType.COUNT을 포함하지 않아서 오류(?)
    output_channels = 2 # 그냥 수동으로 설정.,,,,   => 분류 칼럼 unique 개수로 설정 
else:
    output_channels = 1

In [11]:
print(train_tensor_frame.col_names_dict)

{<stype.categorical: 'categorical'>: ['Air Pollution Exposure', 'Alcohol Consumption', 'Cholesterol Level', 'Country', 'Depression Level', 'Diabetes', 'Dietary Habits', 'Employment Status', 'Family History of Alzheimer’s', 'Gender', 'Genetic Risk Factor (APOE-ε4 allele)', 'Hypertension', 'Income Level', 'Marital Status', 'Physical Activity Level', 'Sleep Quality', 'Smoking Status', 'Social Engagement Level', 'Stress Levels', 'Urban vs Rural Living'], <stype.numerical: 'numerical'>: ['Age', 'BMI', 'Cognitive Test Score', 'Education Level']}


### Heart Disease Risk

In [12]:
# 분류 task
is_classification = True

In [13]:
df = pd.read_csv("heart_disease_risk_dataset_earlymed.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Chest_Pain           70000 non-null  float64
 1   Shortness_of_Breath  70000 non-null  float64
 2   Fatigue              70000 non-null  float64
 3   Palpitations         70000 non-null  float64
 4   Dizziness            70000 non-null  float64
 5   Swelling             70000 non-null  float64
 6   Pain_Arms_Jaw_Back   70000 non-null  float64
 7   Cold_Sweats_Nausea   70000 non-null  float64
 8   High_BP              70000 non-null  float64
 9   High_Cholesterol     70000 non-null  float64
 10  Diabetes             70000 non-null  float64
 11  Smoking              70000 non-null  float64
 12  Obesity              70000 non-null  float64
 13  Sedentary_Lifestyle  70000 non-null  float64
 14  Family_History       70000 non-null  float64
 15  Chronic_Stress       70000 non-null 

In [14]:
df["Heart_Risk"].unique()

array([0., 1.])

In [15]:
df["Heart_Risk"] = df["Heart_Risk"].astype(int)
df["Heart_Risk"].unique()

array([0, 1])

In [16]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

text_encoder = TextToEmbedding(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                                device=device)

## 칼럼 별 Dtype 지정 
col_to_stype={"Chest_Pain" : numerical,
              "Shortness_of_Breath" : numerical,
              "Fatigue" : numerical,
              "Palpitations" : numerical,
              "Dizziness" : numerical,
              "Swelling" : numerical,
              "Pain_Arms_Jaw_Back" : numerical,
              "Cold_Sweats_Nausea" : numerical,
              "High_BP" : numerical,
              "High_Cholesterol" : numerical,
              "Diabetes" : numerical,
              "Smoking" : numerical,
              "Obesity" : numerical,
              "Sedentary_Lifestyle" : numerical,
              "Family_History" : numerical,
              "Chronic_Stress" : numerical,
              "Gender" : numerical,
              "Age" : numerical,
              "Heart_Risk" : numerical}

dataset = Dataset(df=df, 
                  col_to_stype=col_to_stype, 
                  target_col='Heart_Risk',
                  col_to_text_embedder_cfg=TextEmbedderConfig(text_embedder=text_encoder, batch_size=32))

dataset.materialize()

## split
train_dataset, val_dataset, test_dataset = dataset[:0.6], dataset[0.6:0.7], dataset[0.7:]

In [17]:
# Set up data loaders
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size,
                          shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

if args.numerical_encoder_type == 'linear':
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == 'linearbucket':
    numerical_encoder = LinearBucketEncoder()
elif args.numerical_encoder_type == 'linearperiodic':
    numerical_encoder = LinearPeriodicEncoder()
else:
    raise ValueError(
        f'Unsupported encoder type: {args.numerical_encoder_type}')

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    #output_channels = dataset.num_classes    ->   contains StatType.COUNT을 포함하지 않아서 오류(?)
    output_channels = 2 # 그냥 수동으로 설정.,,,,
else:
    output_channels = 1

### Thyroid Cancer Risk

In [34]:
# 분류 task
is_classification = True

In [35]:
df = pd.read_csv("thyroid_cancer_risk_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212691 entries, 0 to 212690
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Patient_ID           212691 non-null  int64  
 1   Age                  212691 non-null  int64  
 2   Gender               212691 non-null  object 
 3   Country              212691 non-null  object 
 4   Ethnicity            212691 non-null  object 
 5   Family_History       212691 non-null  object 
 6   Radiation_Exposure   212691 non-null  object 
 7   Iodine_Deficiency    212691 non-null  object 
 8   Smoking              212691 non-null  object 
 9   Obesity              212691 non-null  object 
 10  Diabetes             212691 non-null  object 
 11  TSH_Level            212691 non-null  float64
 12  T3_Level             212691 non-null  float64
 13  T4_Level             212691 non-null  float64
 14  Nodule_Size          212691 non-null  float64
 15  Thyroid_Cancer_Ri

In [36]:
df["Diagnosis"].unique()    # 진단받았는지 여부

array(['Benign', 'Malignant'], dtype=object)

In [37]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

text_encoder = TextToEmbedding(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                                device=device)

## 칼럼 별 Dtype 지정 
col_to_stype={#"Patient_ID" : numerical,
              "Age" : numerical,
              "Gender" : categorical,
              "Country" : categorical,
              "Ethnicity" : categorical,
              "Family_History" : categorical,
              "Radiation_Exposure" : categorical,
              "Iodine_Deficiency" : categorical,
              "Smoking" : categorical,
              "Obesity" : categorical,
              "Diabetes" : categorical,
              "TSH_Level" : numerical,
              "T3_Level" : numerical,
              "T4_Level" : numerical,
              "Nodule_Size" : categorical,
              "Thyroid_Cancer_Risk" : categorical,
              "Diagnosis" : categorical}

dataset = Dataset(df=df, 
                  col_to_stype=col_to_stype, 
                  target_col='Diagnosis',
                  col_to_text_embedder_cfg=TextEmbedderConfig(text_embedder=text_encoder, batch_size=32))

dataset.materialize()

Dataset()

In [38]:
## split
train_dataset, val_dataset, test_dataset = dataset[:0.6], dataset[0.6:0.7], dataset[0.7:]

In [39]:
# Set up data loaders
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size,
                          shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

if args.numerical_encoder_type == 'linear':
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == 'linearbucket':
    numerical_encoder = LinearBucketEncoder()
elif args.numerical_encoder_type == 'linearperiodic':
    numerical_encoder = LinearPeriodicEncoder()
else:
    raise ValueError(
        f'Unsupported encoder type: {args.numerical_encoder_type}')

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    #output_channels = dataset.num_classes    ->   contains StatType.COUNT을 포함하지 않아서 오류(?)
    output_channels = 2 # 그냥 수동으로 설정.,,,,
else:
    output_channels = 1

### Breast Cancer

https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset

In [56]:
## 분류 task
is_classification = True

In [57]:
df = pd.read_csv("breast-cancer.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

특이점 : 예측 칼럼 제외 모두 수치형 변수

In [58]:
df["diagnosis"].unique()    # 이걸 예측하는 것. 

array(['M', 'B'], dtype=object)

- M : Malignant
- B : Benign

In [59]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

## 칼럼 별 Dtype 지정 
col_to_stype={"radius_mean" : numerical,
              "texture_mean" : numerical,
              "perimeter_mean" : numerical,
              "area_mean" :numerical,
              "smoothness_mean" : numerical,
              "compactness_mean" : numerical,
              "concavity_mean" : numerical,
              "concave points_mean" : numerical,
              "symmetry_mean" : numerical,
              "fractal_dimension_mean" : numerical,
              "radius_se" : numerical,
              "texture_se" : numerical,
              "perimeter_se" : numerical,
              "area_se" : numerical,
              "smoothness_se" : numerical,
              "compactness_se" : numerical,
              "concavity_se" : numerical,
              "concave points_se" : numerical,
              "symmetry_se" : numerical,
              "fractal_dimension_se" : numerical,
              "radius_worst" : numerical,
              "texture_worst" : numerical,
              "perimeter_worst" : numerical,
              "area_worst" : numerical,
              "smoothness_worst" : numerical,
              "compactness_worst" : numerical,
              "concavity_worst" : numerical,
              "concave points_worst" : numerical,
              "symmetry_worst" : numerical,
              "fractal_dimension_worst" : numerical,
              "diagnosis" : categorical}

dataset = Dataset(df=df, 
                  col_to_stype=col_to_stype, 
                  target_col='diagnosis')

dataset.materialize()

## split
train_dataset, val_dataset, test_dataset = dataset[:0.6], dataset[0.6:0.7], dataset[0.7:]

In [60]:
# Set up data loaders
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size,
                          shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

if args.numerical_encoder_type == 'linear':
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == 'linearbucket':
    numerical_encoder = LinearBucketEncoder()
elif args.numerical_encoder_type == 'linearperiodic':
    numerical_encoder = LinearPeriodicEncoder()
else:
    raise ValueError(
        f'Unsupported encoder type: {args.numerical_encoder_type}')

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    #output_channels = dataset.num_classes    ->   contains StatType.COUNT을 포함하지 않아서 오류(?)
    output_channels = 2 # 그냥 수동으로 설정.,,,,
else:
    output_channels = 1

### Heart

https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

In [65]:
## 분류 task
is_classification = True

In [66]:
df = pd.read_csv("heart.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [67]:
df["HeartDisease"].unique()

array([0, 1])

In [69]:
from torch_frame import numerical, categorical, text_embedded, embedding
from torch_frame.config.text_embedder import TextEmbedderConfig

## 칼럼 별 Dtype 지정 
col_to_stype={"Age" : numerical,
              "Sex" : categorical,
              "ChestPainType" : categorical,
              "RestingBP" : numerical,
              "Cholesterol" : numerical,
              "FastingBS" : numerical,
              "RestingECG" : categorical,
              "MaxHR" : numerical,
              "ExerciseAngina" : categorical,
              "Oldpeak" : numerical,
              "ST_Slope" : categorical,
              "HeartDisease" : numerical}

dataset = Dataset(df=df, 
                  col_to_stype=col_to_stype, 
                  target_col='HeartDisease')

dataset.materialize()

## split
train_dataset, val_dataset, test_dataset = dataset[:0.6], dataset[0.6:0.7], dataset[0.7:]

In [70]:
# Set up data loaders
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size,
                          shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

if args.numerical_encoder_type == 'linear':
    numerical_encoder = LinearEncoder()
elif args.numerical_encoder_type == 'linearbucket':
    numerical_encoder = LinearBucketEncoder()
elif args.numerical_encoder_type == 'linearperiodic':
    numerical_encoder = LinearPeriodicEncoder()
else:
    raise ValueError(
        f'Unsupported encoder type: {args.numerical_encoder_type}')

stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: numerical_encoder,
}

if is_classification:
    #output_channels = dataset.num_classes    ->   contains StatType.COUNT을 포함하지 않아서 오류(?)
    output_channels = 2 # 그냥 수동으로 설정.,,,,
else:
    output_channels = 1

# Model

In [76]:
if args.model_type == 'fttransformer':
    model = FTTransformer(
        channels=args.channels,
        out_channels=output_channels,
        num_layers=args.num_layers,
        col_stats=dataset.col_stats,    # TensorFrame이 col_stats를 안받아서 transor frame으로 바꾸기 전으로 받아와야 함 
        col_names_dict=train_tensor_frame.col_names_dict,
        stype_encoder_dict=stype_encoder_dict,
    ).to(device)

elif args.model_type == 'resnet':
    model = ResNet(
        channels=args.channels,
        out_channels=output_channels,
        num_layers=args.num_layers,
        col_stats=dataset.col_stats,
        col_names_dict=train_tensor_frame.col_names_dict,
    ).to(device)
else:
    raise ValueError(f'Unsupported model type: {args.model_type}')

model = torch.compile(model, dynamic=True) if args.compile else model
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

In [77]:
def train(epoch: int) -> float:
    model.train()
    loss_accum = total_count = 0

    for tf in tqdm(train_loader, desc=f'Epoch: {epoch}'):
        tf = tf.to(device)
        pred = model(tf)
        if is_classification:
            loss = F.cross_entropy(pred, tf.y.long())
        else:
            loss = F.mse_loss(pred.view(-1), tf.y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        loss_accum += float(loss) * len(tf.y)
        total_count += len(tf.y)
        optimizer.step()
    return loss_accum / total_count

PyTorch에서 cross_entropy 함수는 타겟 레이블(target)이 torch.LongTensor(정수형)이어야 함. 따라서 tf.y를 long() 타입으로 변환

In [78]:
@torch.no_grad()
def test(loader: DataLoader) -> float:
    model.eval()
    accum = total_count = 0

    for tf in loader:
        tf = tf.to(device)
        pred = model(tf)
        if is_classification:
            pred_class = pred.argmax(dim=-1)
            accum += float((tf.y == pred_class).sum())
        else:
            accum += float(
                F.mse_loss(pred.view(-1), tf.y.view(-1), reduction='sum'))
        total_count += len(tf.y)

    if is_classification:
        accuracy = accum / total_count
        return accuracy
    else:
        rmse = (accum / total_count)**0.5
        return rmse


In [79]:
# 훈련 및 테스트

if is_classification:
    metric = 'Acc'
    best_val_metric = 0
    best_test_metric = 0
else:
    metric = 'RMSE'
    best_val_metric = float('inf')
    best_test_metric = float('inf')

for epoch in range(1, args.epochs + 1):
    train_loss = train(epoch)
    train_metric = test(train_loader)
    val_metric = test(val_loader)
    test_metric = test(test_loader)

    if is_classification and val_metric > best_val_metric:
        best_val_metric = val_metric
        best_test_metric = test_metric
    elif not is_classification and val_metric < best_val_metric:
        best_val_metric = val_metric
        best_test_metric = test_metric

    print(f'Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, '
          f'Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}')

print(f'Best Val {metric}: {best_val_metric:.4f}, '
      f'Best Test {metric}: {best_test_metric:.4f}')


Epoch: 1: 100%|██████████| 3/3 [00:00<00:00, 37.57it/s]


Train Loss: 0.6165, Train Acc: 0.8893, Val Acc: 0.8370, Test Acc: 0.7855


Epoch: 2: 100%|██████████| 3/3 [00:00<00:00, 42.19it/s]


Train Loss: 0.3648, Train Acc: 0.8893, Val Acc: 0.8478, Test Acc: 0.7818


Epoch: 3: 100%|██████████| 3/3 [00:00<00:00, 41.07it/s]


Train Loss: 0.3198, Train Acc: 0.8911, Val Acc: 0.8152, Test Acc: 0.7818


Epoch: 4: 100%|██████████| 3/3 [00:00<00:00, 43.82it/s]


Train Loss: 0.2949, Train Acc: 0.8929, Val Acc: 0.8152, Test Acc: 0.7564


Epoch: 5: 100%|██████████| 3/3 [00:00<00:00, 46.63it/s]


Train Loss: 0.2881, Train Acc: 0.8911, Val Acc: 0.8370, Test Acc: 0.7818


Epoch: 6: 100%|██████████| 3/3 [00:00<00:00, 47.68it/s]


Train Loss: 0.2858, Train Acc: 0.8911, Val Acc: 0.8478, Test Acc: 0.7818


Epoch: 7: 100%|██████████| 3/3 [00:00<00:00, 45.16it/s]


Train Loss: 0.2831, Train Acc: 0.8929, Val Acc: 0.8370, Test Acc: 0.7527


Epoch: 8: 100%|██████████| 3/3 [00:00<00:00, 41.55it/s]

Train Loss: 0.2855, Train Acc: 0.9056, Val Acc: 0.8370, Test Acc: 0.7418



Epoch: 9: 100%|██████████| 3/3 [00:00<00:00, 37.97it/s]


Train Loss: 0.2784, Train Acc: 0.8966, Val Acc: 0.8370, Test Acc: 0.7491


Epoch: 10: 100%|██████████| 3/3 [00:00<00:00, 27.66it/s]


Train Loss: 0.2768, Train Acc: 0.8947, Val Acc: 0.8370, Test Acc: 0.7418


Epoch: 11: 100%|██████████| 3/3 [00:00<00:00, 48.18it/s]


Train Loss: 0.2650, Train Acc: 0.8947, Val Acc: 0.8261, Test Acc: 0.7418


Epoch: 12: 100%|██████████| 3/3 [00:00<00:00, 35.12it/s]

Train Loss: 0.2742, Train Acc: 0.8966, Val Acc: 0.8261, Test Acc: 0.7418



Epoch: 13: 100%|██████████| 3/3 [00:00<00:00, 40.28it/s]


Train Loss: 0.2653, Train Acc: 0.9093, Val Acc: 0.8261, Test Acc: 0.7418


Epoch: 14: 100%|██████████| 3/3 [00:00<00:00, 44.47it/s]


Train Loss: 0.2722, Train Acc: 0.9093, Val Acc: 0.8370, Test Acc: 0.7491


Epoch: 15: 100%|██████████| 3/3 [00:00<00:00, 37.76it/s]

Train Loss: 0.2552, Train Acc: 0.8929, Val Acc: 0.8478, Test Acc: 0.7527
Best Val Acc: 0.8478, Best Test Acc: 0.7818



