In [1]:
# I used python kernel 3.10.15 + nvidia RTX 3090 + cuda 12.1 in local machine

# install ralated modules
%pip install torch==2.1.1 torchvision==0.16.1   # compatible version of pytorch and torchvision for mamba-ssm 
%pip install causal-conv1d==1.1.1   # causal dpthwise conv 1d  module in CUDA with pytorch
%pip install mamba-ssm  # Mamba block module

Note: you may need to restart the kernel to use updated packages.
Collecting argparse (from buildtools->causal-conv1d==1.1.1)
  Using cached argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)
Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# import modules

import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tqdm

from mamba_ssm import Mamba

import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
import matplotlib.pyplot as plt

import copy
from collections import defaultdict

In [3]:
# Check environments
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)
torch.cuda.is_available()

torch:  2.1 ; cuda:  cu121


True

In [4]:
# Get original data
!mkdir ./datasets
!mkdir ./datasets/adult
!wget -nc https://archive.ics.uci.edu/static/public/2/adult.zip
!unzip -o ./adult.zip -d ./datasets/adult
!cp -rf ./datasets/adult/adult.data ./datasets/adult/data_processed.csv

mkdir: `./datasets' 디렉터리를 만들 수 없습니다: 파일이 있습니다
mkdir: `./datasets/adult' 디렉터리를 만들 수 없습니다: 파일이 있습니다
‘adult.zip’ 파일이 이미 있습니다. 가져오지 않음.

Archive:  ./adult.zip
  inflating: ./datasets/adult/Index  
  inflating: ./datasets/adult/adult.data  
  inflating: ./datasets/adult/adult.names  
  inflating: ./datasets/adult/adult.test  
  inflating: ./datasets/adult/old.adult.names  


In [5]:
# Set Configuration for MambaTab
config={
    'DATASET_NAME':'adult',
    'SEED':15, # random seed 지정
    'BATCH':100,
    'LR':0.0001,
    'EPOCH':100,
    'MAMBA_SSM_DIM':32,  # MAMBA model의 dimension 설정 (d_model: Selective Structured State Machine에 담을 최대 Dimension)
    'device':'cuda'}

In [6]:
# data load and preparing

def read_data(dataset_name):
    data=pd.read_csv('./datasets/'+dataset_name+'/data_processed'+'.csv')
    
    # fill null values
    for col in data.columns: 
        #data[col].fillna(data[col].mode()[0], inplace=True)
        data[col] = data[col].fillna(data[col].mode()[0])

    # categorical encoder: 문자열인 경우 소문자로 통일하고, 숫자로 인코딩 처리
    for c in data.columns:
        if is_string_dtype(data[c]):
            data[c]=data[c].str.lower()
            enc=OrdinalEncoder()
            cur_data=np.array(data[c])
            cur_data=np.reshape(cur_data,(cur_data.shape[0],1))
            data[c] = enc.fit_transform(cur_data)

    # 마지막 column을 lable로 처리
    y_data=data[data.columns[-1]]

    x_data = data.drop(labels = [data.columns[-1]],axis = 1)
    x_data=MinMaxScaler().fit_transform(x_data) # 데이터 스케일링 처리
    x_data,y_data=np.array(x_data),np.array(y_data)
    
    return x_data,y_data

In [7]:
# 같은 조 성준님 클렌징 데이터 사용하기 위한 코드

def load_and_preprocess_data():
     # scikit-learn의 OpenML에서 성인 인구조사 소득 데이터셋 불러오기
    from sklearn.datasets import fetch_openml
    data = fetch_openml(name='adult', version=1, as_frame=True)
    df = data.frame

    # 특성(features)과 목표(target) 변수 분리
    # 'class' 열을 제외한 모든 열을 특성으로, 'class' 열을 목표 변수로 설정 >> X하고 y
    X = df.drop('class', axis=1)
    y = df['class']

    # 열의 데이터 유형 식별
    # 범주형(문자열) 열과 수치형(정수, 실수) 열 구분
    categorical_columns = X.select_dtypes(include=['object']).columns
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

    # 데이터 전처리 단계 생성
    # ColumnTransformer를 사용하여 수치형과 범주형 열에 대해 다른 전처리 적용
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([                                                           # 수치형 열 처리
                ('imputer', SimpleImputer(strategy='median')),                           # 결측값을 중앙값으로 대체
                ('scaler', StandardScaler())                                             # 표준 스케일링 (평균 0, 분산 1로 정규화)
            ]), numerical_columns),

            ('cat', Pipeline([                                                           # 범주형 열 처리
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),   # 결측값을 'missing' 문자열로 대체
                ('onehot', OneHotEncoder(handle_unknown='ignore'))                       # 원핫 인코딩 (범주형 변수를 이진 벡터로 변환)
            ]), categorical_columns)
        ])

    # 특성 데이터 전처리 수행
    # 앞서 정의한 전처리기를 사용하여 데이터 변환
    X_processed = preprocessor.fit_transform(X)

    # 목표 변수 인코딩
    # LabelEncoder를 사용하여 문자열 레이블을 숫자로 변환
    le = LabelEncoder()
    y_processed = le.fit_transform(y)

    # 전처리된 특성, 목표 변수, 그리고 전처리기 반환
    return X_processed, y_processed, preprocessor


In [8]:
# MambaTab Class

class MambaTab(torch.nn.Module):

    def __init__(self,input_features,n_class,intermediate_representation=config['MAMBA_SSM_DIM']):
        super(MambaTab, self).__init__()
        self.linear_layer=torch.nn.Linear(input_features,intermediate_representation)
        self.relu=torch.nn.ReLU()
        self.layer_norm=torch.nn.LayerNorm(intermediate_representation)

        self.mamba=Mamba(d_model=intermediate_representation, d_state=32, d_conv=4, expand=2) # to fine-tuning
        self.output_layer=torch.nn.Linear(intermediate_representation,n_class)
    
    def forward(self, x):
         x=self.linear_layer(x)
         x=self.layer_norm(x)
         x=self.relu(x)
         x=self.mamba(x)
         x=self.output_layer(x)
         return x

In [9]:
# Training function

def train_model(model,config, dataloader):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 1e10
    early_stopping_counter=0

    optimizer=torch.optim.Adam(model.parameters(),lr=config['LR'])  # Optimizer setting: Adam
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['EPOCH'], eta_min=0,verbose=False)
    loss_fn=torch.nn.BCEWithLogitsLoss()    # 이진분류 처리를 위해 BCEWithLogitLoss 함수 사용
  
    for epoch in tqdm.tqdm(range(config['EPOCH'])):
        if early_stopping_counter>=5:
          break
        
        for phase in ['train', 'val']:      
            if phase == 'train':               
                model.train()  
            else:
                model.eval()  
            
            metrics = defaultdict(float)
            epoch_samples = 0
          
            for btch,feed_dict in enumerate(dataloader[phase]):
                inputs=feed_dict[0]
                inputs=inputs.unsqueeze(0)
                labels=feed_dict[1]
                
                inputs = inputs.type(torch.FloatTensor)
                inputs = inputs.to(config['device'])
                labels = labels.type(torch.FloatTensor)
                labels = labels.to(config['device'])
                
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)  
                    outputs=outputs.squeeze()  
                    loss=loss_fn(outputs,labels)
                    metrics['loss']+=loss.item()
                
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()  
                
                epoch_samples += 1 
           
            epoch_loss = metrics['loss'] / epoch_samples

            if phase == 'val':
           
                if epoch_loss<best_loss:
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_loss=epoch_loss
                    early_stopping_counter=0
                else:
                    early_stopping_counter+=1

        scheduler.step()           
    
    model.load_state_dict(best_model_wts)       
    
    return model

In [10]:
# Test and evaluation

def test_result(test_model, test_dataloader):

  test_model.eval()
  all_test_output_probas=[]

  all_test_labels=[]
  sig=torch.nn.Sigmoid()  # 이진분류 처리를 위해 BCEWithLogitLoss 함수와 함께 Sigmoid 사용

  for inputs,labels in test_dataloader['test']:
    inputs = inputs.unsqueeze(0)
    inputs = inputs.type(torch.FloatTensor)
    inputs = inputs.to(config['device'])
    
    labels = labels.to(config['device'])

    with torch.set_grad_enabled(False):
      outputs = test_model(inputs)
      outputs=outputs.squeeze()
      outputs=sig(outputs)         
      outputs=outputs.cpu().detach().numpy()
      labels=labels.cpu().detach().numpy()

      for i in range(outputs.shape[0]):
         all_test_labels.append(labels[i])
         all_test_output_probas.append(outputs[i])
         
  performance_value=roc_auc_score(all_test_labels,all_test_output_probas)
  print("AUROC score: ",performance_value)

  return all_test_labels, all_test_output_probas


In [11]:
# Data loading and data split

# 오리지널 데이터 사용
x_data,y_data=read_data(dataset_name=config['DATASET_NAME'])

x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.2,random_state=config['SEED'],stratify=y_data,shuffle=True)
val_size=int(len(y_data)*0.1)
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,test_size=val_size,random_state=config['SEED'],stratify=y_train, shuffle=True)

print("Train:",x_train.shape)
print("Val:",x_val.shape)
print("Test:",x_test.shape)

Train: (22792, 14)
Val: (3256, 14)
Test: (6512, 14)


In [12]:
# Data Lodader wrapper

class TabularDataLoader(Dataset):

    def __init__(self,length,data_type):
        self.length=length
        self.data_type=data_type

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if self.data_type=='train':
            return x_train[idx],y_train[idx]
        if self.data_type=='val':
            return x_val[idx],y_val[idx]
        if self.data_type=='test':
           return x_test[idx],y_test[idx]

In [13]:
# Preparing Dataloaders
train_set = TabularDataLoader(length=x_train.shape[0],data_type='train')
val_set = TabularDataLoader(length=x_val.shape[0],data_type='val')
test_set = TabularDataLoader(length=x_test.shape[0],data_type='test')

dataloader = {
      'train': DataLoader(train_set, batch_size=config['BATCH'], shuffle=True, num_workers=0),
      'val': DataLoader(val_set, batch_size=config['BATCH'], shuffle=False, num_workers=0),
      'test': DataLoader(test_set, batch_size=config['BATCH'], shuffle=False, num_workers=0)
   }

# Get the model: "n_class=1 is to use a single output logit strategy,  where n_class does not refer to the number of classes and is sufficient for binary classification"
model=MambaTab(input_features=x_train.shape[1], n_class=1)
model=model.to(config['device'])


In [14]:
#Train-validate the model
model=train_model(model,config, dataloader)

# Get test set performance
test_labels, test_probas = test_result(model, dataloader)
print("----------------Complete of original data----------------")

 73%|███████▎  | 73/100 [00:37<00:13,  1.96it/s]

AUROC score:  0.9003590210355986
----------------Complete of original data----------------





In [15]:
# 같은 조 공통 데이터 (성준님 클렌징 데이터) 사용

x_data_c, y_data_c, _ = load_and_preprocess_data()

x_train_c, x_test_c, y_train_c, y_test_c = train_test_split(x_data_c, y_data_c, test_size = 0.2, random_state = config['SEED'], stratify = y_data_c, shuffle=True)
val_size_c = int(len(y_data_c)*0.1)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(x_train_c, y_train_c, test_size = val_size_c, random_state = config['SEED'], stratify = y_train_c, shuffle=True)

In [16]:
# Data Lodader wrapper

class TabularDataLoader_c(Dataset):

    def __init__(self,length,data_type):
        self.length=length
        self.data_type=data_type

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if self.data_type=='train':
            return x_train_c[idx],y_train_c[idx]
        if self.data_type=='val':
            return x_val_c[idx],y_val_c[idx]
        if self.data_type=='test':
           return x_test_c[idx],y_test_c[idx]

In [17]:
train_set_c = TabularDataLoader_c(length = x_train_c.shape[0], data_type='train')
val_set_c = TabularDataLoader_c(length = x_val_c.shape[0], data_type='val')
test_set_c = TabularDataLoader_c(length = x_test_c.shape[0], data_type='test')

print("Train_cleansing:", x_train_c.shape)
print("Val_cleansing:", x_val_c.shape)
print("Test_cleansing:", x_test_c.shape)

dataloader_c = {
      'train': DataLoader(train_set_c, batch_size=config['BATCH'], shuffle=True, num_workers=0),
      'val': DataLoader(val_set_c, batch_size=config['BATCH'], shuffle=False, num_workers=0),
      'test': DataLoader(test_set_c, batch_size=config['BATCH'], shuffle=False, num_workers=0)
   }

model_c = MambaTab(input_features = x_train_c.shape[1], n_class=1)
model_c = model_c.to(config['device'])

model_c = train_model(model_c, config, dataloader_c)

# Get test set performance
test_labels_c, test_probas_c = test_result(model_c, dataloader_c)
print("----------------Complete of cleansing data----------------")

Train_cleansing: (34189, 2)
Val_cleansing: (4884, 2)
Test_cleansing: (9769, 2)


 34%|███▍      | 34/100 [00:29<00:57,  1.15it/s]

AUROC score:  0.7091062986202461
----------------Complete of cleansing data----------------



