<a href="https://colab.research.google.com/github/sungjk1999/smart-factory-quality-classification/blob/main/tabnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pytorch-tabnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import random
import os

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Stored as a string because environment values must be str.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The model trained in this notebook is a torch model, so torch's
    # generators have to be fixed as well for runs to be repeatable.
    torch.manual_seed(seed)
    # Silently ignored by torch when CUDA is not available.
    torch.cuda.manual_seed_all(seed)


seed_everything(37)  # fix the seed

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

"""
PRODUCT_ID : 제품의 고유 ID

Y_Class : 제품 품질 상태(Target) 
    0 : 적정 기준 미달 (부적합) 
    1 : 적합 
    2 : 적정 기준 초과 (부적합)

Y_Quality : 제품 품질 관련 정량적 수치 

TIMESTAMP : 제품이 공정에 들어간 시각 

LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재) 

PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재) 

X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수
"""

# Folder on the mounted Drive that holds the competition CSV files.
dir_path = "/content/drive/MyDrive/Colab Notebooks/open"

# Read the training and test tables in one pass over their paths.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (f"{dir_path}/train.csv", f"{dir_path}/test.csv")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Separate the class target from the model inputs; Y_Quality is dropped too
# so it cannot be used as a feature.
train_y = train_df['Y_Class']
train = train_df.drop(columns=['Y_Class', 'Y_Quality'])

# Columns that are always excluded from the feature set.
invalid = ['PRODUCT_ID', 'TIMESTAMP']

# Additionally exclude every described column whose standard deviation in the
# training data is zero or NaN.
tmp = train.describe()
std_row = tmp.loc['std']
invalid.extend(std_row[(std_row == 0) | std_row.isna()].index)

# Apply the same column filter to both frames, then replace missing values
# with 0.
test = test_df.drop(columns=invalid).fillna(0)
train = train.drop(columns=invalid).fillna(0)

In [None]:
# Tag every row as "train" (probability 0.8) or "valid" (probability 0.2).
# The tag is only drawn when the column is absent, so re-running this cell
# keeps the existing split.
if "Set" not in train.columns:
    split_labels = np.random.choice(["train", "valid"], p=[.8, .2], size=(len(train),))
    train["Set"] = split_labels

# Index labels of the rows belonging to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index

In [None]:
nunique = train.nunique()
types = train.dtypes

# Names of the label-encoded columns and the number of classes in each.
categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # Only object-dtype columns are encoded; "Set" is the split marker,
    # not a feature, so it is left as-is.
    if col == 'Set' or types[col] != 'object':
        continue
    print(col, nunique[col])
    # Fit on the training values, then reuse the same mapping for test.
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    test[col] = l_enc.transform(test[col].values)
    categorical_dims[col] = len(l_enc.classes_)
    categorical_columns.append(col)

LINE 6
PRODUCT_CODE 3


In [None]:
# Collect the positions and cardinalities of the categorical variables;
# these are passed to the model for categorical embedding.
unused_feat = ['Set']
features = [name for name in train.columns if name not in unused_feat]
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

# Materialise the feature matrix and target vector once, then slice each
# split out of them by row indices.
feature_matrix = train[features].values
target_vector = train_y.values

X_train = feature_matrix[train_indices]
y_train = target_vector[train_indices]

X_valid = feature_matrix[valid_indices]
y_valid = target_vector[valid_indices]


In [None]:
# Sanity check: show the shape of the training split and of both frames.
for shaped in (X_train, y_train, train, test):
    print(shaped.shape)
     

(485, 2429)
(485,)
(598, 2430)
(310, 2429)


In [None]:
# Hyper-parameters for the TabNet classifier, collected in one place.
tabnet_params = dict(
    # Positions and cardinalities of the categorical features.
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    # Adam optimizer with a step learning-rate schedule.
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size":50,
                      "gamma":0.9},
    mask_type='sparsemax',  # alternatives: "sparsemax", "entmax"
)

clf = TabNetClassifier(**tabnet_params)



In [None]:
# Print the name of every column of `train` that still contains a null.
has_null = train.isnull().any()
for col in train.columns:
    if has_null[col]:
        print(col)

In [None]:
# Upper bound on the number of epochs; `patience` below is expected to stop
# training far earlier.
max_epochs = 150000

# Both splits are evaluated every epoch and reported under these names.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]
eval_names = ['train', 'valid']

clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=eval_sets,
    eval_name=eval_names,
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    # NOTE(review): per the pytorch-tabnet docs, weights=1 enables automatic
    # class-balanced sampling; confirm for the installed version.
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 1.6508  | train_accuracy: 0.29072 | valid_accuracy: 0.28319 |  0:00:01s
epoch 1  | loss: 1.46349 | train_accuracy: 0.30928 | valid_accuracy: 0.25664 |  0:00:03s
epoch 2  | loss: 1.53743 | train_accuracy: 0.35258 | valid_accuracy: 0.30088 |  0:00:05s
epoch 3  | loss: 1.4614  | train_accuracy: 0.25567 | valid_accuracy: 0.16814 |  0:00:08s
epoch 4  | loss: 1.35312 | train_accuracy: 0.30515 | valid_accuracy: 0.32743 |  0:00:11s
epoch 5  | loss: 1.40414 | train_accuracy: 0.22474 | valid_accuracy: 0.23894 |  0:00:13s
epoch 6  | loss: 1.3549  | train_accuracy: 0.37732 | valid_accuracy: 0.42478 |  0:00:14s
epoch 7  | loss: 1.33016 | train_accuracy: 0.4701  | valid_accuracy: 0.59292 |  0:00:16s
epoch 8  | loss: 1.26053 | train_accuracy: 0.56289 | valid_accuracy: 0.65487 |  0:00:17s
epoch 9  | loss: 1.27274 | train_accuracy: 0.5732  | valid_accuracy: 0.63717 |  0:00:18s
epoch 10 | loss: 1.08711 | train_accuracy: 0.27216 | valid_accuracy: 0.27434 |  0:00:19s
epoch 11 | loss: 1.11



In [None]:
# Select the test columns by name, in the same order as the training
# `features`, so the prediction input is guaranteed to line up with what the
# model was fitted on. A missing column now raises a KeyError instead of
# silently feeding misaligned features to the model.
preds = clf.predict(test[features].values)

In [None]:
# Load the sample submission file; it is used as the template for the output.
submit = pd.read_csv(f'{dir_path}/sample_submission.csv')

In [None]:
# Write the model predictions into the target column of the template.
submit['Y_Class'] = preds

In [None]:
# Save the submission to the current working directory, without the index.
submit.to_csv('./baseline_submission.csv', index=False)