#### train.ipynb 역할

train.ipynb은 아래와 같은 기능이 코드로 구현되어 있습니다.
1. requirements : 필요 라이브러리
2. arguments : 필요 설정 값
3. data load : 학습용 데이터 로드
4. train/validset define : 학습/검증용 데이터셋 정의
5. model define & training : AI 모델 정의 및 학습
6. model evaulation : validset을 대상으로 AI 모델 성능 평가
7. model save : 최적의 모델 저장

### requirements

In [56]:
import os
import argparse
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score

### arguments

In [57]:
parser = argparse.ArgumentParser(description='classification using wine data')

# base
parser.add_argument('--random_state', type=int, default=0, help='랜덤 시드값')

# data
parser.add_argument('--data_dir', type=str, default='./dataset', help='데이터 경로')
parser.add_argument('--file_name', type=str, default='train', help='데이터 파일 이름')
parser.add_argument('--target', type=str, default='quality', help='타겟 컬럼명')

# train setting
parser.add_argument('--train_ratio', type=float, default=0.6, help='전체 데이터셋 중 학습 데이터셋 비율')
parser.add_argument('--ckpt_dir', type=str, default='./save_dir', help='모델 저장 경로')

args = parser.parse_args('')  # running in ipynb

### data load

In [58]:
# 데이터셋 로드
data = pd.read_csv(os.path.join(args.data_dir, f'{args.file_name}.csv'))

# 데이터셋 샘플 확인
data.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


### train/validset define

In [59]:
# 독립변수/종속변수 정의
X = data.drop([args.target], axis=1)
y = data[args.target]

# 학습(train)/검증(valid) 데이터셋 정의
X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      test_size=(1-args.train_ratio),
                                                      shuffle=False,
                                                      random_state=args.random_state)

In [60]:
print(f"학습용 데이터셋 - X shape:{X_train.shape} / y shape:{y_train.shape}")
print(f"검증용 데이터셋 - X shape:{X_valid.shape} / y shape:{y_valid.shape}")

학습용 데이터셋 - X shape:(959, 11) / y shape:(959,)
검증용 데이터셋 - X shape:(640, 11) / y shape:(640,)


### model define & training

In [61]:
# 램덤포레스트 모델 정의
model = RandomForestClassifier(n_estimators=100, max_depth=None, max_leaf_nodes=None)

# 랜덤포레스트 모델 학습
model.fit(X_train, y_train)

RandomForestClassifier()

### model evaluation

In [62]:
# prediction
y_pred = model.predict(X_test)

# evaluation by metrics
result = {'acc': accuracy_score(y_test, y_pred),
          'f1': f1_score(y_test, y_pred, average='macro')}

result

{'acc': 0.603125, 'f1': 0.29224009370452886}

### model save

In [63]:
model_save_dir = os.path.join(args.ckpt_dir, "model")
os.makedirs(model_save_dir, exist_ok=True)

joblib.dump(model, os.path.join(model_save_dir, "best_model.joblib"))

['./save_dir/model/best_model.joblib']