In [1]:
import pandas as pd
import boto3

# Low-level S3 client handle (not referenced again within this cell).
s3 = boto3.client('s3')

# Load the merged CSV from S3 into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference across chunks.
df = pd.read_csv(
    filepath_or_buffer='s3a://testawsbucket-01/output/merged_df.csv/part-00000-bbb9f4ff-1108-4b02-96b3-7188aa8d7ccf-c000.csv',
    low_memory=False,
)
print(df)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



                id_job    id_array_job  id_array_task         id_user  \
0       65590191436871  14108987335445            114  87509498710061   
1       64310074400647  61177129314629            600  42770088536256   
2       34191949612627  14108987335445            115  87509498710061   
3       21630303188597  61177129314629            601  42770088536256   
4       10343100598054  61177129314629            602  42770088536256   
...                ...             ...            ...             ...   
395909  75124122894379  16618712154521     4294967294   1706828023724   
395910  37802476679519  16618712154521     4294967294   1706828023724   
395911   9807128696900  38040778438207            109  48065336140816   
395912  42865228158509  38040778438207            110  48065336140816   
395913  36690157579853  38040778438207            111  48065336140816   

           kill_requid  nodes_alloc  cpus_req  derived_ec  exit_code  \
0       61026541062099            1        20      

In [2]:
# Fill missing values: 0 for 'gres_used', the string 'NONE' for 'model'.
# assign() overwrites both columns in place (column order is preserved).
df = df.assign(
    gres_used=df['gres_used'].fillna(0),
    model=df['model'].fillna('NONE'),
)

In [3]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
df = pd.get_dummies(
    data=df,
    columns=['constraints', 'partition', 'job_type'],
)


In [4]:
# Clean up column names: every character that is not an ASCII letter, digit
# or underscore is replaced with '_'.
df.columns = df.columns.str.replace(
    pat=r'[^a-zA-Z0-9_]',
    repl='_',
    regex=True,
)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the 'model' column to obtain the integer target vector y,
# then remove that column from the frame so it is not used as a feature.
label_encoder = LabelEncoder()
label_encoder.fit(df['model'])
y = label_encoder.transform(df['model'])
df = df.drop(columns=['model'])

In [6]:
from sklearn.model_selection import train_test_split

# Every remaining column of df is used as a feature.
X = df

# First split: 70% train, 30% held out.
# NOTE(review): neither split is stratified; with imbalanced classes a rare
# class could be missing from one of the subsets -- consider stratify=.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Second split: halve the held-out part into validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [8]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.3 imblearn-0.0


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the validation and test splits are
# left untouched. random_state fixes the synthetic samples for reruns.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)


In [None]:
import os
import pickle

import numpy as np
import lightgbm as lgb
from sklearn.metrics import log_loss

# 1. LightGBM
# Train on the SMOTE-resampled data. The labels must be y_resampled: it has one
# entry per row of X_resampled, whereas y_train only matches the original
# (smaller) X_train and would cause a label/data length mismatch.
train_data = lgb.Dataset(X_resampled, label=y_resampled)
# reference=train_data makes the validation set reuse the training set's
# feature binning, as LightGBM expects for validation Datasets.
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

params_lgb = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': len(np.unique(y_resampled)),
    'learning_rate': 0.01,
    'num_leaves': 31
}

# NOTE(review): num_boost_round is not passed, so lgb.train uses its default;
# the XGBoost/CatBoost cells use 800 rounds -- confirm this is intended.
lgb_model = lgb.train(
    params_lgb,
    train_data,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=3), lgb.log_evaluation(period=100)]
)

# Persist the trained booster; create the output directory first so that
# open() does not fail on a fresh checkout.
os.makedirs('model/oversampling', exist_ok=True)
with open('model/oversampling/lgb_model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)


# Predicted class probabilities on the validation split
lgb_pred = lgb_model.predict(X_valid, raw_score=False)

# Multi-class log loss on the validation split
lgb_log_loss = log_loss(y_valid, lgb_pred)
print(f'LightGBM Log Loss: {lgb_log_loss}')


In [None]:
# 2. XGBoost
import os
import pickle

import xgboost as xgb

# Train on the SMOTE-resampled data so this model is comparable with the
# LightGBM model and matches the 'model/oversampling' output location.
# Validation stays on the original (non-resampled) validation split.
dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

params_xgb = {
    'objective': 'multi:softprob',
    'num_class': len(np.unique(y_resampled)),
    'eval_metric': 'mlogloss',
    'max_depth': 6,
    'learning_rate': 0.01
}

xgb_model = xgb.train(params_xgb, dtrain, num_boost_round=800,
                       evals=[(dvalid, 'eval')],
                       early_stopping_rounds=3,
                       verbose_eval=True)

# Persist the trained booster; create the output directory first so that
# open() does not fail on a fresh checkout.
os.makedirs('model/oversampling', exist_ok=True)
with open('model/oversampling/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)


# Class probabilities and multi-class log loss on the validation split
xgb_pred = xgb_model.predict(dvalid)
xgb_log_loss = log_loss(y_valid, xgb_pred)
print(f'XGBoost Log Loss: {xgb_log_loss}')

In [None]:
import os

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss

# Build the CatBoost Pools. Training uses the SMOTE-resampled data so this
# model is comparable with the LightGBM model and matches the
# 'model/oversampling' output location; validation uses the original split.
train_pool = Pool(X_resampled, y_resampled)
valid_pool = Pool(X_valid, y_valid)

# CatBoost model configuration
cat_model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.01,
    depth=6,
    eval_metric='MultiClass',
    early_stopping_rounds=3,
    verbose=100
)

# Train, monitoring the validation pool for early stopping
cat_model.fit(train_pool, eval_set=valid_pool)

# Save the model; create the output directory first so save_model does not
# fail on a fresh checkout.
os.makedirs('model/oversampling', exist_ok=True)
cat_model.save_model('model/oversampling/catboost_model.cbm')


# Class probabilities and multi-class log loss on the validation split,
# reported the same way as in the LightGBM and XGBoost cells.
cat_proba = cat_model.predict_proba(X_valid)
cat_log_loss = log_loss(y_valid, cat_proba)
print(f'CatBoost Log Loss: {cat_log_loss}')

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, log_loss
import numpy as np

# Class probabilities of each model on the held-out test split
lgb_pred_probs = lgb_model.predict(X_test)  # LightGBM
cat_pred_probs = cat_model.predict_proba(X_test)  # CatBoost
xgb_pred_probs = xgb_model.predict(xgb.DMatrix(X_test))  # XGBoost

# Predicted label = column index of the highest probability in each row
lgb_pred_classes = lgb_pred_probs.argmax(axis=1)
cat_pred_classes = cat_pred_probs.argmax(axis=1)
xgb_pred_classes = xgb_pred_probs.argmax(axis=1)

# Evaluation metrics, computed per model in the order LightGBM, CatBoost, XGBoost
lgb_accuracy, cat_accuracy, xgb_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (lgb_pred_classes, cat_pred_classes, xgb_pred_classes)
)

lgb_f1, cat_f1, xgb_f1 = (
    f1_score(y_test, predicted, average='macro')
    for predicted in (lgb_pred_classes, cat_pred_classes, xgb_pred_classes)
)

lgb_log_loss, cat_log_loss, xgb_log_loss = (
    log_loss(y_test, probabilities)
    for probabilities in (lgb_pred_probs, cat_pred_probs, xgb_pred_probs)
)


# Report the results, grouped by metric
print(f"LightGBM Accuracy: {lgb_accuracy}")
print(f"CatBoost Accuracy: {cat_accuracy}")
print(f"XGBoost Accuracy: {xgb_accuracy}")

print(f"LightGBM F1 Score: {lgb_f1}")
print(f"CatBoost F1 Score: {cat_f1}")
print(f"XGBoost F1 Score: {xgb_f1}")

print(f"LightGBM Log Loss: {lgb_log_loss}")
print(f"CatBoost Log Loss: {cat_log_loss}")
print(f"XGBoost Log Loss: {xgb_log_loss}")



In [None]:
import matplotlib.pyplot as plt

# Confusion matrix of each model on the test split
lgb_cm, cat_cm, xgb_cm = (
    confusion_matrix(y_test, predicted)
    for predicted in (lgb_pred_classes, cat_pred_classes, xgb_pred_classes)
)

# Draw the three matrices side by side, one subplot per model
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

for axis, (title, matrix) in zip(
    ax,
    (
        ('LightGBM Confusion Matrix', lgb_cm),
        ('CatBoost Confusion Matrix', cat_cm),
        ('XGBoost Confusion Matrix', xgb_cm),
    ),
):
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(ax=axis)
    axis.set_title(title)

plt.tight_layout()
plt.show()