In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import seaborn as sns

# Read the train and test CSV files (train first, then test) and remove the
# SAMPLE_ID column from each resulting frame.
train, test = (
    pd.read_csv(csv_name).drop(columns=['SAMPLE_ID'])
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Replace the ATA timestamp column with numeric date/time features.
# The work is done in three passes over both frames: parse, derive, drop.
ATA_PARTS = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

# Pass 1: parse the ATA column into datetimes
for df in (train, test):
    df['ATA'] = pd.to_datetime(df['ATA'])

# Pass 2: add one column per datetime component, in the order listed above
for df in (train, test):
    ata_accessor = df['ATA'].dt
    for part in ATA_PARTS:
        df[part] = getattr(ata_accessor, part)

# Pass 3: remove the original datetime column (in place)
for df in (train, test):
    df.drop(columns='ATA', inplace=True)

# Label-encode the categorical columns.
#
# One LabelEncoder per column is fitted on the training labels plus a sentinel
# label that stands in for any test value not seen in training. Train and test
# are then transformed with that same fitted encoder, so a given category always
# receives the same integer code in both frames. (Fitting on train alone and
# inserting the sentinel into `classes_` afterwards shifts the codes of every
# class that sorts after the sentinel, leaving train and test inconsistent.)
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
UNSEEN_LABEL = '-1'  # bucket for test categories that never occur in train
encoders = {}  # column name -> fitted LabelEncoder

# tqdm only displays a progress bar over the columns
for feature in tqdm(categorical_features, desc="Encoding features"):
    # Cast both frames to str first so the membership check below compares
    # like with like (e.g. a missing value becomes 'nan' in train AND test
    # instead of being treated as an unseen category in test only).
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    known_labels = set(train_labels)
    test_labels = test_labels.map(lambda s: s if s in known_labels else UNSEEN_LABEL)

    le = LabelEncoder()
    # The set union also avoids a duplicate class if train already contains '-1'
    le.fit(sorted(known_labels | {UNSEEN_LABEL}))

    train[feature] = le.transform(train_labels)
    test[feature] = le.transform(test_labels)
    encoders[feature] = le

# Missing-value handling: fill NaNs in both frames with the per-column means of
# the training data.
# The means are computed once, from the observed (not yet imputed) training
# values, and the same Series is reused for train and test.
# numeric_only=True restricts the means to numeric columns, so a leftover
# non-numeric column does not break the computation; such a column is simply
# left unfilled. Columns present in the means but absent from a frame are ignored.
train_means = train.mean(numeric_only=True)
train.fillna(train_means, inplace=True)
test.fillna(train_means, inplace=True)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


# Split the training frame into the target (CI_HOUR) and the feature matrix.
y = train['CI_HOUR']
X = train.drop(columns=['CI_HOUR'])
k = 5  # number of cross-validation folds

# Hyper-parameter grids, one per model family
param_grid_tree = dict(max_depth=[3, 5, 7])
param_grid_forest = dict(n_estimators=[50, 100, 150], max_depth=[3, 5, 7])
param_grid_cat = dict(iterations=[50, 100, 150], depth=[3, 5, 7])
param_grid_xgb = dict(n_estimators=[50, 100, 150], max_depth=[3, 5, 7])

# Registry: display name -> (unfitted estimator, its parameter grid)
models = dict(
    DecisionTree=(DecisionTreeRegressor(), param_grid_tree),
    RandomForest=(RandomForestRegressor(), param_grid_forest),
    CatBoost=(CatBoostRegressor(), param_grid_cat),
    XGBoost=(XGBRegressor(), param_grid_xgb),
)

# Run an exhaustive grid search (k-fold CV, scored by negative MAE) for every
# registered model and keep, per model name:
#   best_estimators - the estimator refitted on all of X / y with the best grid point
#   best_params     - the winning combination from that model's grid
#   best_scores     - the mean cross-validated MAE of that combination
# `grid_search` is rebound on every iteration, so anything needed later has to be
# copied out inside the loop.
best_estimators = {}
best_params = {}
best_scores = {}
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=k, scoring='neg_mean_absolute_error')
    grid_search.fit(X, y)
    best_estimators[name] = grid_search.best_estimator_
    best_params[name] = grid_search.best_params_
    # The scorer is *negative* MAE; flip the sign to report a plain MAE
    best_scores[name] = -grid_search.best_score_

# Report only the tuned grid values (estimator.get_params() would list every
# constructor argument, defaults included) together with the CV score.
for name in best_estimators:
    print(f'Best parameters for {name}: {best_params[name]} (CV MAE: {best_scores[name]:.4f})')

In [None]:
# Manual k-fold cross-validation of four regressors on X / y, collecting the
# mean validation MAE of each model in `models_mae`.
# NOTE(review): these are default-parameter models, not the tuned estimators in
# `best_estimators` from the grid-search cell - confirm that is intended.

# Models
model_tree = DecisionTreeRegressor()
model_forest = RandomForestRegressor()
model_cat = CatBoostRegressor()
model_xgb = XGBRegressor()

# Hyper Parameters
k = 5  # number of folds
# `models` is rebound here as a list (the grid-search cell used a dict of the same name)
models = [model_tree, model_forest, model_cat, model_xgb]

# No shuffling is requested, so the folds are consecutive row ranges of X
kfold = KFold(n_splits=k)

models_mae = []  # mean validation MAE per model, in the same order as `models`
for model in models:
    cv_mae = []  # per-fold MAE for the current model
    # Fold numbers restart at 1 for every model, so the log reads #1..#k per
    # model instead of counting on across models.
    for n_iter, (train_index, test_index) in enumerate(kfold.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refit on this fold's training rows and score on the held-out rows
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, pred)
        train_size = X_train.shape[0]
        test_size = X_test.shape[0]
        print('\n{0}#{1} 교차검증 MAE :{2}, 학습 데이터 크기: {3}, 검증 데이터 크기: {4}'.format(model,n_iter, mae, train_size, test_size))
        print('{0}#{1} 검증 세트 인덱스 :{2}'.format(model, n_iter, test_index))
        cv_mae.append(mae)
    mean_mae = np.mean(cv_mae)
    print(f'## {model}의 평균 검증 MAE: {mean_mae}')
    models_mae.append(mean_mae)