패키지 목록

In [47]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from supervised.automl import AutoML
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

SAMPLE_ID 열 삭제

In [48]:
# Read the train and test CSVs, discarding the SAMPLE_ID column from each.
train, test = (
    pd.read_csv(csv_path).drop(columns=['SAMPLE_ID'])
    for csv_path in ('train.csv', 'test.csv')
)

## EDA

In [49]:
# Expand the ATA timestamp into calendar features, then drop the columns
# that are not used as model inputs.
calendar_parts = ['year', 'month', 'day', 'hour', 'minute', 'weekday']
unused_columns = ['ATA', 'ID', 'SHIPMANAGER', 'FLAG']

for frame in (train, test):
    # Parse ATA so the .dt accessor is available.
    frame['ATA'] = pd.to_datetime(frame['ATA'])

    # One derived column per datetime component.
    for part in calendar_parts:
        frame[part] = getattr(frame['ATA'].dt, part)

    # WEEKEND flag: 1 when weekday >= 5 (weekend), 0 otherwise (weekday).
    frame['WEEKEND'] = frame['weekday'].ge(5).astype('int64')

    # Remove the raw datetime column together with the other unused columns.
    frame.drop(columns=unused_columns, inplace=True)

# Label-encode the categorical columns with ONE mapping shared by train and test.
#
# The encoder is fitted on the train categories plus a '-1' sentinel that stands
# in for any test value not present in train. Fitting happens before either
# frame is transformed, so a given category always receives the same integer
# code in train and in test. (Adding the sentinel to `classes_` only after
# train has been encoded would shift the codes used for test.)
UNSEEN_LABEL = '-1'
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):  # tqdm shows a progress bar
    # Compare and encode both frames in the same (string) representation.
    train_labels = train[feature].astype(str)
    test_labels = test[feature].astype(str)

    le = LabelEncoder()  # maps category strings to integer codes
    le.fit(list(train_labels.unique()) + [UNSEEN_LABEL])
    known_labels = set(le.classes_)

    train[feature] = le.transform(train_labels)
    # Test values never seen in train collapse onto the sentinel label.
    test[feature] = le.transform(
        test_labels.where(test_labels.isin(known_labels), UNSEEN_LABEL)
    )
    encoders[feature] = le  # kept so the mapping can be inspected or inverted later

Encoding features: 100%|██████████| 3/3 [00:00<00:00,  5.37it/s]


새로운 feature 생성

In [50]:
# Derived feature: VOLUME = BREADTH * LENGTH * DEPTH, added to both frames.
for frame in (train, test):
    frame['VOLUME'] = frame['BREADTH'].mul(frame['LENGTH']).mul(frame['DEPTH'])

스케일링 및 결측치 처리 (KNN 대치)

In [51]:
# Min-max scale the columns listed in `scale_list`.
scaler = MinMaxScaler()
scale_list = ['ARI_CO','ARI_PO','ATA_LT','year','month','day','hour','minute']

# Learn the per-column min/max from train only ...
train[scale_list] = scaler.fit_transform(train[scale_list])
# ... and reuse those ranges for test. Refitting on test would map the same raw
# value to different scaled values in the two frames.
test[scale_list] = scaler.transform(test[scale_list])

In [52]:
# 결측치가 있는 행과 없는 행 분리
X_missing = train[train.isnull().any(axis=1)]
X_no_missing = train.dropna()
Y_missing = test[test.isnull().any(axis=1)]
Y_no_missing = test.dropna()
# 결측치가 없는 데이터의 특성과 타겟 분리
y_train = X_no_missing[['BN','AIR_TEMPERATURE','U_WIND','V_WIND']]
X_train = X_no_missing[scale_list]
y_train2 = Y_no_missing[['BN','AIR_TEMPERATURE','U_WIND','V_WIND']]
X_train2 = Y_no_missing[scale_list]

# 결측치가 있는 데이터에서 예측할 특성 분리
X_missing_test = X_missing[scale_list]
Y_missing_test = Y_missing[scale_list]

# KNN 회귀 모델 생성 및 훈련
knn_regressor = KNeighborsRegressor(n_neighbors=3)
knn_regressor2 = KNeighborsRegressor(n_neighbors=3)
knn_regressor.fit(X_train, y_train)
knn_regressor2.fit(X_train2,y_train2)

# 결측치 예측
predicted_values = knn_regressor.predict(X_missing_test)
predicted_values2 = knn_regressor2.predict(Y_missing_test)
# BN 열과 air_temperature 열의 결측치 인덱스 찾기
bn_nan_indices = train.index[train['BN'].isna()]
air_temp_nan_indices = train.index[train['AIR_TEMPERATURE'].isna()]
U_WIND_nan_indices = train.index[train['U_WIND'].isna()]
V_WIND_nan_indices = train.index[train['V_WIND'].isna()]
bn_nan_indices_test = test.index[test['BN'].isna()]
air_temp_nan_indices_test = test.index[test['AIR_TEMPERATURE'].isna()]
U_WIND_nan_indices_test = test.index[test['U_WIND'].isna()]
V_WIND_nan_indices_test = test.index[test['V_WIND'].isna()]

# BN 열의 결측치 채우기
for i, idx in enumerate(bn_nan_indices):
    if i < len(predicted_values):  # 배열의 길이를 초과하지 않도록 확인
        train.at[idx, 'BN'] = predicted_values[i, 0]

for i, idx in enumerate(bn_nan_indices_test):
    if i < len(predicted_values2):  # 배열의 길이를 초과하지 않도록 확인
        test.at[idx, 'BN'] = predicted_values2[i, 0]

# air_temperature 열의 결측치 채우기
for i, idx in enumerate(air_temp_nan_indices):
    if i < len(predicted_values):  # 배열의 길이를 초과하지 않도록 확인
        train.at[idx, 'AIR_TEMPERATURE'] = predicted_values[i, 1]

# air_temperature 열의 결측치 채우기
for i, idx in enumerate(air_temp_nan_indices_test):
    if i < len(predicted_values2):  # 배열의 길이를 초과하지 않도록 확인
        test.at[idx, 'AIR_TEMPERATURE'] = predicted_values2[i, 1]
        
# U_WIND 열의 결측치 채우기
for i, idx in enumerate(U_WIND_nan_indices):
    if i < len(predicted_values):  # 배열의 길이를 초과하지 않도록 확인
        train.at[idx, 'U_WIND'] = predicted_values[i, 2]

# U_WIND 열의 결측치 채우기
for i, idx in enumerate(U_WIND_nan_indices_test):
    if i < len(predicted_values2):  # 배열의 길이를 초과하지 않도록 확인
        test.at[idx, 'U_WIND'] = predicted_values2[i, 2]
        
# V_WIND 열의 결측치 채우기
for i, idx in enumerate(V_WIND_nan_indices):
    if i < len(predicted_values):  # 배열의 길이를 초과하지 않도록 확인
        train.at[idx, 'V_WIND'] = predicted_values[i, 3]
        
# V_WIND 열의 결측치 채우기
for i, idx in enumerate(V_WIND_nan_indices_test):
    if i < len(predicted_values2):  # 배열의 길이를 초과하지 않도록 확인
        test.at[idx, 'V_WIND'] = predicted_values2[i, 3]
                

In [53]:
# Separate the regression target (CI_HOUR) from the feature columns.
X_train, Y_train = train.drop(columns='CI_HOUR'), train['CI_HOUR']

이상치가 있는 데이터에는 RobustScaler를, 나머지 데이터에는 MinMaxScaler를 적용

In [14]:
# Draw one boxplot per training feature.
for column_name in X_train.columns:
    fig, ax = plt.subplots(figsize=(6, 4))  # figure size
    ax.boxplot(X_train[column_name])  # boxplot of the current feature
    ax.set_title(f'Boxplot of {column_name} in Train Data')  # figure title
    plt.show()

In [54]:
# Columns scaled with RobustScaler
# (presumably the outlier-heavy ones picked from the boxplots; confirm).
isang = [
    'DIST', 'BUILT', 'DEADWEIGHT', 'GT',
    'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
]

# MinMaxScaler for the remaining columns, RobustScaler for the `isang` group.
scaler = MinMaxScaler()
isang_scaler = RobustScaler()

In [55]:
# Every feature column that is not in the `isang` group, in X_train's column order.
no_isang = [column for column in X_train.columns if column not in isang]

In [56]:
# isang 그룹 스케일링
X_train_isang_scaled = isang_scaler.fit_transform(X_train[isang])
test_isang_scaled = isang_scaler.transform(test[isang])

# no_isang 그룹 스케일링
X_train_no_isang_scaled = scaler.fit_transform(X_train[no_isang])
test_no_isang_scaled = scaler.transform(test[no_isang])

# 스케일링된 데이터를 DataFrame으로 변환
X_train_isang_scaled_df = pd.DataFrame(X_train_isang_scaled, columns=isang, index=X_train.index)
X_train_no_isang_scaled_df = pd.DataFrame(X_train_no_isang_scaled, columns=no_isang, index=X_train.index)

test_isang_scaled_df = pd.DataFrame(test_isang_scaled, columns=isang, index=test.index)
test_no_isang_scaled_df = pd.DataFrame(test_no_isang_scaled, columns=no_isang, index=test.index)

# 스케일링된 데이터프레임을 합쳐서 최종 트레인/테스트 데이터 생성
X_train = pd.concat([X_train_isang_scaled_df, X_train_no_isang_scaled_df], axis=1)
test = pd.concat([test_isang_scaled_df, test_no_isang_scaled_df], axis=1)

모델 훈련

In [57]:
# AutoML settings: regression scored by MAE, four tree-based algorithm families,
# "Compete" mode, fixed seed, and both time limits set to None.
automl_settings = {
    'ml_task': "regression",
    'eval_metric': 'mae',
    'mode': "Compete",
    'algorithms': ["CatBoost", "Xgboost", "LightGBM", "Random Forest"],
    'random_state': 42,
    'total_time_limit': None,
    'model_time_limit': None,
}
automl = AutoML(**automl_settings)

In [58]:
# Train AutoML on the scaled features (X_train) against the CI_HOUR target (Y_train).
automl.fit(X_train, Y_train)

AutoML directory: AutoML_2
The task is regression with evaluation metric mae
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'LightGBM', 'Random Forest']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 4 models
1_Default_LightGBM mae 53.675401 trained in 220.68 seconds
2_Default_Xgboost mae 55.539808 trained in 100.62 seconds
3_Default_CatBoost mae 56.135599 trained in 296.71 seconds
4_Default_RandomForest mae 61.949107 trained in 3098.55 seconds
* Step not_so_random will try to check up to 36 models
14_LightGBM mae 56.090778 trained in 41.95 seconds
5_Xgboost mae 56.375023 trained in 41.85 seconds
23_Cat



29_CatBoost_KMeansFeatures mae 46.431105 trained in 581.91 seconds




31_CatBoost_KMeansFeatures mae 46.49035 trained in 4073.42 seconds




28_CatBoost_KMeansFeatures mae 46.698498 trained in 1598.05 seconds
* Step insert_random_feature will try to check up to 1 model
29_CatBoost_GoldenFeatures_RandomFeature mae 46.050397 trained in 540.08 seconds
Drop features ['hour', 'minute', 'U_WIND', 'random_feature', 'V_WIND', 'BN', 'DIST_ratio_hour', 'ATA_LT', 'DIST_ratio_minute']
* Step features_selection will try to check up to 4 models
29_CatBoost_GoldenFeatures_SelectedFeatures mae 45.337319 trained in 600.37 seconds
21_LightGBM_SelectedFeatures mae 52.955356 trained in 99.9 seconds
7_Xgboost_SelectedFeatures mae 54.711089 trained in 108.07 seconds
38_RandomForest_SelectedFeatures mae 59.811599 trained in 161.49 seconds
* Step hill_climbing_1 will try to check up to 17 models
41_CatBoost_GoldenFeatures_SelectedFeatures mae 45.398516 trained in 1074.82 seconds
42_CatBoost_GoldenFeatures mae 45.979749 trained in 974.87 seconds
43_CatBoost mae 46.088188 trained in 760.48 seconds
44_LightGBM_SelectedFeatures mae 53.113954 trained i

예측 

In [None]:
# Predict on the scaled test frame with the fitted AutoML object.
pred = automl.predict(test)

In [68]:
# Fill the sample submission template with the predictions.
submit = pd.read_csv('./sample_submission.csv')
submit['CI_HOUR'] = pred

# Negative predictions are replaced by 0 before the file is written.
is_negative = submit['CI_HOUR'] < 0
submit['CI_HOUR'] = submit['CI_HOUR'].mask(is_negative, 0)

submit.to_csv('./baseline_submit.csv', index=False)