In [4]:
pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting cvxpy (from fancyimpute)
  Downloading cvxpy-1.4.2-cp39-cp39-win_amd64.whl.metadata (9.0 kB)
Collecting cvxopt (from fancyimpute)
  Downloading cvxopt-1.3.2-cp39-cp39-win_amd64.whl.metadata (1.4 kB)
Collecting osqp>=0.6.2 (from cvxpy->fancyimpute)
  Downloading osqp-0.6.5-cp39-cp39-win_amd64.whl.metadata (1.8 kB)
Collecting ecos>=2 (from cvxpy->fancyimpute)
  Downloading ecos-2.0.13-cp39-cp39-win_amd64.whl.metadata (8.2 kB)
Collecting clarabel>=0.5.0 (from cvxpy->fancyimpute)
  Downloading clarabel-0.7.1-cp37-abi3-win_amd64.whl.metadata (4.7 kB)
Collecting scs>=3.0 (from cvxpy->fancyimpute)
  Downloading scs-3.2.4.

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


Downloading cvxopt-1.3.2-cp39-cp39-win_amd64.whl (12.8 MB)
   ---------------------------------------- 12.8/12.8 MB 2.4 MB/s eta 0:00:00
Downloading cvxpy-1.4.2-cp39-cp39-win_amd64.whl (1.0 MB)
   ---------------------------------------- 1.0/1.0 MB 2.5 MB/s eta 0:00:00
Downloading clarabel-0.7.1-cp37-abi3-win_amd64.whl (321 kB)
   ---------------------------------------- 321.5/321.5 kB 3.3 MB/s eta 0:00:00
Downloading ecos-2.0.13-cp39-cp39-win_amd64.whl (72 kB)
   ---------------------------------------- 72.0/72.0 kB 3.9 MB/s eta 0:00:00
Downloading osqp-0.6.5-cp39-cp39-win_amd64.whl (293 kB)
   ---------------------------------------- 293.1/293.1 kB 3.6 MB/s eta 0:00:00
Downloading scs-3.2.4.post1-cp39-cp39-win_amd64.whl (8.4 MB)
   ---------------------------------------- 8.4/8.4 MB 2.5 MB/s eta 0:00:00
Downloading pybind11-2.12.0-py3-none-any.whl (234 kB)
   ---------------------------------------- 235.0/235.0 kB 3.6 MB/s eta 0:00:00
Downloading qdldl-0.1.7.post1-cp39-cp39-win_amd64

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import seaborn as sns

# Read the train and test CSVs (train first, then test); the SAMPLE_ID column is
# dropped from each frame immediately after loading.
train, test = (
    pd.read_csv(csv_path).drop(columns=['SAMPLE_ID'])
    for csv_path in ('train.csv', 'test.csv')
)

In [2]:
# Replace the ATA timestamp with numeric calendar features on both frames.
# Columns removed once the features exist: the raw timestamp plus ID, SHIPMANAGER and FLAG.
drop_after_parsing = ['ATA', 'ID', 'SHIPMANAGER', 'FLAG']

for df in (train, test):
    # Parse ATA so that the .dt accessor becomes available.
    df['ATA'] = pd.to_datetime(df['ATA'])
    arrival = df['ATA'].dt

    # Derived features: hour of day and day of week (Monday == 0).
    df['hour'] = arrival.hour
    df['weekday'] = arrival.weekday

    # Same frame object is modified in place, so `train` / `test` keep their identity.
    df.drop(columns=drop_after_parsing, inplace=True)

# Label-encode the categorical columns with ONE encoder per column that is shared by
# train and test.
#
# The encoder is fitted on the training labels plus a reserved "unknown" token and is
# then used to transform both frames. Encoding train first and extending `classes_`
# afterwards would shift the integer codes used for test relative to train, so the
# same category would end up with different numbers in the two frames.
categorical_features = ['ARI_CO', 'ARI_PO']
encoders = {}  # feature name -> fitted LabelEncoder (classes_ include the unknown token)
UNKNOWN_LABEL = '-1'  # stands in for test categories that never occur in train

for feature in tqdm(categorical_features, desc="Encoding features"):  # tqdm shows progress
    # Compare and encode on the string representation in BOTH frames, so that
    # non-string values (e.g. NaN -> 'nan') are treated identically in train and test.
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)
    known_labels = train_values.unique()

    le = LabelEncoder()
    le.fit(list(known_labels) + [UNKNOWN_LABEL])

    # Test categories unseen during fitting collapse onto the unknown token.
    test_values = test_values.where(test_values.isin(known_labels), UNKNOWN_LABEL)

    train[feature] = le.transform(train_values)
    test[feature] = le.transform(test_values)
    encoders[feature] = le

# One-hot encode SHIP_TYPE_CATEGORY on each frame, then clean up columns/rows.
# Note: the two frames are encoded independently of each other.
ship_type_cols = ['SHIP_TYPE_CATEGORY']

train = (
    pd.get_dummies(train, columns=ship_type_cols)
    # Training rows without a LENGTH value are discarded (train only).
    .dropna(subset=['LENGTH'])
    # Only BUILT is removed here; U_WIND and V_WIND stay in the frame.
    .drop(columns=['BUILT'])
)
test = pd.get_dummies(test, columns=ship_type_cols).drop(columns=['BUILT'])

Encoding features: 100%|██████████| 2/2 [00:00<00:00,  2.53it/s]


In [3]:
# Number of missing values in each column of the preprocessed training frame.
train.isnull().sum()

ARI_CO                               0
ARI_PO                               0
DIST                                 0
BREADTH                              0
DEADWEIGHT                           0
DEPTH                                0
DRAUGHT                              0
GT                                   0
LENGTH                               0
U_WIND                          163688
V_WIND                          163688
AIR_TEMPERATURE                 164630
BN                              163688
ATA_LT                               0
PORT_SIZE                            0
CI_HOUR                              0
hour                                 0
weekday                              0
SHIP_TYPE_CATEGORY_Bulk              0
SHIP_TYPE_CATEGORY_Cargo             0
SHIP_TYPE_CATEGORY_Container         0
SHIP_TYPE_CATEGORY_Tanker            0
dtype: int64

In [4]:
# Collapse the day-of-week feature into a binary weekend indicator.
# `weekday` comes from `.dt.weekday` (Monday == 0), so values >= 5 are Saturday/Sunday.
# The comparison is vectorised instead of a per-row `apply`, and the same transformation
# is applied to `test` so both frames keep an identical feature set (both received
# `weekday` in the datetime step).
for df in (train, test):
    df['WEEKEND'] = (df['weekday'] >= 5).astype('int64')
    # `weekday` is fully replaced by WEEKEND and is removed from the frame.
    df.drop(columns=['weekday'], inplace=True)

In [5]:
# Derived wind and hull-size features, built identically on train and test so the two
# frames do not drift apart.
# Assumes `test` carries the same U_WIND / V_WIND / BREADTH / LENGTH / DEPTH columns as
# `train` - TODO confirm against test.csv.
# Rows whose wind components are missing yield NaN in every wind-derived column.
for df in (train, test):
    # Squared components are computed once and reused for the magnitude.
    u_sq = df['U_WIND'] ** 2
    v_sq = df['V_WIND'] ** 2

    df['WIND_INTENSITY'] = np.sqrt(u_sq + v_sq)  # magnitude of the (U, V) vector
    df['U_WIND_SQUARE'] = u_sq
    df['V_WIND_SQUARE'] = v_sq
    df['VOLUME'] = df['BREADTH'] * df['LENGTH'] * df['DEPTH']

    # arctan2 yields degrees in [-180, 180]; negative angles are shifted by a full turn
    # so the result is non-negative. Vectorised `mask` replaces the per-row `apply`.
    direction_deg = np.arctan2(df['V_WIND'], df['U_WIND']) * (180/np.pi)
    df['WIND_DIRECTION'] = direction_deg.mask(direction_deg < 0, direction_deg + 360)

    df['WIND_SPEED_DIR'] = df['WIND_INTENSITY'] * df['WIND_DIRECTION']

In [6]:
from fancyimpute import IterativeImputer
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Hold-out evaluation: iterative (MICE-style) imputation -> robust scaling -> XGBoost,
# scored with MAE on a 20% validation split.
RANDOM_STATE = 42  # single seed shared by the split, the imputer and the model

# Predictors and target.
X = train.drop(columns=['CI_HOUR'])
y = train['CI_HOUR']

# Log-scale DIST as log(1 + DIST); log1p is the numerically accurate form of log(x + 1).
X['DIST'] = np.log1p(X['DIST'])

# 80% train / 20% validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# The imputer is fitted on the training split only and then applied to both splits,
# so the validation rows do not influence the imputation model.
mice_imputer = IterativeImputer(random_state=RANDOM_STATE)
mice_imputer.fit(X_train)
X_train_imputed = mice_imputer.transform(X_train)
X_valid_imputed = mice_imputer.transform(X_valid)

# Back to DataFrames. The original row labels are kept so the feature frames stay
# label-aligned with y_train / y_valid (a fresh 0..n-1 index would only match by position).
X_train = pd.DataFrame(X_train_imputed, columns=X.columns, index=X_train.index)
X_valid = pd.DataFrame(X_valid_imputed, columns=X.columns, index=X_valid.index)

# DIST (already log-transformed above) is set aside and is not passed through the scaler.
traindist = X_train['DIST']
validdist = X_valid['DIST']
traindro = X_train.drop(columns=['DIST'])
validdro = X_valid.drop(columns=['DIST'])
cols = traindro.columns

# Scaling statistics come from the training split only; each split is transformed once.
scaler = RobustScaler()
scaler.fit(traindro)
traindro = pd.DataFrame(scaler.transform(traindro), columns=cols, index=traindro.index)
validdro = pd.DataFrame(scaler.transform(validdro), columns=cols, index=validdro.index)

# Re-attach DIST as the last column of each feature frame.
X_train = pd.concat([traindro, traindist], axis=1)
X_valid = pd.concat([validdro, validdist], axis=1)

# Fit an XGBoost regressor with default hyper-parameters.
model = XGBRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict on the validation split and report the mean absolute error.
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
print("Validation MAE:", mae)


(CVXPY) Apr 06 04:30:48 PM: Encountered unexpected exception importing solver CVXOPT:
ImportError('DLL load failed while importing base: 지정된 모듈을 찾을 수 없습니다.')
(CVXPY) Apr 06 04:30:48 PM: Encountered unexpected exception importing solver GLPK:
ImportError('DLL load failed while importing base: 지정된 모듈을 찾을 수 없습니다.')
(CVXPY) Apr 06 04:30:48 PM: Encountered unexpected exception importing solver GLPK_MI:
ImportError('DLL load failed while importing base: 지정된 모듈을 찾을 수 없습니다.')




Validation MAE: 56.72259273078394
