# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Value handed to Python's ``random`` module and to NumPy's
            global generator; its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this may affect child processes only — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
# Read the train and test CSVs, dropping the SAMPLE_ID column from each.
train = pd.read_csv('../open/train.csv')
train = train.drop(columns=['SAMPLE_ID'])

test = pd.read_csv('../open/test.csv')
test = test.drop(columns=['SAMPLE_ID'])

# Data Pre-processing

In [3]:
# Parse the ATA column into datetimes for both frames.
for df in (train, test):
    df['ATA'] = pd.to_datetime(df['ATA'])

# Expand the datetime into several derived columns, one per component.
datetime_parts = ('year', 'month', 'day', 'hour', 'minute', 'weekday')
for df in (train, test):
    ata_accessor = df['ATA'].dt
    for part in datetime_parts:
        df[part] = getattr(ata_accessor, part)

# Remove the raw datetime column once its components have been extracted.
for df in (train, test):
    df.drop(columns='ATA', inplace=True)

In [4]:
# Label-encode the categorical columns.
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']


for feature in tqdm(categorical_features, desc="Encoding features"):
    encoder = LabelEncoder()
    train[feature] = encoder.fit_transform(train[feature])
    # Labels that appear only in test are appended after the fitted classes so
    # that transform() accepts them. Membership is checked against a set and
    # the classes array is extended once, instead of scanning the whole array
    # and copying it for every single test label.
    known_labels = set(encoder.classes_)
    unseen_labels = [lbl for lbl in np.unique(test[feature]) if lbl not in known_labels]
    if unseen_labels:
        encoder.classes_ = np.append(encoder.classes_, unseen_labels)
    test[feature] = encoder.transform(test[feature])

# Impute missing values with the training-set column means. The means are
# computed once and reused for both frames; numeric_only=True keeps the call
# working even if a non-numeric column is still present.
train_means = train.mean(numeric_only=True)
train.fillna(train_means, inplace=True)
test.fillna(train_means, inplace=True)

Encoding features: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:10<00:00,  1.82s/it]


In [5]:
# Add an OIL column to both frames: the sum of the WTI, DUBAI and BRENT columns.
for df in (train, test):
    df['OIL'] = df['WTI'] + df['DUBAI'] + df['BRENT']

In [6]:
# Columns kept for modelling. The training list is the test list plus the
# CI_HOUR column, which only the training frame is expected to carry.
test_importance_features = ['WTI', 'DUBAI', 'BRENT', 'OIL', 'month', 'day', 'BDI_ADJ', 'year']
train_importance_features = test_importance_features + ['CI_HOUR']

In [7]:
# Narrow both frames down to the selected columns.
train = train.loc[:, train_importance_features]
test = test.loc[:, test_importance_features]

In [8]:
# Wrap the prepared frames as TabularDataset objects for AutoGluon.
train_data, test_data = TabularDataset(train), TabularDataset(test)

# AutoGluon

In [9]:
# Target column name and evaluation metric handed to the predictor.
eval_metric = 'mean_absolute_error'
label = 'CI_HOUR'

In [10]:
# Configure a regression predictor for the target column, scored with the chosen metric.
predictor = TabularPredictor(
    label=label,
    problem_type='regression',
    eval_metric=eval_metric,
)

# Train with the 'best_quality' preset on one GPU. The result of fit() is
# rebound to `predictor`, matching the original chained call.
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    num_gpus=1,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20231015_083456\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231015_083456\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   203.68 GB / 999.46 GB (20.4%)
Train Data Rows:    220055
Train Data Columns: 8
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    56279.75 MB
	Train Data (Original)  Memory Usage: 14.08 MB (0.0% of av

In [11]:
# Display the leaderboard of trained models as the cell output.
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-9.716353,1890.540663,10375.566234,0.003,3.244987,3,True,22
1,RandomForestMSE_BAG_L2,-9.870224,1467.724764,8931.616493,7.400028,229.070027,2,True,15
2,LightGBMLarge_BAG_L2,-10.023166,1883.137636,10143.251219,422.812899,1440.704753,2,True,21
3,ExtraTreesMSE_BAG_L2,-10.12435,1466.59293,8728.251807,6.268193,25.705341,2,True,17
4,XGBoost_BAG_L2,-10.512982,1472.195796,9066.185633,11.871059,363.639167,2,True,19
5,LightGBM_BAG_L2,-10.98334,1462.844789,8725.171696,2.520052,22.62523,2,True,14
6,CatBoost_BAG_L2,-11.49973,1460.410741,8837.067719,0.086004,134.521253,2,True,16
7,LightGBMXT_BAG_L2,-12.0614,1868.90524,9071.408264,408.580504,368.861798,2,True,13
8,NeuralNetTorch_BAG_L2,-12.238715,1461.145198,9873.711644,0.820462,1171.165178,2,True,20
9,NeuralNetFastAI_BAG_L2,-12.501081,1462.577885,9418.850929,2.253149,716.304463,2,True,18


In [12]:
# Feature importance computed on the training frame.
# NOTE(review): scoring importance on the same rows used for fit() may give
# optimistic values — consider held-out data; confirm against the AutoGluon docs.
predictor.feature_importance(data=train_data)

Computing feature importance via permutation shuffling for 8 features using 5000 rows with 5 shuffle sets...
	1721.4s	= Expected runtime (344.28s per shuffle set)
	756.12s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
month,940.737553,16.381477,1.10292e-08,5,974.467242,907.007865
DUBAI,850.429598,11.924667,4.637661e-09,5,874.982653,825.876543
WTI,841.952225,8.620641,1.318648e-09,5,859.702243,824.202206
BDI_ADJ,838.793221,6.990974,5.789914e-10,5,853.187733,824.398709
year,837.225778,10.454654,2.917153e-09,5,858.752055,815.699502
BRENT,764.631709,4.940292,2.091021e-10,5,774.803838,754.459579
OIL,693.89149,10.005778,5.186783e-09,5,714.493526,673.289454
day,184.866021,1.399382,3.939714e-10,5,187.747368,181.984674


In [13]:
# Look up the model the predictor reports as best and use it to predict the test set.
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(data=test_data, model=model_to_use)

In [14]:
# Replace every negative prediction with 0, then show the resulting array.
is_negative = model_pred < 0
pred_y = np.where(is_negative, 0, model_pred)
pred_y

array([  94.205536,  378.5741  ,    8.05019 , ...,    7.673829,
          9.269344, 1161.8945  ], dtype=float32)

In [15]:
# Fill the sample-submission template with the predictions and save it.
submit = pd.read_csv('../open/sample_submission.csv')
submit['CI_HOUR'] = pred_y
# Create the output folder first so that a missing directory cannot make
# to_csv fail after the long training run above.
os.makedirs('../Sub', exist_ok=True)
submit.to_csv('../Sub/autogluon_1.csv', index=False)