# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
# NOTE(review): with no category/module arguments this suppresses every warning
# raised after this point (deprecations included) — consider narrowing the filter.
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed the random number sources this notebook uses.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's global RNG.

    Args:
        seed: Seed value applied to every generator (the notebook passes an int).

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at startup, so setting it
        here is only picked up by processes launched afterwards, not by the
        kernel that is already running.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Fix the seed once, before any data is loaded or any model is trained.
seed_everything(seed=42)

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
# Read the merged train and test tables. No columns are dropped here: the
# original cell carried a commented-out `.drop(columns=['SAMPLE_ID'])` on both reads.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../new_open/train_merge.csv', '../new_open/test_merge.csv')
)

# Data Pre-processing

In [3]:
# Wrap both frames as AutoGluon TabularDataset objects; no other preprocessing
# is applied in this cell.
train_data, test_data = map(TabularDataset, (train, test))

# AutoGluon

In [4]:
# Metric name handed to TabularPredictor for model selection.
eval_metric = 'mean_absolute_error'
# Name of the target column to predict.
label = 'CI_HOUR'

In [5]:
# Configure a regression predictor for the target column, scored with eval_metric.
predictor = TabularPredictor(
    label=label,
    problem_type='regression',
    eval_metric=eval_metric,
)

# Train with the 'best_quality' preset on one GPU. No time_limit or save path is
# given, so AutoGluon picks its own output folder and runs until every model is fit.
# (The original cell also carried disabled `num_stack_levels=3` and
# `excluded_model_types` overrides.)
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    num_gpus=1,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20231017_105118\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231017_105118\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   184.22 GB / 999.46 GB (18.4%)
Train Data Rows:    391939
Train Data Columns: 39
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    54549.41 MB
	Train Data (Original)  Memory Usage: 122.29 MB (0.2% of 

In [6]:
# Show the table of trained models; silent=True skips AutoGluon's own printout
# so only the returned DataFrame is rendered as the cell output.
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-43.31606,277.498571,7062.286988,0.006207,6.192359,3,True,22
1,NeuralNetTorch_BAG_L2,-43.341391,276.38573,7031.995401,1.886884,1846.195425,2,True,20
2,WeightedEnsemble_L2,-45.776251,51.991836,3650.331858,0.007041,7.55021,2,True,12
3,XGBoost_BAG_L2,-45.927059,275.60548,5209.899204,1.106634,24.099228,2,True,19
4,NeuralNetTorch_BAG_L1,-46.173386,1.299429,2878.723872,1.299429,2878.723872,1,True,10
5,NeuralNetFastAI_BAG_L2,-48.455087,276.98391,5918.997032,2.485064,733.197057,2,True,18
6,LightGBMLarge_BAG_L2,-49.890076,277.218678,5232.743687,2.719832,46.943711,2,True,21
7,LightGBM_BAG_L2,-49.950901,276.524127,5201.582129,2.025281,15.782153,2,True,14
8,LightGBMXT_BAG_L2,-50.290398,280.05075,5228.815852,5.551904,43.015876,2,True,13
9,CatBoost_BAG_L2,-50.314904,274.60227,5264.72916,0.103424,78.929184,2,True,16


In [7]:
# Permutation feature importance for the fitted predictor.
# NOTE(review): this is computed on the training data; AutoGluon's documentation
# advises passing held-out labelled data instead, because scores measured on
# rows seen during fit() are biased by overfitting. Treat the ranking as indicative.
predictor.feature_importance(data=train_data)

Computing feature importance via permutation shuffling for 39 features using 5000 rows with 5 shuffle sets...
	2398.73s	= Expected runtime (479.75s per shuffle set)
	876.56s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
DIST,36.915184,1.251546,1.58301e-07,5,39.492135,34.338232
PORT_SIZE,14.682544,1.211419,5.510897e-06,5,17.176873,12.188215
AIR_TEMPERATURE,13.368896,0.689372,8.454265e-07,5,14.788323,11.94947
month_sin,9.324443,0.502471,1.007981e-06,5,10.359037,8.289849
DEADWEIGHT,7.912079,0.927541,2.225537e-05,5,9.821899,6.00226
year,7.531015,0.829934,1.741572e-05,5,9.23986,5.82217
SHIP_TYPE_CATEGORY,6.295381,0.233828,2.279733e-07,5,6.776837,5.813925
GT,5.837994,0.274169,5.819988e-07,5,6.402511,5.273477
ARI_CO,5.433692,0.305769,1.198237e-06,5,6.063275,4.80411
ARI_PO,3.665952,0.096449,5.744085e-08,5,3.864542,3.467363


In [8]:
# Look up the predictor's best model and use it explicitly for the test-set predictions.
# NOTE(review): newer AutoGluon releases appear to expose this as
# `predictor.model_best` — confirm before upgrading from 0.8.x.
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(data=test_data, model=model_to_use)

In [9]:
# Replace negative predictions with 0 (presumably because CI_HOUR cannot be
# negative — confirm) and keep the result as a plain NumPy array.
pred_y = np.array(model_pred)  # independent copy; model_pred is left untouched
pred_y[pred_y < 0] = 0
pred_y

array([3.6596408e+00, 1.5580181e+01, 2.6396349e+01, ..., 2.5073267e+01,
       1.2788567e-02, 6.0033829e+01], dtype=float32)

In [11]:
# Fill the sample-submission template with the clipped predictions and save it.
submit = pd.read_csv('../new_open/sample_submission.csv')

# Assumes the template rows are in the same order as the test rows — TODO confirm.
# A length mismatch between the template and pred_y raises ValueError here.
submit['CI_HOUR'] = pred_y

# to_csv does not create missing parent folders, so make sure the output folder
# exists first; otherwise a fresh checkout fails only after the long training run.
os.makedirs('../Sub', exist_ok=True)
submit.to_csv('../Sub/autogluon_new_1.csv', index=False)