# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds NumPy's legacy global RNG and Python's ``random`` module, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so setting
    it here only influences child processes launched afterwards, not the
    hashing of the already-running kernel.

    Args:
        seed: Integer seed applied to every generator.
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)


# Fix the seed once, right after the imports.
seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
# Locations of the input tables, relative to the notebook's working directory.
train_csv_path = '../new_open/train_merge_new_fillna.csv'
test_csv_path = '../new_open/test_merge_new_fillna.csv'

# Load both tables unchanged; no columns are dropped at this stage.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Data Pre-processing

In [3]:
# Wrap both pandas frames in AutoGluon's TabularDataset container.
train_data, test_data = map(TabularDataset, (train, test))

# AutoGluon

In [4]:
# Metric AutoGluon optimises and reports during training.
eval_metric = 'mean_absolute_error'
# Name of the target column to predict.
label = 'CI_HOUR'

In [5]:
# Options forwarded to TabularPredictor.fit().
# No `time_limit` and no save `path` are given, so training runs until every
# model has finished and the models go to an auto-generated AutogluonModels folder.
fit_options = {
    'presets': 'best_quality',
    'num_stack_levels': 3,  # depth of the stacking ensemble
    'num_gpus': 1,
}

# Build a regression predictor for `label` and train it on the full training set.
predictor = TabularPredictor(
    label=label,
    problem_type='regression',
    eval_metric=eval_metric,
).fit(train_data, **fit_options)

No path specified. Models will be saved in: "AutogluonModels\ag-20231027_154220\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231027_154220\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   205.90 GB / 999.46 GB (20.6%)
Train Data Rows:    391939
Train Data Columns: 39
Label Column: CI_HOUR
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    56576.16 MB
	Train Data (Original)  Memory Usage: 122.29 MB (0.2% of 

In [8]:
# Ask the trained predictor which model it considers best.
model_to_use = predictor.get_model_best()

# Predict on the test set with that model, named explicitly.
model_pred = predictor.predict(data=test_data, model=model_to_use)

In [9]:
# Replace negative predictions with 0; everything else passes through unchanged.
is_negative = model_pred < 0
pred_y = np.where(is_negative, 0, model_pred)
pred_y

array([4.0620503e+00, 2.6316820e+01, 2.2498863e+01, ..., 8.4081512e+01,
       7.5976555e+01, 1.3036174e-02], dtype=float32)

# Submission

In [11]:
# Load the submission template and fill its CI_HOUR column with the predictions.
submit = pd.read_csv('../new_open/sample_submission.csv').assign(CI_HOUR=pred_y)

In [12]:
# Post-processing rule: rows whose test DIST is exactly 0 get CI_HOUR forced to 0.
# The mask is built with a vectorized comparison (1 where DIST != 0, else 0)
# instead of a row-wise lambda, and multiplied in directly so no scratch
# column has to be added to, and then dropped from, the submission frame.
# int64 matches the dtype the original lambda-based mask produced, so the
# resulting CI_HOUR dtype is unchanged.
dist_mask = test['DIST'].ne(0).astype('int64')
submit['CI_HOUR'] = submit['CI_HOUR'] * dist_mask
submit

Unnamed: 0,SAMPLE_ID,CI_HOUR
0,TEST_000000,4.062050
1,TEST_000001,26.316820
2,TEST_000002,22.498863
3,TEST_000003,38.439987
4,TEST_000004,0.000000
...,...,...
220486,TEST_220486,0.000000
220487,TEST_220487,0.000000
220488,TEST_220488,84.081512
220489,TEST_220489,75.976555


In [13]:
# Create the output directory if it is missing: to_csv() raises when the parent
# folder does not exist, which would discard the trained predictions at the last step.
os.makedirs('../Sub', exist_ok=True)

# Write the final submission without the DataFrame index column.
submit.to_csv('../Sub/autogluon_stack3_new_fill.csv', index=False)