## Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
from datetime import datetime
from holidays import country_holidays
from xgboost import XGBRegressor 
import xgboost as xgb 
from tqdm import tqdm

import optuna 
from optuna.samplers import TPESampler

from supervised.automl import AutoML

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn import base
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [2]:
train_df = pd.read_csv('../../DATA/train.csv')
test_df  = pd.read_csv('../../DATA/test.csv')

In [24]:
test_df.head(30)
[9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]

Unnamed: 0,ID,timestamp,item,corp,loc,year,month,day,weekday,weekofyear,dayofyear,season,weekend
0,TG_A_J_20230304,2023-03-04,TG,A,J,2023,3,4,5,9,63,1,0
1,TG_A_J_20230305,2023-03-05,TG,A,J,2023,3,5,6,9,64,1,1
2,TG_A_J_20230306,2023-03-06,TG,A,J,2023,3,6,0,10,65,1,0
3,TG_A_J_20230307,2023-03-07,TG,A,J,2023,3,7,1,10,66,1,0
4,TG_A_J_20230308,2023-03-08,TG,A,J,2023,3,8,2,10,67,1,0
5,TG_A_J_20230309,2023-03-09,TG,A,J,2023,3,9,3,10,68,1,0
6,TG_A_J_20230310,2023-03-10,TG,A,J,2023,3,10,4,10,69,1,0
7,TG_A_J_20230311,2023-03-11,TG,A,J,2023,3,11,5,10,70,1,0
8,TG_A_J_20230312,2023-03-12,TG,A,J,2023,3,12,6,10,71,1,1
9,TG_A_J_20230313,2023-03-13,TG,A,J,2023,3,13,0,11,72,1,0


## Make Date Data

In [3]:
# 데이터 타입 변경, 열 이름 변경 

new_column_names = {
    'corporation': 'corp',
    'location': 'loc',
    'supply(kg)': 'supply',
    'price(원/kg)': 'price',
}
train_df = train_df.rename(columns=new_column_names)
test_df = test_df.rename(columns=new_column_names)
train_df['timestamp']  = pd.to_datetime(train_df['timestamp'])
test_df['timestamp']  = pd.to_datetime(test_df['timestamp'])


In [5]:
train_df['item_id'] = train_df.ID.str[:6 ]
train_df

Unnamed: 0,ID,timestamp,item,corp,loc,supply,price,item_id
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,TG_A_J
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,TG_A_J
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,TG_A_J
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,TG_A_J
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,TG_A_J
...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,RD_F_J
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,RD_F_J
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,RD_F_J
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,RD_F_J


In [9]:
domains = train_df[['item_id', 'item' ,'corp', 'loc']].drop_duplicates().reset_index(drop=True)
(domains)

Unnamed: 0,item_id,item,corp,loc
0,TG_A_J,TG,A,J
1,TG_A_S,TG,A,S
2,TG_B_J,TG,B,J
3,TG_B_S,TG,B,S
4,TG_C_J,TG,C,J
5,TG_C_S,TG,C,S
6,TG_D_J,TG,D,J
7,TG_D_S,TG,D,S
8,TG_E_J,TG,E,J
9,TG_E_S,TG,E,S


In [21]:
def get_date_info(data) :
    # data의 날짜 정보 추출하기 
    data['year'] = data['timestamp'].dt.year
    data['month'] = data['timestamp'].dt.month
    data['day'] = data['timestamp'].dt.day
    data['weekday'] = data['timestamp'].dt.weekday
    data['weekofyear'] = data['timestamp'].dt.isocalendar().week
    data['dayofyear'] = data['timestamp'].dt.dayofyear #해당 년도의 몇 일째 

def month_to_season(month):
    if month in [12, 1, 2]:
        return 0
    elif month in [3, 4, 5]:
        return 1
    elif month in [6, 7, 8]:
        return 2
    else:
        return 3

    
get_date_info(train_df)
get_date_info(test_df)

# 'month' 칼럼을 정수형으로 변환
train_df['season'] = train_df['month'].astype(int).apply(month_to_season)
test_df['season'] = test_df['month'].astype(int).apply(month_to_season)
test_df['weekend'] = test_df['weekday'].apply(lambda x : 1 if x==6  else 0 )

In [14]:
#known_covariates_names=["weekend", 'year', 'month', 'day', 'weekday', 'weekofyear', 'dayofyear' , 'season' ]
train_df_nodomain = train_df.drop(columns=['item', 'corp','item_id'])
train_df_nodomain

Unnamed: 0,ID,timestamp,loc,supply,price,year,month,day,weekday,weekofyear,dayofyear,season
0,TG_A_J_20190101,2019-01-01,J,0.0,0.0,2019,1,1,1,1,1,0
1,TG_A_J_20190102,2019-01-02,J,0.0,0.0,2019,1,2,2,1,2,0
2,TG_A_J_20190103,2019-01-03,J,60601.0,1728.0,2019,1,3,3,1,3,0
3,TG_A_J_20190104,2019-01-04,J,25000.0,1408.0,2019,1,4,4,1,4,0
4,TG_A_J_20190105,2019-01-05,J,32352.0,1250.0,2019,1,5,5,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,J,452440.0,468.0,2023,2,27,0,9,58,0
59393,RD_F_J_20230228,2023-02-28,J,421980.0,531.0,2023,2,28,1,9,59,0
59394,RD_F_J_20230301,2023-03-01,J,382980.0,574.0,2023,3,1,2,9,60,1
59395,RD_F_J_20230302,2023-03-02,J,477220.0,523.0,2023,3,2,3,9,61,1


In [19]:
train_df_nodomain['weekend'] = train_df_nodomain['weekday'].apply(lambda x : 1 if x == 6 else 0)
train_df_nodomain

Unnamed: 0,ID,timestamp,loc,supply,price,year,month,day,weekday,weekofyear,dayofyear,season,weekend
0,TG_A_J_20190101,2019-01-01,J,0.0,0.0,2019,1,1,1,1,1,0,0
1,TG_A_J_20190102,2019-01-02,J,0.0,0.0,2019,1,2,2,1,2,0,0
2,TG_A_J_20190103,2019-01-03,J,60601.0,1728.0,2019,1,3,3,1,3,0,0
3,TG_A_J_20190104,2019-01-04,J,25000.0,1408.0,2019,1,4,4,1,4,0,0
4,TG_A_J_20190105,2019-01-05,J,32352.0,1250.0,2019,1,5,5,1,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,J,452440.0,468.0,2023,2,27,0,9,58,0,0
59393,RD_F_J_20230228,2023-02-28,J,421980.0,531.0,2023,2,28,1,9,59,0,0
59394,RD_F_J_20230301,2023-03-01,J,382980.0,574.0,2023,3,1,2,9,60,1,0
59395,RD_F_J_20230302,2023-03-02,J,477220.0,523.0,2023,3,2,3,9,61,1,0


In [68]:
# 휴일 정보 추가하니 성능 더 하락 

years  = [2019,2020,2021,2022,2023 ]
holi_dates=[]
for year in years :
    holidayarr = country_holidays('KR', years = year).items()
    for day in holidayarr:
        holi_dates.append(day[0])
    
print(holi_dates[0])

train_df['holi'] = train_df['timestamp'].apply(lambda x: int(x in holi_dates))
test_df['holi'] = test_df['timestamp'].apply(lambda x: int(x in holi_dates))
train_df.loc[(train_df['holi'] == 0) & (train_df['weekday'] >= 6), 'holi'] = 1
test_df.loc[(train_df['holi'] == 0) & (test_df['weekday'] >= 6), 'holi'] = 1


2019-01-01


In [69]:
def trans_fourier(data) :
    #날짜
    data['sin_date'] = -np.sin(2 * np.pi * (data['month']+data['day']/31)/12)
    data['cos_date'] = -np.cos(2 * np.pi * (data['month']+data['day']/31)/12)

    #월
    data['sin_month'] = -np.sin(2 * np.pi * data['month']/12.0)
    data['cos_month'] = -np.cos(2 * np.pi * data['month']/12.0)
    
    data['sin_week'] = -np.sin(2 * np.pi * (data['weekofyear'])/52.0)
    data['cos_week'] = -np.cos(2 * np.pi * (data['weekofyear'])/52.0)

    #요일
    data['sin_dayofweek'] = -np.sin(2 * np.pi * (data['weekday']+1)/7.0)
    data['cos_dayofweek'] = -np.cos(2 * np.pi * (data['weekday']+1)/7.0)
    
    
trans_fourier(train_df)
trans_fourier(test_df)

# Supply Data , New feature

In [70]:
# 아이템, 유통회사, 지역 묶기 
def new_columns(data):
    data['item_corp_loc']= data['item'].astype(str).str.cat([data['corp'].astype(str), data['loc'].astype(str)], sep='')
    # data['item_corp']= data['item'].astype(str).str.cat([data['corp'].astype(str)], sep='')
    data['item_loc']= data['item'].astype(str).str.cat([data['loc'].astype(str)], sep='')
    data['item_loc'] =data['item_loc'].astype(str) 
    # data['item_corp'] = data['item_corp'].astype(str)
    data['item_corp_loc']= data['item_corp_loc'].astype(str)
    
    data['y_it_loc_m'] = data['item'].str.cat([data['loc'], data['year'].astype(str),data['month'].astype(str)], sep = '')

new_columns(train_df)
new_columns(test_df)

In [73]:
def fill_supply(train, test, start, end):
    train['timestamp'] = pd.to_datetime(train['timestamp'])  
    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)
    mask = (train['timestamp'] >= start_date) & (train['timestamp'] <= end_date)
    filtered_train_df = train.loc[mask]

    result = filtered_train_df.groupby('item_corp_loc')['supply'].median()
    result = result.reset_index().rename(columns={'supply': 'supply'})

    test['supply'] = 0 
    test = test.set_index('item_corp_loc')
    test.update(result.set_index('item_corp_loc'))
    test = test.reset_index()

    return test 

test_df = fill_supply(train_df, test_df, "2023-02-01","2023-03-03")


# Imputaion Of Data 

# Encoding Categorical Features

In [82]:

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
   
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       
        
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True)        
        col_mean_name = self.colnames + '_' + 'encod'
        X[col_mean_name] = np.nan       
        
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
            X[col_mean_name].fillna(mean_of_target, inplace = True)  
            
        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName,                                                                                    np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))

        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X

In [83]:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    def __init__(self,train,colNames,encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        mean = self.train[[self.colNames,
        self.encodedName]].groupby(
        self.colNames).mean().reset_index()
        dd = {}
        for row in (mean.itertuples(index=False)):
            dd[row[0]] = row[1]
            X[self.encodedName] = X[self.colNames]
            X[self.encodedName] = X[self.encodedName].map(dd.get)
        return X

In [90]:
for col in ['item_corp_loc', 'item_loc' ,'item' ,'corp', 'loc' ,'season','y_it_loc_m']:
    t_encoder = KFoldTargetEncoderTrain(colnames=col , targetName='price',n_fold=5)
    train_df  = t_encoder.fit_transform(train_df)
    
    newcolname = f'{col}_encod'
    
    test_t_encoder = KFoldTargetEncoderTest(train_df, colNames=col , encodedName=newcolname)
    test_df = test_t_encoder.transform(test_df)

Correlation between the new feature, item_corp_loc_encod and, price is 0.6327969090540054.
Correlation between the new feature, item_loc_encod and, price is 0.6139308527488362.
Correlation between the new feature, item_encod and, price is 0.6078361845777633.
Correlation between the new feature, corp_encod and, price is 0.14528901419694895.
Correlation between the new feature, loc_encod and, price is 0.040619659711968695.
Correlation between the new feature, season_encod and, price is 0.14288003987389114.
Correlation between the new feature, y_it_loc_m_encod and, price is 0.7579636381602503.


In [91]:
test_df

Unnamed: 0,item_corp_loc,ID,timestamp,item,corp,loc,year,month,day,weekday,...,item_loc,y_it_loc_m,supply,item_corp_loc_encod,item_loc_encod,item_encod,corp_encod,loc_encod,season_encod,y_it_loc_m_encod
0,TGAJ,TG_A_J_20230304,2023-03-04,TG,A,J,2023,3,4,5,...,TGJ,TGJ20233,10094.8,2567.187865,3006.697432,3177.328604,1141.662687,1062.335718,1531.249461,2580.009091
1,TGAJ,TG_A_J_20230305,2023-03-05,TG,A,J,2023,3,5,6,...,TGJ,TGJ20233,10094.8,2567.187865,3006.697432,3177.328604,1141.662687,1062.335718,1531.249461,2580.009091
2,TGAJ,TG_A_J_20230306,2023-03-06,TG,A,J,2023,3,6,0,...,TGJ,TGJ20233,10094.8,2567.187865,3006.697432,3177.328604,1141.662687,1062.335718,1531.249461,2580.009091
3,TGAJ,TG_A_J_20230307,2023-03-07,TG,A,J,2023,3,7,1,...,TGJ,TGJ20233,10094.8,2567.187865,3006.697432,3177.328604,1141.662687,1062.335718,1531.249461,2580.009091
4,TGAJ,TG_A_J_20230308,2023-03-08,TG,A,J,2023,3,8,2,...,TGJ,TGJ20233,10094.8,2567.187865,3006.697432,3177.328604,1141.662687,1062.335718,1531.249461,2580.009091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,RDFJ,RD_F_J_20230327,2023-03-27,RD,F,J,2023,3,27,0,...,RDJ,RDJ20233,425040.0,259.681858,252.011007,187.077466,259.133976,1062.335718,1531.249461,476.089610
1088,RDFJ,RD_F_J_20230328,2023-03-28,RD,F,J,2023,3,28,1,...,RDJ,RDJ20233,425040.0,259.681858,252.011007,187.077466,259.133976,1062.335718,1531.249461,476.089610
1089,RDFJ,RD_F_J_20230329,2023-03-29,RD,F,J,2023,3,29,2,...,RDJ,RDJ20233,425040.0,259.681858,252.011007,187.077466,259.133976,1062.335718,1531.249461,476.089610
1090,RDFJ,RD_F_J_20230330,2023-03-30,RD,F,J,2023,3,30,3,...,RDJ,RDJ20233,425040.0,259.681858,252.011007,187.077466,259.133976,1062.335718,1531.249461,476.089610


In [10]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corp', 'loc']

def encode_cols_v1(data,data2, cat_name , target , weight) :
    mean = data[target].mean()
    agg = data.groupby(cat_name)[target].agg(['count','mean'])
    counts =agg['count']
    means  =agg['mean']
    
    smooth = (counts*means + weight*mean)/ (counts+weight)
    data[cat_name] = data[cat_name].map(smooth) 
    data2[cat_name] = data2[cat_name].map(smooth)
    
    return data, data2 
    


for i in qual_col:= -1,total_time_limit=

In [52]:
# scaler로 스케일링 해주기 


In [92]:
train_df.columns

Index(['ID', 'timestamp', 'item', 'corp', 'loc', 'supply', 'price', 'year',
       'month', 'day', 'weekday', 'weekofyear', 'dayofyear', 'season', 'holi',
       'sin_date', 'cos_date', 'sin_month', 'cos_month', 'sin_week',
       'cos_week', 'sin_dayofweek', 'cos_dayofweek', 'item_corp_loc',
       'item_loc', 'y_it_loc_m', 'item_corp_loc_encod', 'item_loc_encod',
       'item_encod', 'corp_encod', 'loc_encod', 'season_encod',
       'y_it_loc_m_encod'],
      dtype='object')

In [93]:
train_df #item', 'corp', 'loc''item_corp_loc',       'item_loc', 'y_it_loc_m',

Unnamed: 0,ID,timestamp,item,corp,loc,supply,price,year,month,day,...,item_corp_loc,item_loc,y_it_loc_m,item_corp_loc_encod,item_loc_encod,item_encod,corp_encod,loc_encod,season_encod,y_it_loc_m_encod
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,...,TGAJ,TGJ,TGJ20191,2633.087276,3012.818407,3178.322804,1126.735991,1065.404933,995.909264,1375.545455
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2,...,TGAJ,TGJ,TGJ20191,2633.087276,3016.098201,3178.322804,1137.849658,1065.404933,995.909264,1375.545455
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3,...,TGAJ,TGJ,TGJ20191,2633.087276,3012.818407,3166.733146,1137.849658,1062.415837,995.909264,1353.169492
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4,...,TGAJ,TGJ,TGJ20191,2592.570502,2997.932320,3178.322804,1156.972594,1056.587411,995.909264,1353.169492
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5,...,TGAJ,TGJ,TGJ20191,2490.053719,3016.098201,3166.733146,1148.000642,1065.404933,982.853998,1375.545455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,27,...,RDFJ,RDJ,RDJ20232,257.356220,253.231214,187.368302,257.593344,1064.215911,982.853998,404.445652
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,28,...,RDFJ,RDJ,RDJ20232,268.434709,252.056015,187.368302,258.532312,1063.117546,982.853998,345.195652
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,1,...,RDFJ,RDJ,RDJ20233,253.277461,252.056015,189.714844,258.532312,1064.215911,1547.448354,442.750000
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,2,...,RDFJ,RDJ,RDJ20233,257.356220,252.056015,189.714844,258.532312,1056.587411,1547.448354,442.750000


In [97]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_df.drop(columns=['ID','season', 'timestamp', 'price', 'day', 'month', 'weekday','item', 'corp', 'loc','item_corp_loc', 'item_loc', 'y_it_loc_m',])
train_y = train_df['price']
test_x = test_df.drop(columns=['ID', 'season','timestamp' ,  'day', 'month', 'weekday', 'item', 'corp', 'loc','item_corp_loc',   'item_loc', 'y_it_loc_m'])

In [101]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59397 entries, 0 to 59396
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   supply               59397 non-null  float64
 1   year                 59397 non-null  int32  
 2   weekofyear           59397 non-null  UInt32 
 3   dayofyear            59397 non-null  int32  
 4   holi                 59397 non-null  int64  
 5   sin_date             59397 non-null  float64
 6   cos_date             59397 non-null  float64
 7   sin_month            59397 non-null  float64
 8   cos_month            59397 non-null  float64
 9   sin_week             59397 non-null  Float64
 10  cos_week             59397 non-null  Float64
 11  sin_dayofweek        59397 non-null  float64
 12  cos_dayofweek        59397 non-null  float64
 13  item_corp_loc_encod  59397 non-null  float64
 14  item_loc_encod       59397 non-null  float64
 15  item_encod           59397 non-null 

# AutoMl  적용

In [None]:
# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
# autogluon-common 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# autogluon-core 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# autogluon-features 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# autogluon-multimodal 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# autogluon-tabular 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# autogluon-timeseries 0.8.2 requires pandas<1.6,>=1.4.1, but you have pandas 2.1.2 which is incompatible.
# mlflow 1.30.1 requires pandas<2, but you have pandas 2.1.2 which is incompatible.
# pycaret 3.1.0 requires pandas<2.0.0,>=1.3.0, but you have pandas 2.1.2 which is incompatible.
# sktime 0.21.1 requires pandas<2.1.0,>=1.1.0, but you have pandas 2.1.2 which is incompatible.
# ydata-profiling 4.6.0 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.1.2 which is incompatible.
# ydata-profiling 4.6.0 requires typeguard<5,>=4.1.2, but you have typeguard 2.13.3 which is incompatible.

In [102]:
import pandas as pd

print(pd.__version__)

2.1.2


In [103]:
Cross_validation = {
    "validation_type": "kfold",
    "k_folds": 5,
    "shuffle": True,
    "random_seed": 38 #24
}

In [104]:
total_time_limit = 60*60*4
automl = AutoML(mode="Compete", n_jobs = -1,total_time_limit=total_time_limit, eval_metric="mse", validation_strategy=Cross_validation)
automl.fit(train_x, train_y)

Linear algorithm was disabled.
AutoML directory: AutoML_5
The task is regression with evaluation metric mse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree mse 505562.121325 trained in 1.39 seconds
2_DecisionTree mse 416202.886068 trained in 1.32 seconds
3_DecisionTree mse 416202.886068 trained in 1.39 seconds
* Step default_algorithms will try to check up to 7 models
4_Default_LightGBM mse 169879.004731 trained in 6.86 seconds
5_Default_Xgboost mse 208797.200515 trained in 8.39 seconds
There was 



None 10
Add Golden Feature: item_corp_loc_encod_ratio_supply
Add Golden Feature: item_encod_ratio_supply
Add Golden Feature: item_loc_encod_ratio_supply
Add Golden Feature: y_it_loc_m_encod_ratio_supply
Add Golden Feature: supply_ratio_item_encod
Add Golden Feature: supply_ratio_y_it_loc_m_encod
Add Golden Feature: supply_ratio_item_corp_loc_encod
Add Golden Feature: supply_ratio_item_loc_encod
Add Golden Feature: supply_ratio_cos_week
Add Golden Feature: supply_ratio_cos_month
Created 10 Golden Features in 3.38 seconds.
20_LightGBM_GoldenFeatures mse 178211.165592 trained in 10.39 seconds
4_Default_LightGBM_GoldenFeatures mse 177707.9508 trained in 6.84 seconds
21_LightGBM_GoldenFeatures mse 176986.845792 trained in 6.35 seconds
* Step kmeans_features will try to check up to 3 models
20_LightGBM_KMeansFeatures mse 173655.910581 trained in 12.58 seconds
4_Default_LightGBM_KMeansFeatures mse 172473.560799 trained in 12.41 seconds
21_LightGBM_KMeansFeatures mse 172271.693253 trained in 1



20_LightGBM_RandomFeature mse 175199.062707 trained in 24.64 seconds
Drop features ['season_encod', 'random_feature', 'sin_month', 'cos_month']
* Step features_selection will try to check up to 5 models
20_LightGBM_SelectedFeatures mse 168180.609345 trained in 8.46 seconds
17_Xgboost_SelectedFeatures mse 196739.934977 trained in 18.3 seconds
40_RandomForest_SelectedFeatures mse 256362.997748 trained in 6.74 seconds
59_NeuralNetwork_SelectedFeatures mse 641611.803349 trained in 32.01 seconds
50_ExtraTrees_SelectedFeatures mse 798971.401699 trained in 4.57 seconds
* Step hill_climbing_1 will try to check up to 27 models
63_LightGBM_SelectedFeatures mse 168158.156343 trained in 9.27 seconds
64_LightGBM_SelectedFeatures mse 169841.914583 trained in 8.24 seconds
65_LightGBM mse 168371.896714 trained in 9.73 seconds
66_LightGBM mse 171702.884686 trained in 7.96 seconds
67_LightGBM mse 170772.76234 trained in 10.74 seconds
68_LightGBM mse 170471.203322 trained in 10.1 seconds
69_Xgboost_Selec

In [105]:
test_x =test_x.reindex(columns=train_x.columns)

In [106]:
pred = automl.predict(test_x)

In [107]:
pred

array([2690.12524875, 2397.60277252, 2657.69027763, ...,  572.18713662,
        570.15458834,  564.0366915 ])

In [15]:

#Validation : MAE: 729715.7680863414,  Best score: 856.2573364295463,  Best trial: {'lambda': 0.06858310598122817, 'alpha': 0.7713413681986561, 'colsample_bytree': 0.9442346449671033, 'subsample': 0.9750961748574901, 'learning_rate': 0.076433115411846, 'n_estimators': 446, 'max_depth': 8, 'min_child_weight': 16}





model_xgb = RandomForestRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=42)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(train_x), total=5, desc="Processing folds"):
    X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_squared_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = model_xgb.predict(test_x)
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))


TypeError: RandomForestRegressor.__init__() got an unexpected keyword argument 'lambda'

In [16]:


#Validation : MAE: 729715.7680863414,  Best score: 856.2573364295463,  Best trial: {'lambda': 0.06858310598122817, 'alpha': 0.7713413681986561, 'colsample_bytree': 0.9442346449671033, 'subsample': 0.9750961748574901, 'learning_rate': 0.076433115411846, 'n_estimators': 446, 'max_depth': 8, 'min_child_weight': 16}

#encod_col_v1 : Validation : MAE: 722972.6923, Best score: 864.4005832556281, Best trial: {'lambda': 0.07246551885960752, 'alpha': 0.5233846374744797, 'colsample_bytree': 0.9008060223667523, 'subsample': 0.9996254959269878, 'learning_rate': 0.061161155234371184, 'n_estimators': 753, 'max_depth': 8, 'min_child_weight': 23}


model_xgb = XGBRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=42)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(train_x), total=5, desc="Processing folds"):
    X_t, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(mean_squared_error(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = model_xgb.predict(test_x)
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))


Processing folds: 100%|██████████| 5/5 [00:33<00:00,  6.63s/it]

Validation : MAE scores for each fold: [2.7672175331040094, 2.6085926305807012, 2.600025852176349, 2.737986521970589, 2.4819060995110966]
Validation : MAE: 2.639145727468549





## Inference

In [108]:
final_predictions = np.round(pred, 2)
final_predictions

array([2690.13, 2397.6 , 2657.69, ...,  572.19,  570.15,  564.04])

## Submission

In [109]:
submission = pd.read_csv('../../DATA/sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [115]:
submission['answer'] = final_predictions
submission.loc[submission.index % 7 == 1, 'answer'] = 0

In [117]:
submission.loc[submission['answer'] <= 0, 'answer'] = 0


In [118]:
submission.to_csv('../../DATA/SUBMIT/new_automl_sub.csv', index=False)


In [38]:
sub = pd.read_csv('../../DATA/SUBMIT/automl_sub.csv')
sub.head(20)

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3414.06
1,TG_A_J_20230305,7.18
2,TG_A_J_20230306,3680.06
3,TG_A_J_20230307,3660.65
4,TG_A_J_20230308,3691.81
5,TG_A_J_20230309,3720.74
6,TG_A_J_20230310,3838.66
7,TG_A_J_20230311,3436.29
8,TG_A_J_20230312,5.01
9,TG_A_J_20230313,3662.43


In [40]:
sub.loc[sub.index % 7 == 1, 'answer'] = 0


Unnamed: 0,ID,answer
0,TG_A_J_20230304,3414.06
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,3680.06
3,TG_A_J_20230307,3660.65
4,TG_A_J_20230308,3691.81
5,TG_A_J_20230309,3720.74
6,TG_A_J_20230310,3838.66
7,TG_A_J_20230311,3436.29
8,TG_A_J_20230312,0.0
9,TG_A_J_20230313,3662.43


In [41]:
sub.to_csv('../../DATA/SUBMIT/sundayzero_automl_sub.csv', index=False)