In [None]:
# Mount Google Drive at /content/drive so the data files referenced by later cells are reachable.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install autogluon
!pip install mxnet~=1.9



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import holidays
from autogluon.tabular import TabularDataset, TabularPredictor

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Base directory on the mounted Drive: the input CSVs are read from here and the
# AutoGluon output folder and submission file are written here by later cells.
path = '/content/drive/MyDrive/DACON/jeju_price/'

In [None]:
# Read the three input tables from `path`, in the same order as before.
# `international_trade` is loaded but not referenced by any other visible cell.
train, international_trade, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'international_trade.csv', 'test.csv')
)

In [None]:
def group_season(df):
    """Write a 'season' column into ``df`` based on its 'month' column.

    Months 3-5, 6-8, 9-11 and 12/1/2 are labelled '봄', '여름', '가을' and '겨울'
    respectively. Rows whose month matches none of these keep whatever value
    the 'season' column already had (NaN if the column is newly created).

    The frame is modified in place; the resulting 'season' column is returned.
    """
    months_to_season = (
        ([3, 4, 5], '봄'),
        ([6, 7, 8], '여름'),
        ([9, 10, 11], '가을'),
        ([12, 1, 2], '겨울'),
    )
    for months, season_label in months_to_season:
        in_season = df['month'].isin(months)
        df.loc[in_season, 'season'] = season_label
    return df['season']

def holiday(df):
    """Write a 'holiday' column into ``df`` and return it.

    Each value of ``df['timestamp']`` is looked up in the ``holidays.KR()``
    calendar: members are labelled 'holiday', everything else 'non-holiday'.
    The frame is modified in place.
    """
    kr_calendar = holidays.KR()

    def to_label(ts):
        # Membership test is delegated to the holidays calendar object.
        if ts in kr_calendar:
            return 'holiday'
        return 'non-holiday'

    df['holiday'] = df['timestamp'].apply(to_label)
    return df['holiday']

def cyclical_feature(df, time=12):
    """Add sine/cosine encodings of ``df['month']`` to ``df`` in place.

    The month is mapped onto a circle with period ``time`` (12 by default) and
    stored as the columns 'sin_time' and 'cos_time'. Nothing is returned.
    """
    angle = 2 * np.pi * df['month'] / time
    df['sin_time'] = np.sin(angle)
    df['cos_time'] = np.cos(angle)

def post_preprocessing(test, submission):
    """Apply the final business rules to ``submission['answer']`` in place.

    1. Rows whose index label matches a ``test`` row with ``Weekday == 6``
       (Sunday) get a price of 0. The match is by index label, so both frames
       are expected to share the same index.
    2. Any remaining value that is not strictly positive becomes 0.

    Returns the same ``submission`` object that was passed in.
    """
    is_sunday = test['Weekday'] == 6
    sunday_labels = test.loc[is_sunday].index
    submission.loc[sunday_labels, 'answer'] = 0  # Weekday == 6 (Sunday): price is 0 won

    def clamp_at_zero(price):
        # Same result as max(0, price): anything not greater than 0 becomes 0.
        return price if price > 0 else 0

    submission['answer'] = submission['answer'].apply(clamp_at_zero)  # negative prices become 0 won
    return submission



# Main harvest windows per item code, as inclusive (start_month, end_month) pairs.
# A pair whose start month is greater than its end month wraps around the year end.
_HARVEST_TIMES = {
    'TG': {'main': [(10, 1)]},  # citrus: October through January of the following year
    'BC': {'main': [(4, 6), (9, 11)]},  # broccoli: April-June, September-November
    # radish. NOTE(review): the earlier note said "May, November", but the windows
    # cover May-June and November-December; confirm which one is intended.
    'RD': {'main': [(5, 6), (11, 12)]},
    # carrot. NOTE(review): the earlier note said "July-August, October-December",
    # but the second window ends in November; confirm which one is intended.
    'CR': {'main': [(7, 8), (10, 11)]},
    'CB': {'main': [(6, 6), (11, 11)]},  # cabbage: June, November
}


def determine_harvest_weight(item, month):
    """Return 1 if ``month`` lies in a main harvest window of ``item``, else 0.

    Parameters
    ----------
    item : str
        Item code; must be a key of ``_HARVEST_TIMES``.
    month : int
        Calendar month, 1-12.

    Raises
    ------
    KeyError
        If ``item`` has no entry in the harvest table.
    """
    for start, end in _HARVEST_TIMES[item]['main']:
        if start <= end:
            in_window = start <= month <= end
        else:
            # Window crosses the year boundary (e.g. October -> January); a plain
            # ``start <= month <= end`` test can never succeed for such a pair.
            in_window = month >= start or month <= end
        if in_window:
            return 1
    return 0

In [None]:
class DataPreprocessing:
    """Feature engineering for the train and test price tables.

    ``fit`` runs the whole pipeline: it derives time and item features on both
    frames, zeroes Sunday prices in train, separates features from the target
    and label-encodes the categorical columns.

    The frames handed to the constructor are modified in place by
    ``preprocessing`` (new columns are added and 'timestamp' is converted to
    datetime).
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    @staticmethod
    def label_encode(train, test):
        """Integer-encode the categorical columns of both frames in place.

        Each encoder is fitted on the values of train and test together, so a
        category that only occurs in test is encoded instead of raising
        ``ValueError``. When every test category also occurs in train the
        codes are the same as those from fitting on train alone.

        Returns the two (modified) frames as ``(train, test)``.
        """
        categorical_col = ['item', 'corporation', 'location', 'season', 'holiday', 'total_item_value',
                           'item_month_Weekday', 'item_corp_Weekday', 'item_location_Weekday', 'item_year_season', 'item_weight']

        for col in categorical_col:
            le = LabelEncoder()
            le.fit(pd.concat([train[col], test[col]], ignore_index=True))
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

        return train, test

    @staticmethod
    def remove_outliers(train):
        """Set the price to 0 on Sunday rows and return the frame.

        Despite the name, no rows are dropped: rows with ``Weekday == 6`` and a
        price that is ``>= 0`` get their 'price(원/kg)' overwritten with 0, in
        place.
        """
        print('Remove outliers')
        sunday_with_price = (train['Weekday'] == 6) & (train['price(원/kg)'] >= 0)
        train.loc[sunday_with_price, 'price(원/kg)'] = 0
        return train

    @staticmethod
    def preprocessing(data):
        """Add the engineered feature columns to ``data`` in place and return it.

        'timestamp' may hold date strings or already-parsed datetimes; it is
        parsed once and every time feature is derived from the parsed values.
        """
        print('Preprocessing Start')
        timestamps = pd.to_datetime(data['timestamp'])

        # time feature
        data['year'] = timestamps.dt.year - 2019  # years elapsed since 2019
        data['month'] = timestamps.dt.month
        data['Weekday'] = timestamps.dt.weekday  # Monday=0 ... Sunday=6
        # Only Weekday 6 (Sunday) is flagged; Saturday (5) is not.
        # NOTE(review): confirm this is intended given the column name.
        data['is_weekend'] = (data['Weekday'] >= 6).astype(int)
        data['season'] = group_season(data)
        # holiday() still sees the 'timestamp' column in its incoming form here.
        data['holiday'] = holiday(data)
        cyclical_feature(data)

        # item feature
        data['total_item_value'] = data['item'] + data['corporation'] + data['location']
        item_prefix = data['item'].astype(str) + "_"
        weekday_text = data['Weekday'].astype(str)
        data['item_month_Weekday'] = item_prefix + data['month'].astype(str) + weekday_text
        data['item_corp_Weekday'] = item_prefix + data['corporation'].astype(str) + weekday_text
        data['item_location_Weekday'] = item_prefix + data['location'].astype(str) + weekday_text
        data['item_year_season'] = item_prefix + data['year'].astype(str) + "_" + data['season'].astype(str)

        data['timestamp'] = timestamps
        # 1 when the row's month is inside a main harvest window of its item, else 0.
        data['harvest_weight'] = [
            determine_harvest_weight(item, month)
            for item, month in zip(data['item'], timestamps.dt.month)
        ]

        data['item_weight'] = item_prefix + data['harvest_weight'].astype(str)
        return data

    def fit(self):
        """Run the full pipeline and return ``(x_train, y_train, x_test)``.

        ``x_train`` is the processed train frame without 'ID', 'supply(kg)' and
        the target 'price(원/kg)'; ``y_train`` is that target column; ``x_test``
        is the processed test frame without 'ID'. Categorical columns of both
        feature frames are label-encoded.
        """
        self.train = self.preprocessing(self.train)
        self.test = self.preprocessing(self.test)

        self.train = self.remove_outliers(self.train)

        x_train = self.train.drop(columns=['ID', 'supply(kg)', 'price(원/kg)'])
        y_train = self.train['price(원/kg)']
        x_test = self.test.drop(columns=['ID'])

        x_train, x_test = self.label_encode(x_train, x_test)

        return x_train, y_train, x_test

In [None]:
# Run the feature pipeline. NOTE: this rebinds `test` to the processed, label-encoded
# test features, which is what the later cells expect.
preprocessing = DataPreprocessing(train, test)
x, y, test = preprocessing.fit()
# `train_set` and the hold-out split below are not referenced by any other visible cell;
# they are kept in case something outside this view relies on them.
train_set = pd.concat([x, y], axis=1)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1103)
train_autogluon = pd.concat([x, y], axis=1)

# Target encoding: for each categorical key, attach the mean and the sample standard
# deviation (ddof=1) of the training price within that category to both frames.
# The statistics are computed once per category and joined with `map`, rather than
# rescanning the lookup table for every row. A category with no training rows, or
# with a single training row for the std, gets NaN instead of raising.
price_col = 'price(원/kg)'
cat_mean_col = ['total_item_value', 'item_month_Weekday', 'item_corp_Weekday', 'item_location_Weekday', 'item_year_season']
for cat_col in cat_mean_col:
    price_by_cat = train_autogluon.groupby(cat_col)[price_col]
    category_stats = (('mean', price_by_cat.mean()), ('std', price_by_cat.std()))
    for stat_name, stat_by_cat in category_stats:
        train_autogluon[f'{cat_col}_{stat_name}'] = train_autogluon[cat_col].map(stat_by_cat)
        test[f'{cat_col}_{stat_name}'] = test[cat_col].map(stat_by_cat)

# Turn the datetime column into a plain number for the model.
# `.astype('int64')` yields the same integers as the deprecated `Series.view('int64')`.
# NOTE(review): multiplying the epoch integers by 1e9 (rather than dividing) looks
# unintended; it is kept so the feature values stay exactly as before.
train_autogluon['timestamp'] = train_autogluon['timestamp'].astype('int64') * 1e9
test['timestamp'] = test['timestamp'].astype('int64') * 1e9

Preprocessing Start
Preprocessing Start
Remove outliers


In [None]:
# Per-model settings handed to TabularPredictor.fit in a later cell.
# Every entry asks for GPU execution ('device': 'gpu', 'task_type': 'GPU', 'gpu_hist').
# NOTE(review): 'GBM' / 'CAT' / 'XGB' are presumably AutoGluon's keys for LightGBM,
# CatBoost and XGBoost; confirm against the installed AutoGluon version.
hyperparameters = {
    'GBM': [
        # Variant with extra_trees enabled; 'XT' is appended to the model name.
        {'device': 'gpu', 'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        {'device': 'gpu'},
        # Named configuration given as a string rather than a dict of options.
        'GBMLarge'
    ],
    'CAT': {'task_type': 'GPU'},
    'XGB': {'tree_method': 'gpu_hist'}
}

In [None]:
# Train the AutoGluon predictor on the engineered training frame; models are
# written under `path + 'auto_result'`.
label = 'price(원/kg)'
fit_options = dict(
    ag_args_fit={'num_gpus': 1},
    hyperparameters=hyperparameters,
    time_limit=3600,  # presumably seconds (one hour); confirm in the AutoGluon docs
    presets='best_quality',
)
predictor = TabularPredictor(label=label, path=path + 'auto_result').fit(train_autogluon, **fit_options)

In [None]:
# Predict prices for the processed test features with the fitted predictor.
predictions = predictor.predict(test)

In [None]:
# Show the raw predictions. They can be negative (see the output below); negatives are
# clamped to 0 by post_preprocessing before the submission is written.
predictions

0       2660.687988
1       -248.303879
2       4367.434082
3       3791.107178
4       3316.834961
           ...     
1087     569.025024
1088     514.480469
1089     453.412231
1090     462.447937
1091     488.345642
Name: price(원/kg), Length: 1092, dtype: float32

In [None]:
# Fill the sample submission with the rounded predictions (aligned on index),
# apply the Sunday / non-negative rules, and write the result next to the inputs.
submission = pd.read_csv(path + 'sample_submission.csv').assign(answer=np.round(predictions))
submission = post_preprocessing(test, submission)
submission.to_csv(path + 'auto_submission_feature4.csv', index=False)