In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored on Drive can be read and written through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install sktime optuna

In [None]:
import sys
import sktime
import tqdm as tq
import xgboost as xgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import mean_squared_error

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import holidays

# Show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/xgboost.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Base folder on the mounted Drive; every CSV read or written by this notebook
# is addressed as `path + <file name>`.
path = '/content/drive/MyDrive/DACON/jeju_price/'

In [None]:
# Read the raw train/test tables from the data folder.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

# Rename the columns to English names. The test table gets the five shared
# columns; the train table gets the same five followed by 'supply' and 'price'.
# NOTE(review): assumes the CSV columns come in exactly this order - confirm.
cols = ['id', 'date_time', 'item', 'corporation', 'location']
train.columns = cols + ['supply', 'price']
test.columns = cols


def group_season(df):
    """Write a Korean season label into ``df['season']`` based on ``df['month']``.

    Months 3-5 are labelled '봄' (spring), 6-8 '여름' (summer),
    9-11 '가을' (autumn) and 12, 1, 2 '겨울' (winter). The frame is modified
    in place; rows whose month matches none of these are not assigned.

    Returns:
        The ``season`` column of ``df``.
    """
    months_by_season = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
        ('겨울', (12, 1, 2)),
    )
    for label, months in months_by_season:
        in_season = df['month'].isin(months)
        df.loc[in_season, 'season'] = label
    return df['season']

def holiday(df):
    """Flag each row of ``df`` as 'holiday' or 'non-holiday'.

    Every value of ``df['date_time']`` is looked up in the South Korean
    calendar from ``holidays.KR()``. The result is stored in ``df['holiday']``
    (the frame is modified in place) and that column is returned.
    """
    kr_calendar = holidays.KR()

    def _label(day):
        if day in kr_calendar:
            return 'holiday'
        return 'non-holiday'

    df['holiday'] = df['date_time'].apply(_label)
    return df['holiday']

def cyclical_feature(df, time=12):
    """Add sine/cosine encodings of ``df['month']`` to ``df`` in place.

    Args:
        df: frame with a numeric ``month`` column.
        time: length of one full cycle (12 by default, i.e. one year of months).

    Writes the columns ``sin_time`` and ``cos_time``; returns nothing.
    """
    angle = 2 * np.pi * df['month'] / time
    df['sin_time'] = np.sin(angle)
    df['cos_time'] = np.cos(angle)

def post_preprocessing(test, submission):
    """Apply the fixed post-prediction rules to ``submission['answer']``.

    Rows of ``submission`` whose index label belongs to a ``test`` row with
    ``Weekday == 6`` (Sunday) are set to 0, then every remaining negative
    answer is raised to 0. ``submission`` is modified in place and returned.
    """
    # Index labels of the Sunday rows; the price is forced to 0 there.
    sunday_rows = test.index[test['Weekday'] == 6]
    submission.loc[sunday_rows, 'answer'] = 0

    # A price cannot be negative: replace negative predictions with 0.
    def _floor_at_zero(value):
        return max(0, value)

    submission['answer'] = submission['answer'].apply(_floor_at_zero)
    return submission



# Main harvest windows per item code as inclusive (start_month, end_month)
# pairs. A pair whose start is greater than its end wraps around the year end.
# Built once at import time instead of on every call.
# NOTE(review): the original inline comments for 'RD' ("May, November") and
# 'CR' ("October-December") do not match the ranges below - confirm which is
# intended; the ranges are kept exactly as they were.
_HARVEST_TIMES = {
    'TG': {'main': [(10, 1)]},             # tangerine: October through January
    'BC': {'main': [(4, 6), (9, 11)]},     # broccoli: April-June, September-November
    'RD': {'main': [(5, 6), (11, 12)]},    # radish: May-June, November-December
    'CR': {'main': [(7, 8), (10, 11)]},    # carrot: July-August, October-November
    'CB': {'main': [(6, 6), (11, 11)]},    # cabbage: June, November
}


def determine_harvest_weight(item, month):
    """Return 1 if ``month`` falls in a main harvest window of ``item``, else 0.

    Args:
        item: item code; must be a key of ``_HARVEST_TIMES``.
        month: calendar month number (1-12).

    Raises:
        KeyError: if ``item`` is not a known item code.
    """
    for start, end in _HARVEST_TIMES[item]['main']:
        if start <= end:
            in_window = start <= month <= end
        else:
            # Window crosses the year boundary (e.g. October -> January);
            # a plain `start <= month <= end` can never be true for it.
            in_window = month >= start or month <= end
        if in_window:
            return 1
    return 0

class DataPreprocessing:
    """Feature-engineering pipeline for the train and test frames.

    ``fit()`` runs ``preprocessing`` on both frames, zeroes Sunday prices in
    the training frame, label-encodes the categorical columns and returns the
    two processed frames. The frames passed to the constructor are modified
    in place.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    @staticmethod
    def label_encode(train, test):
        """Integer-encode the categorical columns of both frames.

        For each column a ``LabelEncoder`` is fitted on the train values and
        then used to transform the test values, so a test value that never
        occurs in train makes ``transform`` fail.

        Returns:
            The ``(train, test)`` frames with the columns replaced by codes.
        """
        categorical_col = ['item', 'corporation', 'location', 'season', 'holiday',
                           'item_month_Weekday', 'item_corp_Weekday', 'item_location_Weekday', 'item_year_season', 'item_weight']

        for i in categorical_col:
            le = LabelEncoder()
            train[i] = le.fit_transform(train[i])
            test[i] = le.transform(test[i])

        return train, test

    @staticmethod
    def remove_outliers(train):
        """Set ``price`` to 0 on every row with ``Weekday == 6`` (Sunday)."""
        print('Remove outliers')
        train.loc[(train['Weekday'] == 6) & (train['price'] >= 0), 'price'] = 0
        return train

    @staticmethod
    def preprocessing(data):
        """Add calendar and item-combination features to ``data`` in place.

        Expects ``date_time`` as a 'YYYY-MM-DD...' string on entry (year and
        month are sliced out of it) and converts it to datetime part-way
        through. Returns the same frame.
        """
        print('Preprocessing Start')
        # time feature
        data['year'] = data['date_time'].apply(lambda x: int(x[0:4]))
        data['month'] = data['date_time'].apply(lambda x: int(x[5:7]))
        data['Weekday'] = pd.to_datetime(data['date_time']).dt.weekday
        # NOTE(review): `>= 6` flags only Sunday (weekday 6); Saturday
        # (weekday 5) is not counted as weekend - confirm this is intended.
        data['is_weekend'] = data['Weekday'].apply(lambda x: 1 if x >= 6 else 0)
        data['year'] = data['year'] - 2019  # year as an offset from 2019
        data['season'] = group_season(data)
        data['holiday'] = holiday(data)
        cyclical_feature(data)

        # item feature: string keys combining the item with other columns
        data['total_item_value'] = data['item']+ '_' + data['corporation']+ '_' +data['location']
        data['item_month_Weekday'] = data['item'].astype(str) + "_" + data['month'].astype(str) + data['Weekday'].astype(str)
        data['item_corp_Weekday'] = data['item'].astype(str) + "_" + data['corporation'].astype(str) + data['Weekday'].astype(str)
        data['item_location_Weekday'] = data['item'].astype(str) + "_" + data['location'].astype(str) + data['Weekday'].astype(str)
        data['item_year_season'] = data['item'].astype(str) + "_" + data['year'].astype(str) + "_" + data['season'].astype(str)

        # From here on date_time is a datetime column; derive calendar flags.
        data['date_time'] = pd.to_datetime(data['date_time'])
        data['days_in_month'] = data['date_time'].dt.days_in_month
        data['is_month_start'] = data['date_time'].dt.is_month_start
        data['is_month_end'] = data['date_time'].dt.is_month_end
        data['is_quarter_start'] = data['date_time'].dt.is_quarter_start
        data['is_quarter_end'] = data['date_time'].dt.is_quarter_end
        data['is_year_start'] = data['date_time'].dt.is_year_start
        data['is_year_end'] = data['date_time'].dt.is_year_end
        data['is_leap_year'] = data['date_time'].dt.is_leap_year
        data['dayofyear'] = data['date_time'].dt.dayofyear
        # 1 when the row's month is inside a main harvest window of its item.
        data['harvest_weight'] = data.apply(lambda row: determine_harvest_weight(row['item'], row['date_time'].month), axis=1)

        data['item_weight'] = data['item'].astype(str) + "_" + data['harvest_weight'].astype(str)
        return data

    def fit(self):
        """Run the full pipeline and return the processed ``(train, test)``.

        Order matters: Sunday prices are zeroed while ``Weekday`` is available,
        and label encoding runs last because ``preprocessing`` builds its
        combined keys from the original string values.
        """
        self.train = self.preprocessing(self.train)
        self.test = self.preprocessing(self.test)

        self.train = self.remove_outliers(self.train)

        self.train, self.test = self.label_encode(self.train, self.test)

        # Return the frames held by this instance. The previous code returned
        # the module-level names `train`/`test`, which only worked when those
        # globals happened to be the very objects passed to the constructor.
        return self.train, self.test

# Run the feature-engineering pipeline on both frames.
preprocessing = DataPreprocessing(train, test)
train, test = preprocessing.fit()

# The original cell registered tqdm's pandas integration; keep doing so once
# in case DataFrame.progress_apply is used elsewhere.
tqdm.pandas()

# Target encoding: for every key column, attach the mean and standard
# deviation of the TRAIN price within each key value to both frames.
# The statistics are computed once per key with groupby and joined back with
# Series.map, instead of scanning the whole statistics table once per row.
# `std` here is pandas' built-in group std, which is what passing np.std to
# pivot_table resolved to on pandas versions that map it to the built-in.
# A key value with no statistic (unseen in train, or a single-row group for
# std) now yields NaN rather than raising IndexError.
cat_mean_col = ['total_item_value', 'item_month_Weekday', 'item_corp_Weekday', 'item_location_Weekday', 'item_year_season']
for cat_col in cat_mean_col:
    price_stats = train.groupby(cat_col)['price'].agg(['mean', 'std'])
    for stat in ('mean', 'std'):
        train[f'{cat_col}_{stat}'] = train[cat_col].map(price_stats[stat])
        test[f'{cat_col}_{stat}'] = test[cat_col].map(price_stats[stat])

train.head()

In [None]:
def _add_week_mean_encoding(train_df, test_df, group_cols, key_name, feature_name):
    """Add a week-based mean-price encoding to both frames in place.

    A string key ``<week_of_year>_<col1>_<col2>...`` is written to
    ``key_name`` in both frames. The mean train price per key is computed
    from rows with ``price > 0`` only and stored in ``feature_name``; keys
    without any positive-price train row get NaN.

    Assigning via ``map`` (rather than merging) keeps the row order and index
    of both frames and lets the cell be re-run without creating duplicated
    ``_x``/``_y`` columns.
    """
    for frame in (train_df, test_df):
        key = frame['week_of_year'].astype(str)
        for col in group_cols:
            key = key + '_' + frame[col].astype(str)
        frame[key_name] = key

    positive_prices = train_df[train_df['price'] > 0]
    mean_by_key = positive_prices.groupby(key_name)['price'].mean()

    for frame in (train_df, test_df):
        frame[feature_name] = frame[key_name].map(mean_by_key)


# ISO week number of every row.
train['week_of_year'] = train['date_time'].dt.isocalendar().week
test['week_of_year'] = test['date_time'].dt.isocalendar().week

# (columns joined after the week, key column name, encoded feature name)
week_encodings = [
    (['item'], 'week_item', 'mean_encoded_price'),
    (['item', 'location'], 'week_item_location', 'mean_encoded_price_mwil'),
    (['item', 'corporation'], 'week_item_corporation', 'mean_encoded_price_mwic'),
    (['item', 'corporation', 'location'], 'week_item_corporation_location', 'mean_encoded_price_mwicl'),
]
for group_cols, key_name, feature_name in week_encodings:
    _add_week_mean_encoding(train, test, group_cols, key_name, feature_name)


# train['original_index'] = train.index
# test['original_index'] = test.index

# train['item_id'] = train['item'].astype(str) + '_' + train['corporation'].astype(str) + '_' + train['location'].astype(str)
# test['item_id'] = test['item'].astype(str) + '_' + test['corporation'].astype(str) + '_' + test['location'].astype(str)
# weekly_avg = train.groupby(['item_id', 'week_of_year'])['mean_encoded_price_mwicl'].mean().reset_index()
# weekly_avg['weekly_difference'] = weekly_avg.groupby('item_id')['mean_encoded_price_mwicl'].diff()
# weekly_avg = weekly_avg[['item_id', 'week_of_year', 'weekly_difference']]
# train = pd.merge(train, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# test = pd.merge(test, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# train = train.sort_values(by='original_index')
# test = test.sort_values(by='original_index')


# train['item_id'] = train['item'].astype(str) + '_' + train['corporation'].astype(str)
# test['item_id'] = test['item'].astype(str) + '_' + test['corporation'].astype(str)
# weekly_avg = train.groupby(['item_id', 'week_of_year'])['mean_encoded_price_mwic'].mean().reset_index()
# weekly_avg['weekly_difference_2'] = weekly_avg.groupby('item_id')['mean_encoded_price_mwic'].diff()
# weekly_avg = weekly_avg[['item_id', 'week_of_year', 'weekly_difference_2']]
# train = pd.merge(train, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# test = pd.merge(test, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# train = train.sort_values(by='original_index')
# test = test.sort_values(by='original_index')

# train['item_id'] = train['item'].astype(str) + '_' + train['location'].astype(str)
# test['item_id'] = test['item'].astype(str) + '_' + test['location'].astype(str)
# weekly_avg = train.groupby(['item_id', 'week_of_year'])['mean_encoded_price_mwil'].mean().reset_index()
# weekly_avg['weekly_difference_3'] = weekly_avg.groupby('item_id')['mean_encoded_price_mwil'].diff()
# weekly_avg = weekly_avg[['item_id', 'week_of_year', 'weekly_difference_3']]
# train = pd.merge(train, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# test = pd.merge(test, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# train = train.sort_values(by='original_index')
# test = test.sort_values(by='original_index')

# train['item_id'] = train['item'].astype(str)
# test['item_id'] = test['item'].astype(str)
# weekly_avg = train.groupby(['item_id', 'week_of_year'])['mean_encoded_price'].mean().reset_index()
# weekly_avg['weekly_difference_4'] = weekly_avg.groupby('item_id')['mean_encoded_price'].diff()
# weekly_avg = weekly_avg[['item_id', 'week_of_year', 'weekly_difference_4']]
# train = pd.merge(train, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# test = pd.merge(test, weekly_avg, on=['item_id', 'week_of_year'], how='outer')
# train = train.sort_values(by='original_index')
# test = test.sort_values(by='original_index')

# Integer representation of the timestamp, as a float after the multiplication.
# `astype('int64')` replaces the deprecated `Series.view('int64')` and yields
# the same underlying integer values for a datetime column.
# NOTE(review): the integer timestamps are multiplied by 1e9 (scaled up, not
# down to seconds) - kept as-is, but confirm this is what was intended.
train['date_time_int'] = train['date_time'].astype('int64') * 1e9
test['date_time_int'] = test['date_time'].astype('int64') * 1e9

# train.drop(columns=['item_id', 'original_index', 'week_item_location', 'week_item_corporation', 'week_item_corporation_location', 'week_item', 'week_of_year'], inplace=True)
# test.drop(columns=['item_id', 'original_index',  'week_item_location', 'week_item_corporation', 'week_item_corporation_location', 'week_item', 'week_of_year'], inplace=True)

# train = train.fillna(0)
# test = test.fillna(0)


# categorical_col = ['week_of_year']

# for i in categorical_col:
#     le = LabelEncoder()
#     train[i] = le.fit_transform(train[i])
#     test[i] = le.transform(test[i])

In [None]:
# Keep only the modelling columns of the training frame, in this exact order.
# The order matters: the training cell takes features positionally with
# `iloc[:, 3:]`, i.e. everything after total_item_value, date_time and price.
# 'id' and 'supply' are simply not selected; the former in-place drop was
# redundant and made the cell raise KeyError when run a second time.
train_columns = [
    'total_item_value', 'date_time', 'price', 'item', 'corporation', 'location',
    'year', 'month', 'Weekday', 'is_weekend', 'season', 'holiday',
    'sin_time', 'cos_time', 'item_month_Weekday',
    'item_corp_Weekday', 'item_location_Weekday', 'item_year_season',
    'days_in_month', 'is_month_start', 'is_month_end', 'is_quarter_start',
    'is_quarter_end', 'is_year_start', 'is_year_end', 'is_leap_year',
    'dayofyear', 'harvest_weight', 'item_weight', 'total_item_value_mean',
    'total_item_value_std', 'item_month_Weekday_mean',
    'item_month_Weekday_std', 'item_corp_Weekday_mean',
    'item_corp_Weekday_std', 'item_location_Weekday_mean',
    'item_location_Weekday_std', 'item_year_season_mean',
    'item_year_season_std',
]
train = train[train_columns]

In [None]:
# Keep only the modelling columns of the test frame (same list as for train,
# minus 'price'). 'id' is simply not selected; the former in-place drop was
# redundant and made the cell raise KeyError when run a second time.
test_columns = [
    'total_item_value', 'date_time', 'item', 'corporation', 'location', 'year', 'month',
    'Weekday', 'is_weekend', 'season', 'holiday', 'sin_time', 'cos_time',
    'item_month_Weekday', 'item_corp_Weekday',
    'item_location_Weekday', 'item_year_season', 'days_in_month',
    'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
    'is_year_start', 'is_year_end', 'is_leap_year', 'dayofyear',
    'harvest_weight', 'item_weight', 'total_item_value_mean',
    'total_item_value_std', 'item_month_Weekday_mean',
    'item_month_Weekday_std', 'item_corp_Weekday_mean',
    'item_corp_Weekday_std', 'item_location_Weekday_mean',
    'item_location_Weekday_std', 'item_year_season_mean',
    'item_year_season_std',
]
test = test[test_columns]

In [None]:
# NOTE(review): `params` (the XGBoost hyper-parameters) is not defined in any
# visible cell, so a fresh kernel cannot run this cell. Fail early with a
# clear message instead of part-way into the loop.
if 'params' not in globals():
    raise NameError(
        "`params` is not defined: define the XGBRegressor hyper-parameter dict "
        "in an earlier cell before running the training loop."
    )

# Features are every train column from the 4th on (the original
# `iloc[:, 3:]`), i.e. everything after total_item_value, date_time and price.
feature_cols = train.columns[3:]

# One model per series (total_item_value). Predictions are written back at
# the index of the test rows they belong to, so the result follows the row
# order of `test` regardless of the order in which series are visited.
pred_by_row = pd.Series(np.nan, index=test.index, dtype='float64')
for series_id in tqdm(train['total_item_value'].unique()):
    train_rows = train.loc[train['total_item_value'] == series_id]
    test_rows = test.loc[test['total_item_value'] == series_id]
    if test_rows.empty:
        # Nothing to predict for this series; skip fitting a model for it.
        continue

    model = xgb.XGBRegressor(**params)
    model.fit(train_rows[feature_cols], train_rows['price'])
    pred_by_row.loc[test_rows.index] = model.predict(test_rows[feature_cols])

if pred_by_row.isna().any():
    # A test series with no matching train series would otherwise surface
    # only as a length mismatch when filling the submission.
    raise ValueError(
        f"{int(pred_by_row.isna().sum())} test rows belong to a series that is "
        "absent from train and received no prediction."
    )

# Same name and type (1-D ndarray) as before, now in test row order.
preds = pred_by_row.to_numpy()

# NOTE(review): post_preprocessing() (Sunday -> 0, negatives -> 0) is defined
# in this notebook but never applied to the submission - confirm whether it
# should be called here.
# Assumes sample_submission.csv lists rows in the same order as test.csv.
submission = pd.read_csv(path + 'sample_submission.csv')
submission['answer'] = preds
submission.to_csv(path + 'xgboost_timeseries.csv', index=False)