- with classification
    - 가격이 0인지 여부를 맞히는 classification 모델과, 가격 자체를 예측하는 regression(시계열) 모델을 둘 다 만들어두고,
    - classification 모델이 "0이 아님"으로 예측한 경우에는 regression 결과값을, "0"으로 예측한 경우에는 0을 넣는 방법

# Library Setting

In [1]:
# import torch
# from tabpfn import TabPFNClassifier

In [None]:
import sys
sys.path.append('/Volumes/KHJ/Github/hyuckjinkim/lib-python')

from base import gc_collect_all, setdiff
from filesystem_utils import mkdir
from graph import abline
from data_prepare import (
    get_holiday, reduce_mem_usage, delete_unique_columns,
    TypeController, CategoricalQuantileCalculator,
    GroupScaler, OneHotEncoder, InteractionTerm, TargetTransform,
)

In [None]:
# Project helper imported from `base`; presumably triggers garbage collection
# before the heavy cells run — its exact behaviour is not visible in this notebook.
gc_collect_all()

In [None]:
import warnings
# Hide FutureWarning messages so they do not flood the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm import tqdm
# Enable tqdm's pandas integration (progress bars for apply-style calls).
tqdm.pandas()

import numpy as np
import pandas as pd
# Show every row and column when a frame is displayed in the notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Turn off the chained-assignment check; several helpers below modify filtered frames in place.
pd.set_option('mode.chained_assignment', None)

import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
# Font used for plot text; NOTE(review): 'AppleGothic' is presumably chosen for
# Korean labels and is normally only installed on macOS — confirm on other systems.
rc('font', family='AppleGothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
def return_unique_columns(data):
    """Return the names of the columns of *data* that hold exactly one distinct value.

    Distinct values are counted with ``DataFrame.nunique()``, so missing
    values are not counted; an all-missing column (0 distinct values) is
    therefore not reported.
    """
    distinct_counts = data.nunique()
    is_constant = (distinct_counts == 1).to_numpy()
    return distinct_counts.index[is_constant].tolist()

In [None]:
class CFG:
    """Notebook-wide constants."""

    # Seed passed to the train/validation split and to the classifier.
    SEED = 42

    # Name of the target (price) column in the training data.
    TARGET = 'price(원/kg)'

    # Columns that together identify one series; also used as categorical features.
    KEY_FEATURES = [
        'item',
        'corporation',
        'location',
    ]

<br></br>

# Data

## Data Load

In [None]:
# item codes:
#     TG : tangerine (감귤)
#     BC : broccoli (브로콜리)
#     RD : radish (무)
#     CR : carrot (당근)
#     CB : cabbage (양배추)

In [None]:
# Load the three raw CSV files: training rows, test rows and the monthly trade table.
train_df, test_df, trade_df = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/international_trade.csv'),
)

In [None]:
# Quick look at the raw training data: dimensions, then the first rows.
print(train_df.shape)
train_df.head()

In [None]:
# Quick look at the raw test data: dimensions, then the first rows.
print(test_df.shape)
test_df.head()

In [None]:
# Quick look at the raw international-trade data: dimensions, then the first rows.
print(trade_df.shape)
trade_df.head()

<br>

## Preprocessing

In [None]:
def add_week(weekend):
    """Turn a sequence of weekend flags into a running counter.

    The counter starts at 1 and is incremented on every element equal to 1
    (so a Saturday and the following Sunday each advance it). The value
    *after* the possible increment is recorded for each element.

    Parameters
    ----------
    weekend : iterable
        Flags, 1 for a weekend day and anything else otherwise.

    Returns
    -------
    list
        One counter value per input element, in input order.
    """
    counter = 1
    counters = []
    for flag in weekend:
        counter += 1 if flag == 1 else 0
        counters.append(counter)
    return counters

In [None]:
def preprocessing(data):
    """Sort a raw frame by the key columns, parse its timestamps and drop unused columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding the ``CFG.KEY_FEATURES`` columns and a ``timestamp``
        column of ``'%Y-%m-%d'`` strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame ordered by ``CFG.KEY_FEATURES`` with a fresh 0..n-1 index,
        ``timestamp`` converted to datetimes, and the ``ID`` / ``supply(kg)``
        columns removed when they are present.
    """
    # sort_values already returns a new frame, so no separate copy is needed
    d = data.sort_values(CFG.KEY_FEATURES).reset_index(drop=True)

    # (1) datetime format: parse the whole column at once instead of row by row
    d['timestamp'] = pd.to_datetime(d['timestamp'], format='%Y-%m-%d')

    # (2) delete unused features; the test frame does not contain all of them,
    #     so missing columns are skipped rather than raising
    d = d.drop(columns=['ID', 'supply(kg)'], errors='ignore')

    return d

In [None]:
def merge_trade_data(data,trade_data):
    """Attach the monthly international-trade figures to every row of *data*.

    Rows are matched on item code and calendar month. For months that are
    missing from the trade table (e.g. March 2023), the most recent earlier
    month *of the same item* is used. The carry-forward is done on the trade
    table before merging, so values never leak from one item to another and
    it also works when every row of *data* lies after the last trade month
    (which is the case for the test set).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``timestamp`` and ``item`` columns. Not modified.
    trade_data : pandas.DataFrame
        Trade table with ``기간`` (``'YYYY-MM'``), ``품목명``, ``무역수지`` and
        the numeric trade columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        *data* (same row order) with the trade columns appended. Rows whose
        item has no trade record at or before their month keep NaN there.
    """
    d, td = data.copy(), trade_data.copy()

    ### trade data preprocessing

    # (1) drop the published trade balance; feature_engineering recomputes it
    #     from the export and import amounts
    td = td.drop('무역수지',axis=1)

    # (2) map product names to item codes and discard products that are not modelled
    replace_dict = {
        '감귤': 'TG',
        '꽃양배추와 브로콜리(broccoli)': 'BC',
        '당근': 'CR',
        '순무': 'RD',
        '양배추': 'CB',
    }
    td['품목명'] = td['품목명'].map(replace_dict)
    td = td.dropna(subset=['품목명']).rename(columns={'품목명':'item'})

    ### merge data
    d['기간'] = [str(t)[:7] for t in d['timestamp']]

    # (3) extend the trade table to every month that occurs in `d` and carry
    #     the latest known values forward within each item ('YYYY-MM' strings
    #     sort chronologically). Duplicate (item, month) records keep the last one.
    months = sorted(set(td['기간']) | set(d['기간']))
    grid = pd.MultiIndex.from_product([td['item'].unique(), months], names=['item','기간'])
    td = (
        td.drop_duplicates(subset=['item','기간'], keep='last')
          .set_index(['item','기간'])
          .reindex(grid)
          .groupby(level='item')
          .ffill()
          .reset_index()
    )

    merged = pd.merge(d,td,how='left',on=['기간','item'])
    merged.drop('기간',axis=1,inplace=True)

    return merged

In [None]:
def feature_engineering(data):
    """Return a copy of *data* with calendar, holiday and trade-balance features added.

    Expects a datetime ``timestamp`` column plus the ``수출 금액`` and
    ``수입 금액`` columns. Adds ``year``, ``month``, ``day``, ``weekday``,
    ``weekend``, ``week``, ``is_holiday``, ``is_dayoff``, ``무역수지`` and
    ``흑자여부``.
    """
    d = data.copy()
    ts = d['timestamp']

    # (1) calendar parts taken straight from the timestamp
    for part in ('year', 'month', 'day', 'weekday'):
        d[part] = getattr(ts.dt, part)
    d['weekend'] = d['weekday'].isin([5,6]).astype(int)   # Saturday=5, Sunday=6
    d['week'] = add_week(d['weekend'])                     # running counter over the row order

    # (2) holiday flag (dates come from the project helper `get_holiday`) and day-off flag
    holiday_list = get_holiday(d['year'].unique())
    d['is_holiday'] = ts.isin(holiday_list).astype(int)
    d['is_dayoff'] = (d['is_holiday'].eq(1) | d['weekend'].eq(1)).astype(int)

    # (3) trade balance (exports minus imports) and whether it is a surplus
    balance = d['수출 금액'] - d['수입 금액']
    d['무역수지'] = balance
    d['흑자여부'] = np.where(balance>0,1,0)

    return d

In [None]:
# from copy import deepcopy
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import SimpleImputer, IterativeImputer

# def imputation(train_data,test_data,method):
#     if method in ['mean','median']:
#         base_imputer = SimpleImputer(missing_values=np.nan, strategy=method)
#     elif method=='mice':
#         base_imputer = IterativeImputer(random_state=CFG.SEED)
#     else:
#         raise ValueError(f"Unknown method={method}")
    
#     numerical_cols = train_data.select_dtypes(include=[int,float]).columns.tolist()
#     numerical_cols = list(set(numerical_cols)-set([CFG.TARGET]))
    
#     train_list = []
#     test_list = []
#     for (item,corporation,location),tr in tqdm(train_data.groupby(['item','corporation','location'])):
#         te = test_data[(test_data['item']==item) & (test_data['corporation']==corporation) & (test_data['location']==location)]
#         imputer = deepcopy(base_imputer)
#         tr[numerical_cols] = imputer.fit_transform(tr[numerical_cols])
#         train_list.append(tr)
#         if len(te)>0:
#             te[numerical_cols] = imputer.transform(te[numerical_cols])
#             test_list.append(te)
        
#     new_train = pd.concat(train_list,axis=0)
#     new_test = pd.concat(test_list,axis=0)
    
#     return new_train.sort_index(), new_test.sort_index()

In [None]:
# Run both splits through the same three steps:
# clean-up -> attach trade figures -> derive features.
train_df, test_df = (
    feature_engineering(merge_trade_data(preprocessing(raw_df), trade_df))
    for raw_df in (train_df, test_df)
)

In [None]:
# Inspect the training frame after preprocessing, trade merge and feature engineering.
train_df.head()

<br></br>

# EDA

In [None]:
# (1) number of rows per key (the CFG.KEY_FEATURES combination); only the first few keys are shown
train_df.groupby(CFG.KEY_FEATURES).size().head()

In [None]:
# (2) price histogram per item/corporation/location (first three groups only)
# train_df.groupby(CFG.KEY_FEATURES)[CFG.TARGET].agg(Min=np.min,Max=np.max)
for group_no, (group_key, group_df) in enumerate(train_df.groupby(CFG.KEY_FEATURES)):
    if group_no >= 3:
        continue

    # zero prices are left out so they do not dominate the histogram
    prices = group_df[CFG.TARGET]
    nonzero_prices = prices[prices != 0]

    plt.figure(figsize=(15,7))
    sns.histplot(nonzero_prices)
    plt.grid()
    plt.show()

In [None]:
# (3) daily price per item/corporation/location (first three groups only);
#     days with a price of exactly 0 are highlighted in red
grp_data = train_df.groupby(CFG.KEY_FEATURES)
for i,(idx,d) in enumerate(grp_data):
    if i >= 3:
        continue

    zero_mask = d[CFG.TARGET] == 0

    plt.figure(figsize=(15,7))
    sns.lineplot(x=d['timestamp'],y=d[CFG.TARGET],alpha=0.7)
    if zero_mask.any():
        sns.scatterplot(
            x=d['timestamp'][zero_mask],
            y=d[CFG.TARGET][zero_mask],
            color='red',
            alpha=0.7,
        )
    plt.xticks(rotation=90)
    plt.grid()
    plt.title(f'[{i+1}/{len(grp_data)}] {idx}')
    plt.show()

In [None]:
# One scatter plot per item: price over time, coloured by corporation+location.
for item, item_df in train_df.groupby('item'):
    seller = item_df['corporation'] + item_df['location']

    plt.figure(figsize=(15,7))
    sns.scatterplot(x=item_df['timestamp'], y=item_df[CFG.TARGET], hue=seller)
    plt.grid()
    plt.show()

<br></br>

# Modeling

In [None]:
# Create the './mc' directory via the project helper from `filesystem_utils`
# (presumably a model-checkpoint folder; nothing in this notebook writes to it yet).
mkdir('./mc')

In [None]:
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from catboost import CatBoostClassifier, Pool
from prophet import Prophet
import optuna

In [None]:
import logging
# Only let warnings and above through from the "prophet" logger.
logging.getLogger("prophet").setLevel(logging.WARNING)
# Switch the "cmdstanpy" logger off completely to keep fit output quiet.
logging.getLogger("cmdstanpy").disabled=True

In [None]:
def get_seg_data(data,item,corporation,location,prophet_data):
    """Return the rows of *data* that belong to one item/corporation/location segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``item``, ``corporation`` and ``location`` columns. Not modified.
    item, corporation, location
        Values identifying the segment.
    prophet_data : bool
        If true, return only ``timestamp`` (renamed ``ds``), ``is_holiday``
        and, when present, the target column (renamed ``y``).
        Otherwise return every column except the three key columns.
    """
    in_segment = (
        data['item'].eq(item)
        & data['corporation'].eq(corporation)
        & data['location'].eq(location)
    )
    segment = data[in_segment]

    if not prophet_data:
        return segment.drop(columns=['item','corporation','location'])

    # the test frame has no target column, so 'y' is only produced when available
    keep = ['timestamp','is_holiday']
    renames = {'timestamp':'ds'}
    if CFG.TARGET in segment.columns:
        keep.append(CFG.TARGET)
        renames[CFG.TARGET] = 'y'
    #d['cap'] = d[CFG.TARGET].max()
    return segment[keep].rename(columns=renames)

In [None]:
def get_holidays(train_data,test_data):
    """Build the holiday table handed to Prophet's ``holidays`` argument.

    Stacks the ``ds`` / ``is_holiday`` columns of both frames (train first),
    keeps the rows flagged ``is_holiday == 1`` and labels each of them
    ``'KOR_Holidays'``. Dates are not de-duplicated.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` and ``holiday`` with a fresh 0..n-1 index.
    """
    stacked = pd.concat(
        [train_data[['ds','is_holiday']], test_data[['ds','is_holiday']]],
        axis=0,
    )
    holiday_rows = stacked.loc[stacked['is_holiday'] == 1, ['ds']].reset_index(drop=True)
    holiday_rows['holiday'] = 'KOR_Holidays'
    return holiday_rows

In [None]:
# Reminder of the columns that are available to the models.
train_df.head()

In [None]:
# Keyword arguments for the CatBoostClassifier that predicts "price is zero".
cls_model_params = dict(
    random_state=CFG.SEED,
    # many iterations with a small learning rate; early stopping on the
    # validation set (F1) picks the iteration that is actually kept
    iterations=30000,
    early_stopping_rounds=300,
    learning_rate=0.01,
    eval_metric='F1',
    grow_policy='Lossguide',
    use_best_model=True,
    allow_writing_files=False,
    verbose=0,
    max_depth=9,
    l2_leaf_reg=1,
)

In [None]:
# (1) time series model: one Prophet model fitted on all training rows at once
tr_data = train_df[['timestamp','is_holiday',CFG.TARGET]].rename(columns={'timestamp':'ds',CFG.TARGET:'y'})
# the test frame has no target column, so only the timestamp is renamed
te_data = test_df [['timestamp','is_holiday']].rename(columns={'timestamp':'ds'})
holidays = get_holidays(tr_data,te_data)

print('Prophet Fitting Start...')
ts_model = Prophet(
    # growth='linear',
    # seasonality_mode='additive', # 'multiplicative'
    # yearly_seasonality=True,
    # weekly_seasonality=True,
    # daily_seasonality=False,
    holidays=holidays,
)
ts_model.add_country_holidays(country_name='KOR')
ts_model.fit(tr_data.drop('is_holiday',axis=1))

# Prophet returns its forecast sorted by `ds`, whereas the test rows are ordered
# by item/corporation/location. Predict once per distinct date and map the
# result back onto the test rows so every row gets the forecast for its own date.
forecast = ts_model.predict(te_data[['ds']].drop_duplicates())
yhat_by_date = forecast.set_index('ds')['yhat']
pred = te_data['ds'].map(yhat_by_date)
print('Prophet Fitting End...')

# (2) classification model: predicts whether the price is exactly 0
tr_data = train_df.copy()
te_data = test_df .copy()

y = tr_data[CFG.TARGET]
X      = tr_data.drop([CFG.TARGET,'timestamp'],axis=1)
# select the test features by the training column list so both frames are
# guaranteed to have identical columns in identical order
X_test = te_data[X.columns]

# label 1 = price is zero, label 0 = price is non-zero
is_y_zero = np.where(y==0,1,0)
X_train, X_val, y_train, y_val = train_test_split(X,is_y_zero,
                                                  test_size=0.2,shuffle=True,random_state=CFG.SEED,
                                                  stratify=is_y_zero)
train_dataset = Pool(X_train,y_train,cat_features=CFG.KEY_FEATURES)
val_dataset   = Pool(X_val  ,y_val  ,cat_features=CFG.KEY_FEATURES)
test_dataset  = Pool(X_test,cat_features=CFG.KEY_FEATURES)

print('CatBoostClassifier Fitting Start...')
cls_model = CatBoostClassifier(**cls_model_params)
cls_model.fit(train_dataset,eval_set=val_dataset)
print('CatBoostClassifier Fitting End...')

# predict the validation set once; scikit-learn metrics take (y_true, y_pred)
val_pred = cls_model.predict(val_dataset)
f1 = f1_score(y_val,val_pred)
acc = accuracy_score(y_val,val_pred)
print('f1: {:.3f}, accuracy: {:.3f}'.format(f1,acc))

# final prediction: 0 where the classifier says "zero price", Prophet's forecast otherwise
is_zero = cls_model.predict(test_dataset)
pred_fn = [0 if z==1 else p for p,z in zip(pred,is_zero)]

# train targets and test predictions stacked into one frame for plotting / submission
plot_df = pd.DataFrame({
    'ds' : tr_data['timestamp'].tolist() + te_data['timestamp'].tolist(),
    'y' : tr_data[CFG.TARGET].tolist() + pred_fn,
    'group' : ['train']*len(tr_data) + ['test']*len(pred_fn),
},index=tr_data.index.tolist() + te_data.index.tolist())

In [None]:
# First few test-set predictions (rows tagged 'test' in the combined frame).
plot_df[plot_df['group']=='test'].head()

In [None]:
# ts_models = []
# cls_models = []
# plot_data = []
# i=0

# grps = train_df[CFG.KEY_FEATURES].drop_duplicates().values
# pbar = tqdm(grps)
# for item,corp,loc in pbar:
#     i+=1
#     str_i = str(i).zfill(len(str(len(grps))))
#     progress = '[{}/{}] item:{}, corporation:{}, location:{}'.format(str_i,len(grps),item,corp,loc)
#     pbar.set_description(progress)
    
#     # (1) time series model
#     tr_data = get_seg_data(train_df,item,corp,loc,True)
#     te_data = get_seg_data(test_df ,item,corp,loc,True)
    
#     holidays = get_holidays(tr_data,te_data)
#     model = Prophet(
#         # growth='linear',
#         # seasonality_mode='additive', # 'multiplicative'
#         # yearly_seasonality=True,
#         # weekly_seasonality=True,
#         # daily_seasonality=False,
#         holidays=holidays,
#     )
#     model.add_country_holidays(country_name='KOR')
#     model.fit(tr_data.drop('is_holiday',axis=1))
#     pred = model.predict(te_data[['ds']])['yhat']
#     ts_models.append(model)
    
#     # (2) classification model
#     tr_data = get_seg_data(train_df,item,corp,loc,False)
#     te_data = get_seg_data(test_df ,item,corp,loc,False)
    
#     y = tr_data[CFG.TARGET]
#     X      = tr_data.drop([CFG.TARGET,'timestamp'],axis=1)
#     X_test = te_data.drop('timestamp',axis=1)
    
#     is_y_zero = np.where(y==0,1,0)
#     X_train, X_val, y_train, y_val = train_test_split(X,is_y_zero,test_size=0.2,shuffle=True,random_state=CFG.SEED,stratify=is_y_zero)
#     train_dataset = Pool(X_train,y_train,cat_features=cat_features)
#     val_dataset   = Pool(X_val  ,y_val  ,cat_features=cat_features)
#     test_dataset  = Pool(X_test,cat_features=cat_features)
#     cls_model = CatBoostClassifier(**cls_model_params)
#     cls_model.fit(train_dataset,eval_set=val_dataset)
#     cls_models.append(cls_model)
    
#     f1 = f1_score(cls_model.predict(val_dataset),y_val)
#     acc = accuracy_score(cls_model.predict(val_dataset),y_val)
#     print('f1: {:.3f}, accuracy: {:.3f}'.format(f1,acc))
    
#     is_zero = cls_model.predict(test_dataset)
#     pred_fn = [0 if z==1 else p for p,z in zip(pred,is_zero)]
    
#     plot_df = pd.DataFrame({
#         'ds' : tr_data['timestamp'].tolist() + te_data['timestamp'].tolist(),
#         'y' : tr_data[CFG.TARGET].tolist() + pred_fn,
#         'group' : ['train']*len(tr_data) + ['test']*len(pred_fn),
#     },index=tr_data.index.tolist() + te_data.index.tolist())
#     plot_data.append(plot_df)
    
#     plt.figure(figsize=(15,7))
#     sns.lineplot(x=plot_df.ds,y=plot_df.y,hue=plot_df.group)
#     sns.scatterplot(x=plot_df.ds,y=plot_df.y,hue=plot_df.group,legend=False)
#     plt.grid()
#     plt.show()

In [None]:
# for model in cls_models:
#     tmp = pd.DataFrame({
#         'feature' : X.columns,
#         'imp' : model.feature_importances_,
#     }).sort_values('imp',ascending=False)
#     display(tmp.head(2))

In [None]:
# `plot_data` is only created by the per-segment loop, which is currently
# commented out; the global pipeline leaves its result in `plot_df`.
# Use the list when it exists and fall back to the single frame otherwise,
# instead of failing with a NameError.
try:
    frames = plot_data
except NameError:
    frames = [plot_df]

tmp = pd.concat(frames,axis=0)
tmp = tmp[tmp['group']=='test']
tmp = tmp.sort_index()   # restore the row order of the (preprocessed) test frame
tmp.head()

In [None]:
import os

submit = pd.read_csv('./data/sample_submission.csv')

# Assign by position and fail loudly on a size mismatch; index-aligned
# assignment would silently write NaN for any row without a matching label.
# NOTE(review): preprocessing re-sorts the test rows by CFG.KEY_FEATURES and
# drops `ID`, so this assumes sample_submission.csv is in that same order — confirm.
if len(tmp) != len(submit):
    raise ValueError(
        'prediction count ({}) does not match submission rows ({})'.format(len(tmp), len(submit))
    )
submit['answer'] = tmp['y'].to_numpy()

# to_csv does not create missing directories
os.makedirs('./out', exist_ok=True)
submit.to_csv('./out/submit_seg_prophet_with_clsmodel_1.csv',index=False)
submit.head()