In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
import catboost
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import log_loss


ModuleNotFoundError: No module named 'lightgbm'

In [2]:
train = pd.read_csv('jejuData/train.csv')
test = pd.read_csv('jejuData/test.csv')
trade = pd.read_csv('jejuData/international_trade.csv')

In [3]:
train.head()   #날짜데이터 2019-01-01 ~ 2023-03-03 

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [4]:
# 'date'를 제외한 컬럼 저장하기
# 'timestamp' 컬럼을 datetime 타입으로 변경
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

train.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [5]:
# 날짜 관련 feature 추가 하기

train['year'] = train['timestamp'].dt.year
train['month'] = train['timestamp'].dt.month
train['week'] = train['timestamp'].dt.isocalendar().week.astype(np.int32)
train['weekday']  = train['timestamp'].dt.weekday

# 날짜 관련 피처를 저장해 둔다.
features_date = ['month', 'week', 'weekday']

In [6]:
# 컬럼명 변경
train = train.rename(columns = {'price(원/kg)' : 'price', 'supply(kg)' : 'supply'})
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3


In [7]:
items = train['item'].value_counts().index.to_list()
items

['TG', 'BC', 'RD', 'CR', 'CB']

In [8]:
corp = train['corporation'].value_counts().index.to_list()
corp

['A', 'E', 'D', 'C', 'B', 'F']

In [9]:
train[train['supply']==0]

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2
5,TG_A_J_20190106,2019-01-06,TG,A,J,0.0,0.0,2019,1,1,6
12,TG_A_J_20190113,2019-01-13,TG,A,J,0.0,0.0,2019,1,2,6
19,TG_A_J_20190120,2019-01-20,TG,A,J,0.0,0.0,2019,1,3,6
...,...,...,...,...,...,...,...,...,...,...,...
59363,RD_F_J_20230129,2023-01-29,RD,F,J,0.0,0.0,2023,1,4,6
59370,RD_F_J_20230205,2023-02-05,RD,F,J,0.0,0.0,2023,2,5,6
59377,RD_F_J_20230212,2023-02-12,RD,F,J,0.0,0.0,2023,2,6,6
59384,RD_F_J_20230219,2023-02-19,RD,F,J,0.0,0.0,2023,2,7,6


In [10]:
not_zero = train[train['supply']!=0]
not_zero

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5
6,TG_A_J_20190107,2019-01-07,TG,A,J,44995.0,1474.0,2019,1,2,0
7,TG_A_J_20190108,2019-01-08,TG,A,J,26975.0,1326.0,2019,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3


In [11]:
not_zero['y-m']=not_zero['timestamp'].dt.strftime('%Y-%m')
not_zero['y-m']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_zero['y-m']=not_zero['timestamp'].dt.strftime('%Y-%m')


2        2019-01
3        2019-01
4        2019-01
6        2019-01
7        2019-01
          ...   
59392    2023-02
59393    2023-02
59394    2023-03
59395    2023-03
59396    2023-03
Name: y-m, Length: 23945, dtype: object

In [12]:
#ID 열 제거
not_zero = not_zero.drop('ID', axis = 1)
not_zero

Unnamed: 0,timestamp,item,corporation,location,supply,price,year,month,week,weekday,y-m
2,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,2019-01
3,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,2019-01
4,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,2019-01
6,2019-01-07,TG,A,J,44995.0,1474.0,2019,1,2,0,2019-01
7,2019-01-08,TG,A,J,26975.0,1326.0,2019,1,2,1,2019-01
...,...,...,...,...,...,...,...,...,...,...,...
59392,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0,2023-02
59393,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1,2023-02
59394,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2,2023-03
59395,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3,2023-03


In [13]:
train[train['supply']==0]

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2
5,TG_A_J_20190106,2019-01-06,TG,A,J,0.0,0.0,2019,1,1,6
12,TG_A_J_20190113,2019-01-13,TG,A,J,0.0,0.0,2019,1,2,6
19,TG_A_J_20190120,2019-01-20,TG,A,J,0.0,0.0,2019,1,3,6
...,...,...,...,...,...,...,...,...,...,...,...
59363,RD_F_J_20230129,2023-01-29,RD,F,J,0.0,0.0,2023,1,4,6
59370,RD_F_J_20230205,2023-02-05,RD,F,J,0.0,0.0,2023,2,5,6
59377,RD_F_J_20230212,2023-02-12,RD,F,J,0.0,0.0,2023,2,6,6
59384,RD_F_J_20230219,2023-02-19,RD,F,J,0.0,0.0,2023,2,7,6


In [15]:
from pytimekr import pytimekr

year_2019 = pytimekr.holidays(year=2019)
year_2020 = pytimekr.holidays(year=2020)
year_2021 = pytimekr.holidays(year=2021)
year_2022 = pytimekr.holidays(year=2022)
year_2023 = pytimekr.holidays(year=2023)



def holidays(x):
    if x.weekday() in range(5,8):
        return 1
    if x.year == 2019  and x in year_2019 :
        return 1
    elif x.year == 2020 and x in year_2020:
        return 1
    elif x.year == 2021 and x in year_2021 :
        return 1
    elif x.year == 2022 and x in year_2022 :
        return 1
    elif x.year == 2023 and x in year_2023:
        return 1
    else:
        return 0

In [16]:
import warnings
warnings.filterwarnings('ignore')
train['holiday'] = train['timestamp'].apply(holidays)
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1,0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2,0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3,0


In [17]:
train['y-m']=train['timestamp'].dt.strftime('%Y-%m')
train['y-m']

0        2019-01
1        2019-01
2        2019-01
3        2019-01
4        2019-01
          ...   
59392    2023-02
59393    2023-02
59394    2023-03
59395    2023-03
59396    2023-03
Name: y-m, Length: 59397, dtype: object

In [18]:
train['holiday'].value_counts()

holiday
0    42471
1    16926
Name: count, dtype: int64

In [19]:
trade

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
...,...,...,...,...,...,...,...
1269,2023-02,포포(papaw)[파파야(papaya)],0,0,23830,71,-71
1270,2023-02,사과,135165,351,0,0,351
1271,2023-02,배,2206012,5411,1,0,5411
1272,2023-02,신 체리[프루너스 체라서스(Prunus cerasus)],5,0,0,0,0


In [20]:
df_test = trade[trade.품목명.str.contains('감귤|브로콜리|무|당근|양배추')]
df_test

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
5,2019-01,양배추,184650,94,395802,90,4
8,2019-01,당근,23150,22,7466150,2955,-2934
12,2019-01,무화과,2627,23,94529,464,-441
...,...,...,...,...,...,...,...
1250,2023-02,양배추,13188,13,377456,104,-91
1253,2023-02,당근,22510,20,9260020,3758,-3737
1254,2023-02,순무,4000,4,2,0,4
1258,2023-02,무화과,1319,14,104566,454,-440


In [21]:
df_test.loc[df_test['품목명']=='꽃양배추와 브로콜리(broccoli)','품목명'] = '브로콜리'
df_test = df_test[df_test['품목명'] != '방울다다기 양배추']
df_test.loc[df_test['품목명']=='순무','품목명'] = '무'
df_test = df_test[df_test['품목명']!='무화과']
df_test

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,브로콜리,160,1,638913,563,-562
5,2019-01,양배추,184650,94,395802,90,4
8,2019-01,당근,23150,22,7466150,2955,-2934
17,2019-01,감귤,58368,172,0,0,172
28,2019-02,브로콜리,780,1,396870,399,-398
...,...,...,...,...,...,...,...
1248,2023-02,브로콜리,24,0,332640,352,-352
1250,2023-02,양배추,13188,13,377456,104,-91
1253,2023-02,당근,22510,20,9260020,3758,-3737
1254,2023-02,무,4000,4,2,0,4


In [22]:
df_test['품목명'].value_counts()

품목명
브로콜리    50
양배추     50
당근      50
감귤      50
무        6
Name: count, dtype: int64

In [23]:
fruits_dict = {'감귤':'TG' ,'브로콜리':'BC' ,'무':'RD' ,'당근':'CR' ,'양배추':'CB'}
fruits_dict

{'감귤': 'TG', '브로콜리': 'BC', '무': 'RD', '당근': 'CR', '양배추': 'CB'}

In [24]:
df_test['품목명'] = df_test['품목명'].map(fruits_dict)

In [25]:
df_test.reset_index()

Unnamed: 0,index,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,3,2019-01,BC,160,1,638913,563,-562
1,5,2019-01,CB,184650,94,395802,90,4
2,8,2019-01,CR,23150,22,7466150,2955,-2934
3,17,2019-01,TG,58368,172,0,0,172
4,28,2019-02,BC,780,1,396870,399,-398
...,...,...,...,...,...,...,...,...
201,1248,2023-02,BC,24,0,332640,352,-352
202,1250,2023-02,CB,13188,13,377456,104,-91
203,1253,2023-02,CR,22510,20,9260020,3758,-3737
204,1254,2023-02,RD,4000,4,2,0,4


In [26]:
# 컬럼명 변경
trade = df_test.rename(columns = {'품목명' : 'item'})
trade

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,BC,160,1,638913,563,-562
5,2019-01,CB,184650,94,395802,90,4
8,2019-01,CR,23150,22,7466150,2955,-2934
17,2019-01,TG,58368,172,0,0,172
28,2019-02,BC,780,1,396870,399,-398
...,...,...,...,...,...,...,...
1248,2023-02,BC,24,0,332640,352,-352
1250,2023-02,CB,13188,13,377456,104,-91
1253,2023-02,CR,22510,20,9260020,3758,-3737
1254,2023-02,RD,4000,4,2,0,4


In [27]:
not_RD_trade = trade[trade['item'] != 'RD']
not_RD_trade

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,BC,160,1,638913,563,-562
5,2019-01,CB,184650,94,395802,90,4
8,2019-01,CR,23150,22,7466150,2955,-2934
17,2019-01,TG,58368,172,0,0,172
28,2019-02,BC,780,1,396870,399,-398
...,...,...,...,...,...,...,...
1236,2023-01,TG,81509,269,0,0,269
1248,2023-02,BC,24,0,332640,352,-352
1250,2023-02,CB,13188,13,377456,104,-91
1253,2023-02,CR,22510,20,9260020,3758,-3737


In [28]:
RD_trade = trade[trade['item'] == 'RD']
RD_trade

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
536,2020-10,RD,450,8,0,0,8
681,2021-04,RD,577,0,0,0,0
705,2021-05,RD,312,0,0,0,0
732,2021-06,RD,130,0,0,0,0
861,2021-11,RD,0,0,1,0,0
1254,2023-02,RD,4000,4,2,0,4


In [29]:
dr_merged = pd.merge(train, trade, left_on=['item', 'y-m'], right_on=['item', '기간'])
dr_merged

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m,기간,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,0,2019-01,2019-01,58368,172,0,0,172
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01,2019-01,58368,172,0,0,172
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01,2019-01,58368,172,0,0,172
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01,2019-01,58368,172,0,0,172
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01,2019-01,58368,172,0,0,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,BC,E,S,2104.0,2025.0,2023,2,8,4,0,2023-02,2023-02,24,0,332640,352,-352
48556,BC_E_S_20230225,2023-02-25,BC,E,S,1032.0,2353.0,2023,2,8,5,1,2023-02,2023-02,24,0,332640,352,-352
48557,BC_E_S_20230226,2023-02-26,BC,E,S,0.0,0.0,2023,2,8,6,1,2023-02,2023-02,24,0,332640,352,-352
48558,BC_E_S_20230227,2023-02-27,BC,E,S,2200.0,2488.0,2023,2,9,0,0,2023-02,2023-02,24,0,332640,352,-352


In [31]:
not_RD_train = train[train['item'] != 'RD']
not_RD_train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,0,2019-01
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57869,CB_F_J_20230227,2023-02-27,CB,F,J,232312.0,652.0,2023,2,9,0,0,2023-02
57870,CB_F_J_20230228,2023-02-28,CB,F,J,224072.0,672.0,2023,2,9,1,0,2023-02
57871,CB_F_J_20230301,2023-03-01,CB,F,J,273800.0,621.0,2023,3,9,2,0,2023-03
57872,CB_F_J_20230302,2023-03-02,CB,F,J,238992.0,653.0,2023,3,9,3,0,2023-03


In [32]:
RD_trade = trade[trade['item'] == 'RD']
RD_trade

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
536,2020-10,RD,450,8,0,0,8
681,2021-04,RD,577,0,0,0,0
705,2021-05,RD,312,0,0,0,0
732,2021-06,RD,130,0,0,0,0
861,2021-11,RD,0,0,1,0,0
1254,2023-02,RD,4000,4,2,0,4


In [33]:
dr_merged = pd.merge(train, trade, left_on=['item', 'y-m'], right_on=['item', '기간'])
dr_merged

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m,기간,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,0,2019-01,2019-01,58368,172,0,0,172
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01,2019-01,58368,172,0,0,172
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01,2019-01,58368,172,0,0,172
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01,2019-01,58368,172,0,0,172
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01,2019-01,58368,172,0,0,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,BC,E,S,2104.0,2025.0,2023,2,8,4,0,2023-02,2023-02,24,0,332640,352,-352
48556,BC_E_S_20230225,2023-02-25,BC,E,S,1032.0,2353.0,2023,2,8,5,1,2023-02,2023-02,24,0,332640,352,-352
48557,BC_E_S_20230226,2023-02-26,BC,E,S,0.0,0.0,2023,2,8,6,1,2023-02,2023-02,24,0,332640,352,-352
48558,BC_E_S_20230227,2023-02-27,BC,E,S,2200.0,2488.0,2023,2,9,0,0,2023-02,2023-02,24,0,332640,352,-352


In [34]:
not_rd_merged = pd.merge(not_RD_train, trade, left_on=['item', 'y-m'], right_on=['item', '기간'])
dr_merged

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m,기간,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,0,2019-01,2019-01,58368,172,0,0,172
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01,2019-01,58368,172,0,0,172
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01,2019-01,58368,172,0,0,172
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01,2019-01,58368,172,0,0,172
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01,2019-01,58368,172,0,0,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,BC,E,S,2104.0,2025.0,2023,2,8,4,0,2023-02,2023-02,24,0,332640,352,-352
48556,BC_E_S_20230225,2023-02-25,BC,E,S,1032.0,2353.0,2023,2,8,5,1,2023-02,2023-02,24,0,332640,352,-352
48557,BC_E_S_20230226,2023-02-26,BC,E,S,0.0,0.0,2023,2,8,6,1,2023-02,2023-02,24,0,332640,352,-352
48558,BC_E_S_20230227,2023-02-27,BC,E,S,2200.0,2488.0,2023,2,9,0,0,2023-02,2023-02,24,0,332640,352,-352


In [35]:
#원-핫 인코딩
dr_merged = pd.get_dummies(dr_merged, columns=['item', 'corporation', 'location'], drop_first=True)

In [36]:
dr_merged

Unnamed: 0,ID,timestamp,supply,price,year,month,week,weekday,holiday,y-m,...,item_CB,item_CR,item_RD,item_TG,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_S
0,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1,0,2019-01,...,False,False,False,True,False,False,False,False,False,False
1,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,1,2,0,2019-01,...,False,False,False,True,False,False,False,False,False,False
2,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,1,3,0,2019-01,...,False,False,False,True,False,False,False,False,False,False
3,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,1,4,0,2019-01,...,False,False,False,True,False,False,False,False,False,False
4,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,1,5,1,2019-01,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,2104.0,2025.0,2023,2,8,4,0,2023-02,...,False,False,False,False,False,False,False,True,False,True
48556,BC_E_S_20230225,2023-02-25,1032.0,2353.0,2023,2,8,5,1,2023-02,...,False,False,False,False,False,False,False,True,False,True
48557,BC_E_S_20230226,2023-02-26,0.0,0.0,2023,2,8,6,1,2023-02,...,False,False,False,False,False,False,False,True,False,True
48558,BC_E_S_20230227,2023-02-27,2200.0,2488.0,2023,2,9,0,0,2023-02,...,False,False,False,False,False,False,False,True,False,True


In [37]:
TG_train = train[train['item'] == 'TG']
TG_trade = trade[trade['item'] == 'TG']

In [38]:
TG_merged = pd.merge(TG_train, TG_trade, left_on=['item', 'y-m'], right_on=['item', '기간'])

In [39]:
#원-핫 인코딩
TG_merged = pd.get_dummies(TG_merged, columns=['item', 'corporation', 'location'], drop_first=True)
TG_merged

Unnamed: 0,ID,timestamp,supply,price,year,month,week,weekday,holiday,y-m,...,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,corporation_B,corporation_C,corporation_D,corporation_E,location_S
0,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1,0,2019-01,...,58368,172,0,0,172,False,False,False,False,False
1,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,1,2,0,2019-01,...,58368,172,0,0,172,False,False,False,False,False
2,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,1,3,0,2019-01,...,58368,172,0,0,172,False,False,False,False,False
3,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,1,4,0,2019-01,...,58368,172,0,0,172,False,False,False,False,False
4,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,1,5,1,2019-01,...,58368,172,0,0,172,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15195,TG_E_S_20230224,2023-02-24,25329.2,3026.0,2023,2,8,4,0,2023-02,...,6895,34,27765,98,-64,False,False,False,True,True
15196,TG_E_S_20230225,2023-02-25,30300.0,2946.0,2023,2,8,5,1,2023-02,...,6895,34,27765,98,-64,False,False,False,True,True
15197,TG_E_S_20230226,2023-02-26,0.0,0.0,2023,2,8,6,1,2023-02,...,6895,34,27765,98,-64,False,False,False,True,True
15198,TG_E_S_20230227,2023-02-27,24204.0,3418.0,2023,2,9,0,0,2023-02,...,6895,34,27765,98,-64,False,False,False,True,True


In [40]:
#TG LGBM 모델링
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 특성과 타겟 데이터 분리
columns_to_drop = ['price', 'ID', 'timestamp', '기간','수출 중량','수입 금액','무역수지','y-m']
X = TG_merged.drop(columns=columns_to_drop)
y = TG_merged['price']

# 데이터를 학습용과 테스트용으로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM 회귀 모델 생성
TG_lgbm_model = LGBMRegressor()

# LightGBM 모델 학습
TG_lgbm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = TG_lgbm_model.predict(X_test)

# 회귀 모델의 성능 평가 (RMSE 구하기)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 12160, number of used features: 13
[LightGBM] [Info] Start training from score 3190.266694
Root Mean Squared Error: 736.9042038638041


In [41]:
RD_train = train[train['item'] == 'RD']

In [42]:
#원-핫 인코딩
RD_encoding = pd.get_dummies(RD_train, columns=['item', 'corporation', 'location'], drop_first=True)
RD_encoding

Unnamed: 0,ID,timestamp,supply,price,year,month,week,weekday,holiday,y-m,corporation_C,corporation_D,corporation_E,corporation_F,location_S
31983,RD_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1,0,2019-01,False,False,False,False,False
31984,RD_A_J_20190102,2019-01-02,0.0,0.0,2019,1,1,2,0,2019-01,False,False,False,False,False
31985,RD_A_J_20190103,2019-01-03,37060.0,367.0,2019,1,1,3,0,2019-01,False,False,False,False,False
31986,RD_A_J_20190104,2019-01-04,19260.0,460.0,2019,1,1,4,0,2019-01,False,False,False,False,False
31987,RD_A_J_20190105,2019-01-05,32140.0,402.0,2019,1,1,5,1,2019-01,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,452440.0,468.0,2023,2,9,0,0,2023-02,False,False,False,True,False
59393,RD_F_J_20230228,2023-02-28,421980.0,531.0,2023,2,9,1,0,2023-02,False,False,False,True,False
59394,RD_F_J_20230301,2023-03-01,382980.0,574.0,2023,3,9,2,0,2023-03,False,False,False,True,False
59395,RD_F_J_20230302,2023-03-02,477220.0,523.0,2023,3,9,3,0,2023-03,False,False,False,True,False


In [43]:
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 특성과 타겟 데이터 분리
columns_to_drop = ['price', 'ID', 'timestamp','y-m']
X = RD_encoding.drop(columns=columns_to_drop)
y = RD_encoding['price']

# 데이터를 학습용과 테스트용으로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM 회귀 모델 생성
RD_lgbm_model = LGBMRegressor()

# LightGBM 모델 학습
RD_lgbm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = RD_lgbm_model.predict(X_test)

# 회귀 모델의 성능 평가 (RMSE 구하기)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 9747, number of used features: 11
[LightGBM] [Info] Start training from score 184.617831
Root Mean Squared Error: 61.4544381090566


In [58]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# R2 점수 계산
r2 = r2_score(y_test, y_pred)

# 평균 절대 오류 (MAE) 계산
mae = mean_absolute_error(y_test, y_pred)

# 평균 제곱 오류 (MSE) 계산
mse = mean_squared_error(y_test, y_pred)

# 루트 평균 제곱 오류 (RMSE) 계산
rmse = np.sqrt(mse)

# 모델 성능 평가 결과 출력
print(f'R2 점수: {r2:.4f}')
print(f'MAE: {mae:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')

R2 점수: 0.9680
MAE: 65.6582
MSE: 37264.9427
RMSE: 193.0413


In [44]:
Other_train = train[(train['item'] == 'BC') | (train['item'] == 'CR') | (train['item'] == 'CB')]
Other_trade = trade[(trade['item'] == 'BC') | (trade['item'] == 'CR') | (trade['item'] == 'CB')]

In [45]:
Other_merged = pd.merge(Other_train, Other_trade, left_on=['item', 'y-m'], right_on=['item', '기간'])

In [46]:
#원-핫 인코딩
Other_merged = pd.get_dummies(Other_merged, columns=['item', 'corporation', 'location'], drop_first=True)
Other_merged

Unnamed: 0,ID,timestamp,supply,price,year,month,week,weekday,holiday,y-m,...,수입 금액,무역수지,item_CB,item_CR,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_S
0,CR_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1,0,2019-01,...,2955,-2934,False,True,False,False,False,False,False,False
1,CR_A_J_20190102,2019-01-02,0.0,0.0,2019,1,1,2,0,2019-01,...,2955,-2934,False,True,False,False,False,False,False,False
2,CR_A_J_20190103,2019-01-03,0.0,0.0,2019,1,1,3,0,2019-01,...,2955,-2934,False,True,False,False,False,False,False,False
3,CR_A_J_20190104,2019-01-04,10240.0,1141.0,2019,1,1,4,0,2019-01,...,2955,-2934,False,True,False,False,False,False,False,False
4,CR_A_J_20190105,2019-01-05,8680.0,1133.0,2019,1,1,5,1,2019-01,...,2955,-2934,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31915,BC_E_S_20230224,2023-02-24,2104.0,2025.0,2023,2,8,4,0,2023-02,...,352,-352,False,False,False,False,False,True,False,True
31916,BC_E_S_20230225,2023-02-25,1032.0,2353.0,2023,2,8,5,1,2023-02,...,352,-352,False,False,False,False,False,True,False,True
31917,BC_E_S_20230226,2023-02-26,0.0,0.0,2023,2,8,6,1,2023-02,...,352,-352,False,False,False,False,False,True,False,True
31918,BC_E_S_20230227,2023-02-27,2200.0,2488.0,2023,2,9,0,0,2023-02,...,352,-352,False,False,False,False,False,True,False,True


In [47]:
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 특성과 타겟 데이터 분리
columns_to_drop = ['price', 'ID', 'timestamp', '기간','수출 중량','수입 금액','무역수지','y-m']
X = Other_merged.drop(columns=columns_to_drop)
y = Other_merged['price']

# 데이터를 학습용과 테스트용으로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM 회귀 모델 생성
Other_lgbm_model = LGBMRegressor()

# LightGBM 모델 학습
Other_lgbm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = Other_lgbm_model.predict(X_test)

# 회귀 모델의 성능 평가 (RMSE 구하기)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 565
[LightGBM] [Info] Number of data points in the train set: 25536, number of used features: 16
[LightGBM] [Info] Start training from score 512.733239
Root Mean Squared Error: 193.04129800393284


In [59]:
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score

# 특성과 타겟 데이터 분리
columns_to_drop = ['price', 'ID', 'timestamp', '기간','수출 중량','수입 금액','무역수지','y-m']
X = Other_merged.drop(columns=columns_to_drop)
y = Other_merged['price']

# K-Fold 교차 검증
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

# LightGBM 회귀 모델 생성
Other_lgbm_model = LGBMRegressor()

# K-Fold 교차 검증을 통한 평균 RMSE 계산
scores = cross_val_score(Other_lgbm_model, X, y, cv=kfold, scoring='neg_mean_squared_error')
rmse = np.sqrt(-np.mean(scores))

# 결과 출력
print(f'K-Fold 교차 검증을 통한 Root Mean Squared Error: {rmse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 565
[LightGBM] [Info] Number of data points in the train set: 25536, number of used features: 16
[LightGBM] [Info] Start training from score 512.733239
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 565
[LightGBM] [Info] Number of data points in the train set: 25536, number of used features: 16
[LightGBM] [Info] Start training from score 513.916197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno