In [44]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

In [45]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [46]:
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [47]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))

In [48]:
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5


In [49]:
train_x = train_df

tg_idx = train_x[(train_x["item"]=="TG") & (train_x["price(원/kg)"]>20000)].index
rd_idx = train_x[(train_x["item"]=="RD") & (train_x["price(원/kg)"]>5000)].index
bc_idx = train_x[(train_x["item"]=="BC") & (train_x["price(원/kg)"]>8000)].index
cb_idx = train_x[(train_x["item"]=="CB") & (train_x["price(원/kg)"]>2300)].index

train_x.loc[tg_idx,"price(원/kg)"] = train_x[(train_x["item"]=="TG") & (train_x["price(원/kg)"]!=0)]["price(원/kg)"].mean()
train_x.loc[rd_idx,"price(원/kg)"] = train_x[(train_x["item"]=="RD") & (train_x["price(원/kg)"]!=0)]["price(원/kg)"].mean()
train_x.loc[bc_idx,"price(원/kg)"] = train_x[(train_x["item"]=="BC") & (train_x["price(원/kg)"]!=0)]["price(원/kg)"].mean()
train_x.loc[cb_idx,"price(원/kg)"] = train_x[(train_x["item"]=="CB") & (train_x["price(원/kg)"]!=0)]["price(원/kg)"].mean()



In [50]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_x.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
train_y = train_df['price(원/kg)']

test_x = test_df.drop(columns=['ID', 'timestamp'])

In [51]:
train_x

Unnamed: 0,item,corporation,location,year,month,day
0,TG,A,J,2019,1,1
1,TG,A,J,2019,1,2
2,TG,A,J,2019,1,3
3,TG,A,J,2019,1,4
4,TG,A,J,2019,1,5
...,...,...,...,...,...,...
59392,RD,F,J,2023,2,27
59393,RD,F,J,2023,2,28
59394,RD,F,J,2023,3,1
59395,RD,F,J,2023,3,2


In [52]:
import pandas as pd

# 질적 변수 리스트
qual_col = ['item', 'corporation', 'location']

# 학습 데이터에 대해 get_dummies 적용
train_dummies = pd.get_dummies(train_x[qual_col])

# 테스트 데이터에 대해 get_dummies 적용
test_dummies = pd.get_dummies(test_x[qual_col])

# get_dummies는 학습 데이터와 테스트 데이터에서 생성된 열이 다를 수 있으므로, 열을 일치시키기
train_dummies, test_dummies = train_dummies.align(test_dummies, join='outer', axis=1, fill_value=0)

# 기존의 질적 변수를 제거하고 원핫 인코딩된 데이터를 추가
train_x = train_x.drop(columns=qual_col).join(train_dummies)
test_x = test_x.drop(columns=qual_col).join(test_dummies)

print('Done.')


Done.


In [53]:
train_x.shape

(59397, 16)

In [54]:
train_x.head()

Unnamed: 0,year,month,day,item_BC,item_CB,item_CR,item_RD,item_TG,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S
0,2019,1,1,False,False,False,False,True,True,False,False,False,False,False,True,False
1,2019,1,2,False,False,False,False,True,True,False,False,False,False,False,True,False
2,2019,1,3,False,False,False,False,True,True,False,False,False,False,False,True,False
3,2019,1,4,False,False,False,False,True,True,False,False,False,False,False,True,False
4,2019,1,5,False,False,False,False,True,True,False,False,False,False,False,True,False


In [55]:
model = RandomForestRegressor()
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [56]:
preds

array([3628.42, 3819.43,  953.95, ...,  402.73,  405.2 ,  394.92])

In [57]:
submission = pd.read_csv('./sample_submission.csv')
submission['answer'] = preds
submission
submission.to_csv('./baseline_submission.csv', index=False)

2