In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# 주어진 train 데이터 확인
df_train = pd.read_csv('/content/drive/MyDrive/DACON/전기차 가격 예측 해커톤: 데이터로 EV를 읽다!/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/DACON/전기차 가격 예측 해커톤: 데이터로 EV를 읽다!/test.csv')
df_train

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.000,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.200,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...,...,...
7492,TRAIN_7492,H사,ION5,Brand New,,AWD,3773,10,No,0,35.95
7493,TRAIN_7493,B사,i3,Pre-Owned,46.000,RWD,135411,2,No,0,23.40
7494,TRAIN_7494,P사,TayCT,Brand New,,AWD,1363,2,No,0,120.00
7495,TRAIN_7495,B사,i3,Nearly New,56.000,RWD,39445,6,No,2,24.00


In [3]:
# train 데이터 정보 확인
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [4]:
# 배터리용량을 제거한 후 가격을 예측
df_train.drop(['ID', '배터리용량'], axis = 1, inplace = True)
df_train

Unnamed: 0,제조사,모델,차량상태,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,P사,TayGTS,Nearly New,AWD,13642,0,No,2,159.66
1,K사,Niro,Nearly New,FWD,10199,6,No,0,28.01
2,A사,eT,Brand New,AWD,2361,7,No,0,66.27
3,A사,RSeTGT,Nearly New,AWD,21683,3,No,0,99.16
4,B사,i5,Pre-Owned,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...
7492,H사,ION5,Brand New,AWD,3773,10,No,0,35.95
7493,B사,i3,Pre-Owned,RWD,135411,2,No,0,23.40
7494,P사,TayCT,Brand New,AWD,1363,2,No,0,120.00
7495,B사,i3,Nearly New,RWD,39445,6,No,2,24.00


In [5]:
# 문자형 데이터를 LabelEncoding 변환
column_list = list(df_train.columns)
column_list = column_list[0:8]

for i in column_list:
  encoder = LabelEncoder()
  encoder.fit(df_train[i])
  df_train[i] = encoder.transform(df_train[i])

df_train

Unnamed: 0,제조사,모델,차량상태,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,4,16,1,0,3040,0,0,2,159.66
1,3,10,1,1,2878,6,0,0,28.01
2,0,17,0,0,695,7,0,0,66.27
3,0,12,1,0,3449,3,0,0,99.16
4,1,19,2,0,6606,1,0,0,62.02
...,...,...,...,...,...,...,...,...,...
7492,2,2,0,0,1097,10,0,0,35.95
7493,1,18,2,2,6056,2,0,0,23.40
7494,4,15,0,0,397,2,0,0,120.00
7495,1,18,1,2,4357,6,0,2,24.00


In [37]:
# RandomForestRegressor 객체 생성
rf_rgs = RandomForestRegressor(n_estimators = 89, max_depth = 9, random_state = 42)

# x, y 정의
x = df_train.drop(columns = ['가격(백만원)'])
y = df_train['가격(백만원)']

# 데이터 분할 및 모델학습
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
rf_rgs.fit(x_train, y_train)

# 예측값 도출
y_pred = rf_rgs.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE Score: {rmse}')

RMSE Score: 1.4634575984261655


In [38]:
# test.csv 데이터를 LabelEncoding 변환
df_test.drop(['배터리용량'], axis = 1, inplace = True)

column_list_new = list(df_test.columns)
column_list_new = column_list[1:-1]

for i in column_list:
  encoder = LabelEncoder()
  encoder.fit(df_test[i])
  df_test[i] = encoder.transform(df_test[i])

df_test

Unnamed: 0,ID,제조사,모델,차량상태,구동방식,주행거리(km),보증기간(년),사고이력,연식(년)
0,TEST_000,4,15,1,0,398,2,0,0
1,TEST_001,1,20,0,0,267,8,0,0
2,TEST_002,1,19,0,2,254,7,1,0
3,TEST_003,2,2,1,0,376,7,0,1
4,TEST_004,3,0,0,1,272,10,0,0
...,...,...,...,...,...,...,...,...,...
841,TEST_841,4,16,2,0,712,2,0,0
842,TEST_842,6,1,2,0,651,0,0,0
843,TEST_843,6,1,2,0,725,0,0,0
844,TEST_844,0,11,1,0,408,4,0,0


In [39]:
# 학습된 모델을 이용한 가격 예측
x = df_test.drop(columns = ['ID'])
y_price = rf_rgs.predict(x)

df_trial = pd.DataFrame({'ID':df_test['ID'], '가격(백만원)':y_price})
df_trial

Unnamed: 0,ID,가격(백만원)
0,TEST_000,125.939305
1,TEST_001,79.993056
2,TEST_002,64.872232
3,TEST_003,35.456248
4,TEST_004,47.894264
...,...,...
841,TEST_841,154.114964
842,TEST_842,39.000701
843,TEST_843,39.000701
844,TEST_844,59.190308


In [40]:
# 제출 파일 생성
df_trial.to_csv('/content/drive/MyDrive/DACON/전기차 가격 예측 해커톤: 데이터로 EV를 읽다!/submission_0109.csv', index = False)