In [None]:
# Used car 데이터 세트
# 주어진 훈련 데이터 세트를 활용해 중고차 판매 가격을 예측
# 예측 결과를 csv 파일로 제출

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [11]:
X_train = pd.read_csv('5_4_x_train.csv', encoding='CP949')
y_train = pd.read_csv('5_4_y_train.csv', encoding='CP949')
X_test = pd.read_csv('5_4_x_test.csv', encoding='CP949')

In [12]:
X_train

Unnamed: 0,id,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,1,A1,2019,Automatic,3500,Petrol,145,40.9,2.0
1,2,RS4,2020,Semi-Auto,2500,Petrol,145,28.8,2.9
2,3,A8,2019,Semi-Auto,500,Diesel,145,40.4,3.0
3,4,Q5,2019,Semi-Auto,5089,Diesel,150,38.2,2.0
4,5,A5,2020,Semi-Auto,4951,Diesel,145,51.4,2.0
...,...,...,...,...,...,...,...,...,...
7463,7464,A3,2016,Manual,22633,Petrol,30,58.9,1.4
7464,7465,Q2,2017,Manual,13272,Diesel,145,64.2,1.6
7465,7466,A1,2019,Semi-Auto,5000,Petrol,145,40.9,2.0
7466,7467,Q3,2019,Manual,10,Diesel,145,42.8,2.0


In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7468 entries, 0 to 7467
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            7468 non-null   int64  
 1   model         7468 non-null   object 
 2   year          7468 non-null   int64  
 3   transmission  7468 non-null   object 
 4   mileage       7468 non-null   int64  
 5   fuelType      7468 non-null   object 
 6   tax           7468 non-null   int64  
 7   mpg           7468 non-null   float64
 8   engineSize    7468 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 525.2+ KB


In [14]:
X_train.drop(columns=['id'], inplace=True)
X_test.drop(columns=['id'], inplace=True)

print(X_train.shape, y_train.shape, X_test.shape)

(7468, 8) (7468, 2) (3200, 8)


In [15]:
X_train.head()

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2019,Automatic,3500,Petrol,145,40.9,2.0
1,RS4,2020,Semi-Auto,2500,Petrol,145,28.8,2.9
2,A8,2019,Semi-Auto,500,Diesel,145,40.4,3.0
3,Q5,2019,Semi-Auto,5089,Diesel,150,38.2,2.0
4,A5,2020,Semi-Auto,4951,Diesel,145,51.4,2.0


In [16]:
y_train.head()

Unnamed: 0,id,price
0,1,21350
1,2,69691
2,3,42950
3,4,31470
4,5,27495


In [18]:
X_train.isnull().sum()

model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [19]:
y_train.isnull().sum()

id       0
price    0
dtype: int64

In [20]:
X_test.isnull().sum()

model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [22]:
X_train_onehot = pd.get_dummies(X_train, drop_first=True)
X_test_onehot = pd.get_dummies(X_test, drop_first=True)

In [43]:
X_test_onehot = X_test_onehot.reindex(columns=X_train_onehot.columns, fill_value=0)
X_test_onehot

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_ A2,model_ A3,model_ A4,model_ A5,model_ A6,...,model_ S4,model_ S5,model_ S8,model_ SQ5,model_ SQ7,model_ TT,transmission_Manual,transmission_Semi-Auto,fuelType_Hybrid,fuelType_Petrol
0,2020,4000,145,54.3,3.0,0,False,True,False,False,...,False,0,0,False,False,False,False,True,False,False
1,2016,31300,30,61.4,2.0,0,False,False,False,True,...,False,0,0,False,False,False,True,False,False,False
2,2019,11296,150,38.2,2.0,0,False,False,False,False,...,False,0,0,False,False,False,False,True,False,False
3,2018,6500,145,55.4,1.4,0,False,False,False,False,...,False,0,0,False,False,False,True,False,False,True
4,2016,31524,30,60.1,1.4,0,True,False,False,False,...,False,0,0,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,2018,15000,150,54.3,1.4,0,False,False,False,False,...,False,0,0,False,False,False,True,False,False,True
3196,2016,29793,0,80.7,1.6,0,False,False,False,False,...,False,0,0,False,False,False,True,False,False,False
3197,2017,27554,145,61.4,2.0,0,False,False,False,True,...,False,0,0,False,False,False,True,False,False,False
3198,2019,5711,145,41.5,1.5,0,True,False,False,False,...,False,0,0,False,False,False,False,True,False,True


In [30]:
# 선형회귀 모델
linear_model = LinearRegression()
linear_model.fit(X_train_onehot, y_train)
y_pred_linear = linear_model.predict(X_train_onehot)
rmse_linear = np.sqrt(mean_squared_error(y_train, y_pred_linear))
print(rmse_linear)

3042.6282272924996


In [44]:
y_pred_linear

array([[ 3755.34858345, 27429.64373096],
       [ 3236.27285433, 58047.03412042],
       [ 3348.84312569, 41038.28228144],
       ...,
       [ 3641.60769613, 27485.00953278],
       [ 3573.17064988, 29415.66124726],
       [ 3766.20205515, 23216.28747174]])

In [32]:
# 랜덤포레스트
random_forest_model = RandomForestRegressor()
random_forest_model.fit(X_train_onehot, y_train)
y_pred_forest = random_forest_model.predict(X_train_onehot)
rmse_rf = np.sqrt(mean_squared_error(y_train, y_pred_forest))
print(rmse_rf)

949.9649269050825


In [40]:
print(y_pred_forest)

[[ 1631.32       21944.96      ]
 [ 1115.41       69435.29      ]
 [ 1343.14       43328.78      ]
 ...
 [ 4357.62275    21417.2925    ]
 [ 5873.4975     29224.30910714]
 [ 6121.71       16531.77      ]]


In [33]:
# 신경망
nn_model = MLPRegressor(max_iter=500)
nn_model.fit(X_train_onehot, y_train)
y_pred_nn = nn_model.predict(X_train_onehot)
rmse_nn = np.sqrt(mean_squared_error(y_train, y_pred_nn))
print(rmse_nn)

4188.879782364662




In [45]:
result = pd.DataFrame({
    'id': range(1, len(y_pred_forest)+1),
    'price': y_pred_forest })

print(result)

ValueError: Per-column arrays must each be 1-dimensional