## 회귀예측문제
- 성능이 우수한 예측모형 구축을 위해 적절한 데이터 전처리 수행
- 피처 엔지니어링, 회귀 알고리즘, 초매개변수 최적화, 모형 앙상블 활용
- 수험번호를 파일명으로 하여 결과 파일 만들기
- 제출한 모형의 성능은 RMSE, MAE를 평가지표로 하여 평가
- 종속변수 mpg

In [None]:
#데이터 파일 읽기
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the seaborn 'mpg' dataset; the 'mpg' column is the regression target.
df = sns.load_dataset('mpg')
target = df['mpg']
features = df.drop(columns=['mpg'])
# Hold out 20% of the rows as the test split (seed 42 keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)


In [None]:
# Preview the first five rows of the raw training features.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
18,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
376,4,91.0,68.0,2025,18.2,82,japan,mazda glc custom l
248,4,91.0,60.0,1800,16.4,78,japan,honda civic cvcc
177,4,115.0,95.0,2694,15.0,75,europe,audi 100ls


In [None]:
# Missing values: count the NaN entries in every training column.
missing_per_column = X_train.isnull().sum()
print(missing_per_column)

cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [None]:
# Impute missing horsepower values with the median of the TRAINING split only.
# The same value is reused for the test split so both splits are filled
# consistently and no statistic computed from test data enters preprocessing.
horsepower_median = X_train['horsepower'].median()
X_train['horsepower'] = X_train['horsepower'].fillna(horsepower_median)
X_test['horsepower'] = X_test['horsepower'].fillna(horsepower_median)

In [None]:
# Re-count NaN entries per training column after imputation.
remaining_missing = X_train.isnull().sum()
print(remaining_missing)

cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [None]:
# Label-encode the string columns.
# The encoder is fitted on the training split only and the resulting
# string -> code mapping is reused for the test split, so a given string
# always maps to the same integer in both splits. Fitting a second encoder on
# the test split would assign unrelated codes to the same strings.
label = ['origin', 'name']
from sklearn.preprocessing import LabelEncoder
for col in label:
    encoder = LabelEncoder().fit(X_train[col])
    X_train[col] = encoder.transform(X_train[col])
    # classes_ is sorted; a class's position is the code transform() assigns.
    code_of = {cls: code for code, cls in enumerate(encoder.classes_)}
    # Strings never seen during fitting get the sentinel code -1.
    X_test[col] = X_test[col].map(code_of).fillna(-1).astype('int64')

In [None]:
# Preview the first five training rows after label encoding.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
3,8,304.0,150.0,3433,12.0,70,2,10
18,4,97.0,88.0,2130,14.5,70,1,78
376,4,91.0,68.0,2025,18.2,82,1,149
248,4,91.0,60.0,1800,16.4,78,1,143
177,4,115.0,95.0,2694,15.0,75,0,13


In [None]:
# Convert the listed columns to category dtype and one-hot encode them.
category = ['origin']
category_dtypes = {col: 'category' for col in category}
X_train = pd.get_dummies(X_train.astype(category_dtypes))
X_test = pd.get_dummies(X_test.astype(category_dtypes))
# Dummies are generated per split, so a level that is absent from one split
# would produce a different column set. Force the test frame onto the training
# columns (same names, same order); dummy columns missing in test become 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Preview the first five training rows after one-hot encoding.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2
3,8,304.0,150.0,3433,12.0,70,10,0,0,1
18,4,97.0,88.0,2130,14.5,70,78,0,1,0
376,4,91.0,68.0,2025,18.2,82,149,0,1,0
248,4,91.0,60.0,1800,16.4,78,143,0,1,0
177,4,115.0,95.0,2694,15.0,75,13,1,0,0


In [None]:
# Derived feature: horsepower quintile (integer bin index 0-4).
# The quantile edges are learned from the training split and then reused for
# the test split, so the same horsepower value always falls into the same bin.
# Computing separate quantiles on the test split would shift the bin edges.
train_bins, hp_edges = pd.qcut(
    X_train['horsepower'], 5, labels=False, retbins=True
)
X_train['horsepower_qcut'] = train_bins
# Open the outermost edges so test values outside the training range still
# land in the first or last bin instead of becoming NaN.
hp_edges[0], hp_edges[-1] = -np.inf, np.inf
X_test['horsepower_qcut'] = pd.cut(
    X_test['horsepower'], bins=hp_edges, labels=False
)

In [None]:
# Preview the first five training rows with the new horsepower_qcut column.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
3,8,304.0,150.0,3433,12.0,70,10,0,0,1,4
18,4,97.0,88.0,2130,14.5,70,78,0,1,0,1
376,4,91.0,68.0,2025,18.2,82,149,0,1,0,0
248,4,91.0,60.0,1800,16.4,78,143,0,1,0,0
177,4,115.0,95.0,2694,15.0,75,13,1,0,0,2


In [None]:
# 5. Scaling: min-max scale the continuous columns.
# The scaler is fitted on the training split only and then applied to both
# splits. The scaler object is named `minmax` rather than `min` so that the
# builtin min() is not shadowed for the rest of the notebook.
from sklearn.preprocessing import MinMaxScaler
scaler = ['displacement', 'horsepower', 'weight']  # columns to scale
minmax = MinMaxScaler()
minmax.fit(X_train[scaler])

X_train[scaler] = minmax.transform(X_train[scaler])
X_test[scaler] = minmax.transform(X_test[scaler])

In [None]:
# Preview the first five training rows after scaling.
X_train.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
3,8,0.609819,0.581006,0.516019,12.0,70,10,0,0,1,4
18,4,0.074935,0.234637,0.146583,14.5,70,78,0,1,0,1
376,4,0.059432,0.122905,0.116813,18.2,82,149,0,1,0,0
248,4,0.059432,0.078212,0.05302,16.4,78,143,0,1,0,0
177,4,0.121447,0.273743,0.306493,15.0,75,13,1,0,0,2


In [None]:
# 6. Data split: carve a 20% validation split out of the training data,
# using seed 42.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, **split_options
)


In [None]:
# Report (rows, columns) of the training and validation feature frames.
for split_frame in (X_train, X_valid):
    print(split_frame.shape)

(254, 11)
(64, 11)


In [None]:
# Model training: fit a linear regression on the training split and predict
# the validation split.
from sklearn.linear_model import LinearRegression
model1 = LinearRegression().fit(X_train, y_train)
pred1 = model1.predict(X_valid)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor fitted on the training split.
# The seed is fixed (matching the random_state=42 used for the data splits)
# so that the validation scores and the submitted predictions are repeatable
# from run to run.
model2 = RandomForestRegressor(random_state=42)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_valid)

In [None]:
# 8. Ensemble (stacking)
# The base models' predictions become meta-features from which a final
# estimator predicts again.
from sklearn.ensemble import StackingRegressor
estimators = [
    ('lr', model1),
    ('rf', model2),
]
model3 = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(),
).fit(X_train, y_train)
pred3 = model3.predict(X_valid)

In [None]:
# Display the stacking model's predictions for the validation split.
pred3

array([22.653, 32.601, 13.88 , 15.33 , 12.05 , 20.3  , 14.175, 22.539,
       23.328, 19.692, 32.972, 35.22 , 33.331, 23.797, 17.255, 15.605,
       33.595, 27.143, 20.917, 23.271, 20.451, 26.68 , 23.178, 18.79 ,
       18.689, 27.924, 19.029, 33.058, 25.479, 11.68 , 39.9  , 12.29 ,
       26.642, 27.705, 20.641, 22.519, 26.939, 13.755, 18.311, 15.947,
       32.732, 33.092, 38.843, 29.685, 33.177, 35.197, 33.11 , 22.358,
       13.81 , 27.139, 34.063, 14.1  , 19.999, 24.865, 21.06 , 32.456,
       27.477, 17.566, 17.435, 18.193, 16.125, 24.354, 24.319, 13.88 ])

In [None]:
# 9. Model evaluation: validation-split MSE for each of the three models.
from sklearn.metrics import mean_squared_error
mse_report = (
    ('선형회귀 MSE', pred1),
    ('랜포 MSE', pred2),
    ('스태킹 MSE', pred3),
)
for title, preds in mse_report:
    print(title, mean_squared_error(y_valid, preds))

선형회귀 MSE 12.96661033747
랜포 MSE 9.59330178125
스태킹 MSE 11.642033187499997


In [None]:
# Compare the models on the validation split with both task metrics.
# The task is scored on RMSE and MAE, so MAE is reported alongside RMSE.
# Observation from the recorded run: the random forest had the lowest error.
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('선형회귀 RMSE', np.sqrt(mean_squared_error(y_valid,pred1)))
print('랜포 RMSE',  np.sqrt(mean_squared_error(y_valid,pred2)))
print('스태킹 RMSE', np.sqrt(mean_squared_error(y_valid,pred3)))

print('선형회귀 MAE', mean_absolute_error(y_valid, pred1))
print('랜포 MAE', mean_absolute_error(y_valid, pred2))
print('스태킹 MAE', mean_absolute_error(y_valid, pred3))

선형회귀 RMSE 3.6009179853851156
랜포 RMSE 3.0973055679493426
스태킹 RMSE 3.4120423777409328


In [None]:
# Hyperparameter tuning: 3-fold grid search over the random forest's
# n_estimators and max_depth.
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[50,100],'max_depth':[4,6]}
# Seed the forest so the search result is repeatable between runs.
model4 = RandomForestRegressor(random_state=42)
# Rank candidates by (negated) RMSE, the metric the task is evaluated on,
# instead of the regressor's default R^2 score.
clf = GridSearchCV(
    estimator=model4,
    param_grid=parameters,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
# Arguments: features (independent variables), target (dependent variable).
clf.fit(X_train,y_train)
print("최적의 파라미터:",clf.best_params_)

최적의 파라미터: {'max_depth': 6, 'n_estimators': 100}


In [None]:
# Save the submission file.
# The random forest (model2) scored best on validation, so its test-split
# predictions are written out, keyed by the original row index.
test_predictions = model2.predict(X_test)
result = pd.DataFrame(test_predictions)[0]
submission = pd.DataFrame({'id': X_test.index, 'result': result})
submission.to_csv('00400.csv', index=False)

In [None]:
# Read the saved submission back and preview its first five rows.
check = pd.read_csv('00400.csv')
check.head(n=5)

Unnamed: 0,id,result
0,198,30.256
1,396,29.382
2,33,20.743
3,208,15.363
4,93,14.385
