# VvotingRegression (보팅 회귀)
- 여러 회귀모형 알고리즘을 결합하고 평균 예측값을 반환한다.


## 패키지 로딩

In [135]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import numpy as np
import pandas as pd


## 데이터 로딩, 스케일링, 분할

In [136]:
boston = pd.read_csv('dataset/HousingData.csv')
boston = boston.fillna(method='ffill')


In [137]:
X = boston.drop('MEDV',axis =1)
y = boston['MEDV']
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,2.94,36.2


In [138]:
scaled_X = StandardScaler().fit_transform(X)

In [139]:
X_train,X_test,y_train,y_test = train_test_split(scaled_X,y,test_size=0.3,random_state=10)

## 모델 생성

In [140]:
lasso = Lasso(alpha=0.03)
ridge = Ridge(alpha=1)
linear = LinearRegression()

vc_r = VotingRegressor(estimators=[('LASSO',lasso),('RIDGE',ridge),('LINEAR',linear)])
vc_r.fit(X_train,y_train)

## 예측 및 평가

In [141]:
y_hat = vc_r.predict(X_test)
r_square = vc_r.score(X_test, y_test)
print(f'결정계숫:{r_square:.3f}')
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_hat))
print(f'RMSE:{rmse:.3f}')

결정계숫:0.699
RMSE:5.425


# VotingClassifier (보팅 분류)


## 패키지 로딩

In [142]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


## 데이터 로딩, 정규화, 분할

In [143]:
X, y = load_breast_cancer(return_X_y=True)

scaled_X = StandardScaler().fit_transform(X)


In [144]:
X_train,X_test,y_train,y_test = train_test_split(scaled_X,y,train_size=0.7,random_state=0,stratify=y)

## 모델 생성 및 학습

In [145]:
logistic = LogisticRegression()
knn = KNeighborsClassifier()

# voting: 하드 보팅은 'hard', 소프트 보팅은 'soft'
vo_c = VotingClassifier(estimators=[('LOGISTIC',logistic),("KNN",knn)],voting='soft')
vo_c.fit(X_train,y_train)

## 예측 및 평가

In [146]:
y_hat = vo_c.predict(X_test)
print(f'정확도:{metrics.accuracy_score(y_test,y_hat):.3f}')
auc = metrics.roc_auc_score(y_test,vo_c.predict_proba(X_test)[:,1])
print(f'auc는 :{auc:.3f}')

정확도:0.947
auc는 :0.993


# GradientBoostingClassifier(부스팅 분류)
- 랜덤포레스트와 같이 의사결정 나무 모델을 부스팅 방법으로 활용하는 모델
- 이전 예측기가 만든 잔여 오차(residual error)에 ㅐ로운 예측기로 학습 시킴

## 패키지 로딩

In [147]:
from sklearn.ensemble import GradientBoostingClassifier

## 데이터 로딩, 정규화 ,분할

In [148]:
X, y = load_breast_cancer(return_X_y=True)

scaled_X = StandardScaler().fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(scaled_X,y,train_size=0.7,random_state=0,stratify=y)


## 모델 생성 및 학습

In [149]:
gb_c = GradientBoostingClassifier(random_state=0)
gb_c.fit(X_train,y_train)

In [150]:
y_hat = gb_c.predict(X_test)
print(f'정확도:{metrics.accuracy_score(y_test,y_hat):.3f}')
auc = metrics.roc_auc_score(y_test,gb_c.predict_proba(X_test)[:,1])
print(f'auc는 :{auc:.3f}')

정확도:0.942
auc는 :0.987


# GradientBoostingRegression (부스팅 회귀)


In [151]:
airport = pd.read_csv("비행기 연착 추측 분류/Airport Codes Dataset.csv")
airport2 =airport.rename(columns={'city':'Ori-city','state':'Ori-state','name':'Ori-airport'})
airport2

Unnamed: 0,airport_id,Ori-city,Ori-state,Ori-airport
0,10165,Adak Island,AK,Adak
1,10299,Anchorage,AK,Ted Stevens Anchorage International
2,10304,Aniak,AK,Aniak Airport
3,10754,Barrow,AK,Wiley Post/Will Rogers Memorial
4,10551,Bethel,AK,Bethel Airport
...,...,...,...,...
360,11233,Cheyenne,WY,Cheyenne Regional/Jerry Olson Field
361,11097,Cody,WY,Yellowstone Regional
362,11865,Gillette,WY,Gillette Campbell County
363,12441,Jackson,WY,Jackson Hole


In [152]:
flight = pd.read_csv("비행기 연착 추측 분류/Flight on-time performance.csv")
flight

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepTimeBlk,DepDelay,DepDel15,CRSArrTime,ArrTimeBlk,ArrDelay,ArrDel15,Cancelled,Diverted
0,2011,4,10,6,4,WN,13495,12191,1435,1400-1459,2.0,0.0,1550,1500-1559,-6.0,0.0,0,0
1,2011,4,10,6,4,WN,13495,12191,1330,1300-1359,-4.0,0.0,1445,1400-1459,-12.0,0.0,0,0
2,2011,4,10,6,4,WN,13495,12191,1030,1000-1059,-2.0,0.0,1145,1100-1159,-14.0,0.0,0,0
3,2011,4,10,6,4,WN,13495,12889,1900,1900-1959,0.0,0.0,2055,2000-2059,-6.0,0.0,0,0
4,2011,4,10,6,4,WN,13495,12889,1340,1300-1359,-1.0,0.0,1530,1500-1559,2.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504392,2011,4,10,17,1,DL,11433,14635,730,0700-0759,-4.0,0.0,1022,1000-1059,-10.0,0.0,0,0
504393,2011,4,10,17,1,DL,14771,11433,705,0700-0759,-7.0,0.0,1441,1400-1459,-27.0,0.0,0,0
504394,2011,4,10,17,1,DL,11433,14869,1725,1700-1759,1.0,0.0,1924,1900-1959,-4.0,0.0,0,0
504395,2011,4,10,17,1,DL,14679,12478,2243,2200-2259,9.0,0.0,700,0700-0759,-10.0,0.0,0,0


In [153]:
flight_outer = pd.merge(airport2,flight, left_on='airport_id',right_on='OriginAirportID')
print(flight_outer.shape)

(504397, 22)


In [154]:
# flight_outer.drop('airport_id', axis =1,inplace = TRUE)

In [155]:
airport = pd.read_csv("비행기 연착 추측 분류/Airport Codes Dataset.csv")
airport3 =airport.rename(columns={'city':'Dest-city','state':'Dest-state','name':'Dest-airport'})
airport3

Unnamed: 0,airport_id,Dest-city,Dest-state,Dest-airport
0,10165,Adak Island,AK,Adak
1,10299,Anchorage,AK,Ted Stevens Anchorage International
2,10304,Aniak,AK,Aniak Airport
3,10754,Barrow,AK,Wiley Post/Will Rogers Memorial
4,10551,Bethel,AK,Bethel Airport
...,...,...,...,...
360,11233,Cheyenne,WY,Cheyenne Regional/Jerry Olson Field
361,11097,Cody,WY,Yellowstone Regional
362,11865,Gillette,WY,Gillette Campbell County
363,12441,Jackson,WY,Jackson Hole


In [156]:
flight_outer = pd.merge(airport3,flight_outer, left_on='airport_id',right_on='OriginAirportID',how='left')
print(flight_outer.shape)
flight_outer

(504483, 26)


Unnamed: 0,airport_id_x,Dest-city,Dest-state,Dest-airport,airport_id_y,Ori-city,Ori-state,Ori-airport,Year,Quarter,...,CRSDepTime,DepTimeBlk,DepDelay,DepDel15,CRSArrTime,ArrTimeBlk,ArrDelay,ArrDel15,Cancelled,Diverted
0,10165,Adak Island,AK,Adak,10165.0,Adak Island,AK,Adak,2011.0,4.0,...,1800.0,1800-1859,25.0,1.0,2143.0,2100-2159,14.0,0.0,0.0,0.0
1,10165,Adak Island,AK,Adak,10165.0,Adak Island,AK,Adak,2011.0,4.0,...,1800.0,1800-1859,5.0,0.0,2143.0,2100-2159,8.0,0.0,0.0,0.0
2,10165,Adak Island,AK,Adak,10165.0,Adak Island,AK,Adak,2011.0,4.0,...,1800.0,1800-1859,-8.0,0.0,2143.0,2100-2159,-13.0,0.0,0.0,0.0
3,10165,Adak Island,AK,Adak,10165.0,Adak Island,AK,Adak,2011.0,4.0,...,1800.0,1800-1859,66.0,1.0,2143.0,2100-2159,58.0,1.0,0.0,0.0
4,10165,Adak Island,AK,Adak,10165.0,Adak Island,AK,Adak,2011.0,4.0,...,1800.0,1800-1859,22.0,1.0,2143.0,2100-2159,21.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504478,14543,Rock Springs,WY,Rock Springs Sweetwater County,14543.0,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,...,2110.0,2100-2159,-9.0,0.0,2219.0,2200-2259,-22.0,0.0,0.0,0.0
504479,14543,Rock Springs,WY,Rock Springs Sweetwater County,14543.0,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,...,2110.0,2100-2159,-20.0,0.0,2219.0,2200-2259,-31.0,0.0,0.0,0.0
504480,14543,Rock Springs,WY,Rock Springs Sweetwater County,14543.0,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,...,2110.0,2100-2159,-8.0,0.0,2219.0,2200-2259,-15.0,0.0,0.0,0.0
504481,14543,Rock Springs,WY,Rock Springs Sweetwater County,14543.0,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,...,2110.0,2100-2159,-14.0,0.0,2219.0,2200-2259,-29.0,0.0,0.0,0.0


In [157]:
flight_outer =flight_outer.drop('airport_id_x',axis=1)
flight_outer =flight_outer.drop('airport_id_y',axis=1)
flight_outer

Unnamed: 0,Dest-city,Dest-state,Dest-airport,Ori-city,Ori-state,Ori-airport,Year,Quarter,Month,DayofMonth,...,CRSDepTime,DepTimeBlk,DepDelay,DepDel15,CRSArrTime,ArrTimeBlk,ArrDelay,ArrDel15,Cancelled,Diverted
0,Adak Island,AK,Adak,Adak Island,AK,Adak,2011.0,4.0,10.0,2.0,...,1800.0,1800-1859,25.0,1.0,2143.0,2100-2159,14.0,0.0,0.0,0.0
1,Adak Island,AK,Adak,Adak Island,AK,Adak,2011.0,4.0,10.0,6.0,...,1800.0,1800-1859,5.0,0.0,2143.0,2100-2159,8.0,0.0,0.0,0.0
2,Adak Island,AK,Adak,Adak Island,AK,Adak,2011.0,4.0,10.0,9.0,...,1800.0,1800-1859,-8.0,0.0,2143.0,2100-2159,-13.0,0.0,0.0,0.0
3,Adak Island,AK,Adak,Adak Island,AK,Adak,2011.0,4.0,10.0,13.0,...,1800.0,1800-1859,66.0,1.0,2143.0,2100-2159,58.0,1.0,0.0,0.0
4,Adak Island,AK,Adak,Adak Island,AK,Adak,2011.0,4.0,10.0,16.0,...,1800.0,1800-1859,22.0,1.0,2143.0,2100-2159,21.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504478,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,10.0,15.0,...,2110.0,2100-2159,-9.0,0.0,2219.0,2200-2259,-22.0,0.0,0.0,0.0
504479,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,10.0,22.0,...,2110.0,2100-2159,-20.0,0.0,2219.0,2200-2259,-31.0,0.0,0.0,0.0
504480,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,10.0,14.0,...,2110.0,2100-2159,-8.0,0.0,2219.0,2200-2259,-15.0,0.0,0.0,0.0
504481,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,2011.0,4.0,10.0,23.0,...,2110.0,2100-2159,-14.0,0.0,2219.0,2200-2259,-29.0,0.0,0.0,0.0


In [158]:
flight_outer = flight_outer[['DayOfWeek', 'Carrier', 'DepTimeBlk', 'DepDelay', 'DepDel15', 'ArrTimeBlk', 'ArrDel15',
'Ori-city', 'Ori-state', 'Ori-airport', 'Dest-city', 'Dest-state', 'Dest-airport', 'ArrDelay']]
flight_outer

Unnamed: 0,DayOfWeek,Carrier,DepTimeBlk,DepDelay,DepDel15,ArrTimeBlk,ArrDel15,Ori-city,Ori-state,Ori-airport,Dest-city,Dest-state,Dest-airport,ArrDelay
0,7.0,AS,1800-1859,25.0,1.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,14.0
1,4.0,AS,1800-1859,5.0,0.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,8.0
2,7.0,AS,1800-1859,-8.0,0.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,-13.0
3,4.0,AS,1800-1859,66.0,1.0,2100-2159,1.0,Adak Island,AK,Adak,Adak Island,AK,Adak,58.0
4,7.0,AS,1800-1859,22.0,1.0,2100-2159,1.0,Adak Island,AK,Adak,Adak Island,AK,Adak,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504478,6.0,OO,2100-2159,-9.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-22.0
504479,6.0,OO,2100-2159,-20.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-31.0
504480,5.0,OO,2100-2159,-8.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-15.0
504481,7.0,OO,2100-2159,-14.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-29.0


In [159]:
flight_outer.isna().sum()
flight_outer =flight_outer.dropna()
flight_outer

Unnamed: 0,DayOfWeek,Carrier,DepTimeBlk,DepDelay,DepDel15,ArrTimeBlk,ArrDel15,Ori-city,Ori-state,Ori-airport,Dest-city,Dest-state,Dest-airport,ArrDelay
0,7.0,AS,1800-1859,25.0,1.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,14.0
1,4.0,AS,1800-1859,5.0,0.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,8.0
2,7.0,AS,1800-1859,-8.0,0.0,2100-2159,0.0,Adak Island,AK,Adak,Adak Island,AK,Adak,-13.0
3,4.0,AS,1800-1859,66.0,1.0,2100-2159,1.0,Adak Island,AK,Adak,Adak Island,AK,Adak,58.0
4,7.0,AS,1800-1859,22.0,1.0,2100-2159,1.0,Adak Island,AK,Adak,Adak Island,AK,Adak,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504478,6.0,OO,2100-2159,-9.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-22.0
504479,6.0,OO,2100-2159,-20.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-31.0
504480,5.0,OO,2100-2159,-8.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-15.0
504481,7.0,OO,2100-2159,-14.0,0.0,2200-2259,0.0,Rock Springs,WY,Rock Springs Sweetwater County,Rock Springs,WY,Rock Springs Sweetwater County,-29.0


In [160]:
print(flight_outer['Carrier'].unique())
# print(flight_outer['Ori-city'].unique())
# print(flight_outer['Ori-state'].unique())
# print(flight_outer['Ori-airport'].unique())
# print(flight_outer['Dest-city'].unique())
# print(flight_outer['Dest-state'].unique())
# print(flight_outer['Dest-airport'].unique())

['AS' 'US' 'DL' 'F9' 'CO' 'WN' 'XE' 'YV' 'OO' 'EV' 'MQ' 'AA' 'FL' 'UA'
 'HA' 'B6']


In [161]:
flight_outer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 499680 entries, 0 to 504482
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   DayOfWeek     499680 non-null  float64
 1   Carrier       499680 non-null  object 
 2   DepTimeBlk    499680 non-null  object 
 3   DepDelay      499680 non-null  float64
 4   DepDel15      499680 non-null  float64
 5   ArrTimeBlk    499680 non-null  object 
 6   ArrDel15      499680 non-null  float64
 7   Ori-city      499680 non-null  object 
 8   Ori-state     499680 non-null  object 
 9   Ori-airport   499680 non-null  object 
 10  Dest-city     499680 non-null  object 
 11  Dest-state    499680 non-null  object 
 12  Dest-airport  499680 non-null  object 
 13  ArrDelay      499680 non-null  float64
dtypes: float64(5), object(9)
memory usage: 57.2+ MB


In [162]:
import warnings
warnings.filterwarnings(action='ignore')

In [163]:
for col in flight_outer.columns:
    if flight_outer[col].dtype == 'object':
        unique_value = np.sort(flight_outer[col].unique())
        map_data = { c:i for i, c in enumerate(unique_value)}
        flight_outer[col] = flight_outer[col].map(map_data)
        # flight_outer[col] = LabelEncoder().fit_transform(flight_outer[col])

In [164]:
flight_outer.head()

Unnamed: 0,DayOfWeek,Carrier,DepTimeBlk,DepDelay,DepDel15,ArrTimeBlk,ArrDel15,Ori-city,Ori-state,Ori-airport,Dest-city,Dest-state,Dest-airport,ArrDelay
0,7.0,1,13,25.0,1.0,16,0.0,1,0,2,1,0,2,14.0
1,4.0,1,13,5.0,0.0,16,0.0,1,0,2,1,0,2,8.0
2,7.0,1,13,-8.0,0.0,16,0.0,1,0,2,1,0,2,-13.0
3,4.0,1,13,66.0,1.0,16,1.0,1,0,2,1,0,2,58.0
4,7.0,1,13,22.0,1.0,16,1.0,1,0,2,1,0,2,21.0


In [165]:
flight_outer['ArrDel15'].value_counts()

ArrDel15
0.0    431461
1.0     68219
Name: count, dtype: int64

## 데이터 분할

In [166]:
flight_outer.isna().sum()
flight_outer= flight_outer.dropna()

In [167]:
from sklearn.model_selection import train_test_split

X = flight_outer.drop('ArrDel15',axis = 1)
y = flight_outer['ArrDel15']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y)

In [168]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)

In [176]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_hat = model.predict(X_test)
pred_prob_posit = model.predict_proba(X_test)[:1]
print(f'정확도:{accuracy_score(y_test,y_hat)}')
# print(f'AUC: { roc_auc_score(y_test,pred_prob_posit)}')

정확도:1.0


## Feature Importance

In [172]:
importance = {k:v for k,v in zip(flight_outer.columns,model.feature_importances_)}
df_importance = pd.DataFrame(pd.Series(importance),columns= ['importance']).sort_values('importance',ascending=False)
df_importance

Unnamed: 0,importance
Dest-airport,0.662271
DepDelay,0.170014
DepDel15,0.158456
ArrTimeBlk,0.002334
Carrier,0.001786
DepTimeBlk,0.001696
DayOfWeek,0.000935
Ori-airport,0.000451
Dest-state,0.000435
Ori-state,0.000427
