# 회귀나무 예제 (CAT 점수 데이터)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data set; the first row of the CSV holds the column names.
df = pd.read_csv("copdcat.csv", header=0)

# Response: the "CATScore" column.  Predictors: every remaining column.
y = df["CATScore"]
X = df.drop(columns=["CATScore"])

### max_depth=2 트리

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

# Regression tree whose growth is capped at two levels below the root.
r_tree = DecisionTreeRegressor(max_depth=2, random_state=0)
r_tree.fit(X, y)

plt.figure(figsize=(10, 7))
# Pass the feature names as a plain list: some scikit-learn releases
# (1.2.x) validate `feature_names` as a list and reject a pandas Index.
plot_tree(r_tree, feature_names=list(X.columns), filled=True, fontsize=12)
plt.show()

### min_samples_split=1000 트리

In [None]:
# Regression tree in which a node is only split when it holds at least
# 1000 training samples.
r_tree = DecisionTreeRegressor(min_samples_split=1000, random_state=0)
r_tree.fit(X, y)

plt.figure(figsize=(15, 10))
# Pass the feature names as a plain list: some scikit-learn releases
# (1.2.x) validate `feature_names` as a list and reject a pandas Index.
plot_tree(r_tree, feature_names=list(X.columns), filled=True, fontsize=12)
plt.show()

### min_impurity_decrease=0.1 트리

In [None]:
# Regression tree in which a split is made only if it lowers the
# (weighted) impurity by at least 0.1.
r_tree = DecisionTreeRegressor(min_impurity_decrease=0.1, random_state=0)
r_tree.fit(X, y)

plt.figure(figsize=(15, 10))
# Pass the feature names as a plain list: some scikit-learn releases
# (1.2.x) validate `feature_names` as a list and reject a pandas Index.
plot_tree(r_tree, feature_names=list(X.columns), filled=True, fontsize=12)
plt.show()

### Alpha=0.1 트리

In [None]:
# Regression tree pruned by minimal cost-complexity pruning with alpha = 0.1.
r_tree = DecisionTreeRegressor(ccp_alpha=0.1, random_state=0)
r_tree.fit(X, y)

plt.figure(figsize=(15, 10))
# Pass the feature names as a plain list: some scikit-learn releases
# (1.2.x) validate `feature_names` as a list and reject a pandas Index.
plot_tree(r_tree, feature_names=list(X.columns), filled=True, fontsize=12)
plt.show()

### Pruning

In [None]:
from sklearn.model_selection import GridSearchCV

np.random.seed(0)

# 10-fold cross-validated search over the pruning strength ccp_alpha,
# trying the candidates 0.00, 0.01, ... produced by np.arange (stop value
# 0.2 is excluded).
g_cv = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid={"ccp_alpha": np.arange(0.00, 0.2, 0.01)},
    cv=10,
)
g_cv.fit(X, y)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print("selected:", g_cv.best_params_)
print("score   :", g_cv.best_score_)

### Alpha=0.01 트리

In [None]:
# Regression tree pruned by minimal cost-complexity pruning with alpha = 0.01.
r_tree = DecisionTreeRegressor(ccp_alpha=0.01, random_state=0)
r_tree.fit(X, y)

# Larger canvas and smaller font than the earlier plots.
plt.figure(figsize=(20, 15))
# Pass the feature names as a plain list: some scikit-learn releases
# (1.2.x) validate `feature_names` as a list and reject a pandas Index.
plot_tree(r_tree, feature_names=list(X.columns), filled=True, fontsize=6)
plt.show()

# 다른 회귀분석 방법과의 비교

## 데이터 구분: 학습 데이터와 검증 데이터

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for evaluation; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

### 회귀나무 하이퍼파라미터 튜닝

In [None]:
np.random.seed(0)

# Joint 10-fold cross-validated search over three tree-size controls,
# fitted on the training half only.  n_jobs=-1 uses all available cores.
g_cv = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid={
        "ccp_alpha": np.arange(0.000, 0.01, 0.001),
        "min_impurity_decrease": np.arange(0, 0.01, 0.001),
        "min_samples_split": np.arange(2, 402, 100),
    },
    cv=10,
    n_jobs=-1,
)
g_cv.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print("selected:", g_cv.best_params_)
print("score   :", g_cv.best_score_)

### Alpha=0.0, min_impurity_decrease=0.007, min_samples_split=2 트리

In [None]:
# "Pruned" tree: hyperparameters are hard-coded here
# (presumably the values reported by the grid search above -- verify
# against its printed output if the data or grid changes).
r1_tree = DecisionTreeRegressor(
    ccp_alpha=0.0,
    min_impurity_decrease=0.007,
    min_samples_split=2,
    random_state=0,
)
r1_tree.fit(X_train, y_train)

### Default 트리

In [None]:
# Baseline tree: library defaults for every hyperparameter except the seed.
r2_tree = DecisionTreeRegressor(random_state=0)
r2_tree.fit(X_train, y_train)

### 선형 회귀분석

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with an intercept term, fitted on the
# unscaled training predictors.
r_linear = LinearRegression(fit_intercept=True)
r_linear.fit(X_train, y_train)

### 신경망분석

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Standardise the predictors.  The scaler is fitted on the training half
# only and then applied unchanged to the test half, so no test-set
# statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One hidden layer with 5 units.  Written as the 1-tuple `(5,)`: the
# original `(5)` is just the integer 5, not a tuple, and only works because
# scikit-learn happens to accept a bare int here.
r_nn = MLPRegressor(hidden_layer_sizes=(5,), random_state=0, max_iter=1000)
r_nn.fit(X_train_scaled, y_train)

### SVR

In [None]:
from sklearn.svm import SVR, SVC

# Support-vector regression with an RBF kernel, trained on the standardised
# predictors produced by the scaler in the neural-network cell.
r_svr = SVR(kernel="rbf")
r_svr.fit(X_train_scaled, y_train)

### 랜덤포레스트

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with library-default settings and a fixed seed, fitted on
# the unscaled training predictors.
r_rf = RandomForestRegressor(random_state=0)
r_rf.fit(X_train, y_train)

### 평균절대오차비율 (MAPE) 비교

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

# Test-set MAPE for every fitted model.  Each entry pairs a display label
# with the model and the feature matrix it must be scored on: the neural
# network and the SVR were trained on standardised inputs, the rest on the
# raw predictors.
# NOTE(review): MAPE divides by |y_true|; if y_test contains zeros the
# reported values become extremely large -- confirm the target has no zeros.
mape_models = [
    ('Pruned Tree', r1_tree, X_test),
    ('Default Tree', r2_tree, X_test),
    ('Linear Regression', r_linear, X_test),
    ('Neural Network', r_nn, X_test_scaled),  # label typo "Nerwork" fixed
    ('SV Regression', r_svr, X_test_scaled),
    ('Random Forest', r_rf, X_test),
]
for mape_label, mape_model, mape_inputs in mape_models:
    # Pad labels to a common width of 17 so the colons line up.
    print(f'{mape_label:<17}:',
          mean_absolute_percentage_error(y_test, mape_model.predict(mape_inputs)))