# MLP 실습 : 의사결정나무 (분류)

## data/library 불러오기

In [1]:
import warnings

# Silence every warning category before anything else is imported.
warnings.filterwarnings("ignore")

import pandas as pd

# Read the dataset from the current working directory.
data = pd.read_csv("breast-cancer-wisconsin.csv")

# Features: the columns at positions 1 through 9 (position 0 is left out).
x = data.loc[:, data.columns[1:10]]
# Target: kept as a one-column DataFrame rather than a Series.
y = data.loc[:, ['Class']]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Stratified split with a fixed seed; the test size is left at the library default.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42
)

# Fit the scaler on the training features only, then apply it to both splits.
scaler = MinMaxScaler().fit(x_train)
x_scaled_train, x_scaled_test = (
    scaler.transform(split) for split in (x_train, x_test)
)

## model 학습하기

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the fitted tree (and therefore the
# scores shown below) can differ from one run to the next. 42 matches the
# seed already used for train_test_split.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_scaled_train, y_train)

# Training-split predictions are kept for the confusion matrix / report cells.
pred_train = model.predict(x_scaled_train)

# Last expression of the cell: accuracy on the training split.
model.score(x_scaled_train, y_train)

1.0

## 결과 확인하기

In [7]:
# Predict the held-out rows, then display the model's score on the test split.
pred_test = model.predict(X=x_scaled_test)
model.score(X=x_scaled_test, y=y_test)

0.9532163742690059

In [8]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the training split (true labels vs. predictions).
confusion_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
print(confusion_train)

[[333   0]
 [  0 179]]


In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test split (true labels vs. predictions).
confusion_test = confusion_matrix(y_true=y_test, y_pred=pred_test)
print(confusion_test)

[[106   5]
 [  3  57]]


In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 text report for the training split.
cfreport_train = classification_report(y_true=y_train, y_pred=pred_train)
print(cfreport_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [12]:
# Per-class precision / recall / f1 text report for the test split.
cfreport_test = classification_report(y_true=y_test, y_pred=pred_test)
print(cfreport_test)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.92      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



## GridSearch

In [13]:
# Search space: even depths 2..18 crossed with odd leaf sizes 1..49.
param_grid = dict(
    max_depth=range(2, 20, 2),
    min_samples_leaf=range(1, 50, 2),
)

In [14]:
from sklearn.model_selection import GridSearchCV

# Seed the estimator: an unseeded tree can be fitted differently on each run,
# which makes best_params_ / best_score_ drift between executions of this cell.
# 42 matches the seed already used for train_test_split.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
)
# Search for the best parameters
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [15]:
# Summarise the search: winning parameters, their mean CV score, and the
# refitted best estimator's score on the held-out test split.
print(
    "Best Parameter : {}".format(grid_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(grid_search.best_score_),
    'Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test, y_test)),
    sep="\n",
)

Best Parameter : {'max_depth': 10, 'min_samples_leaf': 1}
Best Cross-validity Score : 0.9687
Test set Score : 0.9474


## RandomSearch

In [26]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# randint excludes `high`: max_depth is drawn from 1..19 and
# min_samples_leaf from 1..49.
param_distribs = {
    "max_depth": randint(low=1, high=20),
    "min_samples_leaf": randint(low=1, high=50),
}

# Both sources of randomness are seeded so the cell is reproducible:
#   * the estimator's random_state fixes how each tree is fitted;
#   * the search's random_state fixes which parameter candidates are sampled.
# 42 matches the seed already used for train_test_split.
# NOTE(review): n_iter=1000 is larger than the 19 * 49 = 931 distinct
# combinations, so some candidates are necessarily repeated; a smaller
# budget would give the same coverage for less compute.
random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=param_distribs,
    n_iter=1000,
    cv=5,
    random_state=42,
)
# Search for the best parameters
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=1000,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000293641FF490>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000293641FFA60>})

In [27]:
# Summarise the randomized search: winning parameters, their mean CV score,
# and the refitted best estimator's score on the held-out test split.
print(
    "Best Parameter : {}".format(random_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(random_search.best_score_),
    'Test set Score : {:.4f}'.format(random_search.score(x_scaled_test, y_test)),
    sep="\n",
)

Best Parameter : {'max_depth': 6, 'min_samples_leaf': 1}
Best Cross-validity Score : 0.9648
Test set Score : 0.9415


# MLP 실습 : 의사결정나무 (회귀)

## data/library 불러오기

In [33]:
# Load the regression dataset; this rebinds data, x, y and the split variables.
data = pd.read_csv("house_price.csv")

# Features: the columns at positions 1 through 4 (position 0 is left out).
x = data.loc[:, data.columns[1:5]]
# Target: kept as a one-column DataFrame rather than a Series.
y = data.loc[:, ['house_value']]

# Seeded, non-stratified split; the test size is left at the library default.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42
)

In [34]:
# Fit a fresh scaler on the training features only, then apply it to both splits.
scaler = MinMaxScaler().fit(x_train)
x_scaled_train, x_scaled_test = (
    scaler.transform(split) for split in (x_train, x_test)
)

## model 학습하기

In [40]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree: without random_state the fitted tree (and therefore the
# scores and RMSE shown below) can differ from one run to the next.
# 42 matches the seed already used for train_test_split.
model = DecisionTreeRegressor(random_state=42)

# Last expression of the cell: fit() returns the estimator, whose repr is displayed.
model.fit(x_scaled_train, y_train)

DecisionTreeRegressor()

In [41]:
# Predict the training rows, then display the model's score on the training split.
pred_train = model.predict(X=x_scaled_train)
model.score(X=x_scaled_train, y=y_train)

1.0

In [43]:
# Predict the held-out rows, then display the model's score on the test split.
pred_test = model.predict(X=x_scaled_test)
model.score(X=x_scaled_test, y=y_test)

0.21398771529907878

## RMSE 확인하기

In [44]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Mean squared error on each split (true values first, predictions second).
MSE_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
MSE_test = mean_squared_error(y_true=y_test, y_pred=pred_test)

# RMSE is the square root of the MSE; training value first, test value second.
print(np.sqrt(MSE_train), np.sqrt(MSE_test), sep="\n")

0.0
84757.30531867911


## GridSearch

In [45]:
# Search space for the regressor: even depths 2..18, odd leaf sizes 1..49.
param_grid = dict(
    max_depth=range(2, 20, 2),
    min_samples_leaf=range(1, 50, 2),
)

In [47]:
from sklearn.model_selection import GridSearchCV

# Seed the estimator: an unseeded tree can be fitted differently on each run,
# which makes best_params_ / best_score_ drift between executions of this cell.
# 42 matches the seed already used for train_test_split.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
)
# Search for the best parameters
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [48]:
# Summarise the search: winning parameters, their mean CV score, and the
# refitted best estimator's score on the held-out test split.
print(
    "Best Parameter : {}".format(grid_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(grid_search.best_score_),
    'Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test, y_test)),
    sep="\n",
)

Best Parameter : {'max_depth': 8, 'min_samples_leaf': 49}
Best Cross-validity Score : 0.5592
Test set Score : 0.5770


## RandomSearch

In [49]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# randint excludes `high`: max_depth is drawn from 1..19 and
# min_samples_leaf from 1..49.
param_distribs = {
    "max_depth": randint(low=1, high=20),
    "min_samples_leaf": randint(low=1, high=50),
}

# Both sources of randomness are seeded so the cell is reproducible:
#   * the estimator's random_state fixes how each tree is fitted;
#   * the search's random_state fixes which parameter candidates are sampled.
# 42 matches the seed already used for train_test_split.
# NOTE(review): n_iter=1000 is larger than the 19 * 49 = 931 distinct
# combinations, so some candidates are necessarily repeated; a smaller
# budget would give the same coverage for less compute.
random_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_distributions=param_distribs,
    n_iter=1000,
    cv=5,
    random_state=42,
)
# Search for the best parameters
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=1000,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000293000FA4C0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000029363C82460>})

In [50]:
# Summarise the randomized search: winning parameters, their mean CV score,
# and the refitted best estimator's score on the held-out test split.
print(
    "Best Parameter : {}".format(random_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(random_search.best_score_),
    'Test set Score : {:.4f}'.format(random_search.score(x_scaled_test, y_test)),
    sep="\n",
)

Best Parameter : {'max_depth': 8, 'min_samples_leaf': 49}
Best Cross-validity Score : 0.5592
Test set Score : 0.5770
