In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV


def load_data():
    """Load the seaborn "tips" dataset with ``sex`` encoded as a binary label.

    Returns:
        pandas.DataFrame: the tips table in which ``sex`` is 1 where the
        original value equals "Male" and 0 otherwise.
    """
    df = sns.load_dataset("tips")
    # Vectorized comparison instead of a per-row lambda: ``apply`` on a
    # categorical column can hand back a categorical Series, whereas this
    # always yields a plain integer label column.
    df["sex"] = (df["sex"] == "Male").astype(int)
    return df


tips = load_data()

# Data processing: every column except the target is a feature
X = tips.drop("sex", axis=1)
y = tips["sex"]

# Split the feature columns by dtype so each group gets its own preprocessing
categorical_features = X.select_dtypes(include=["object", "category"]).columns  # categorical columns
# "number" matches every numeric dtype (int32/float32 included), not only int64/float64,
# so no numeric column is silently left out of the transformer.
numerical_features = X.select_dtypes(include="number").columns  # numeric columns

print(categorical_features, numerical_features)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        # handle_unknown='ignore': categories not seen during fit do not raise at transform time
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

Index(['smoker', 'day', 'time'], dtype='object') Index(['total_bill', 'tip', 'size'], dtype='object')


In [2]:
# Build the pipeline: preprocessing followed by a decision tree classifier.
# random_state is pinned because the tree permutes features while searching for
# splits, so an unseeded tree can differ between otherwise identical runs.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

In [3]:
# Split the data: hold out 20% of the rows for testing (fixed seed for a repeatable split)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(*[part.shape for part in split_parts])

(195, 6) (49, 6) (195,) (49,)


In [4]:
# Randomized search configuration
# Candidate values for each hyper-parameter of the pipeline's "classifier" step
depth_options = np.concatenate((np.arange(3, 11), [None]))  # depths 3..10, plus None
param_dist = {
    "classifier__max_depth": depth_options,
    "classifier__min_samples_split": np.arange(2, 11),
    "classifier__min_samples_leaf": np.arange(1, 5),
}

search_settings = dict(
    n_iter=10,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, **search_settings)

# Fit the search on the training split
random_search.fit(X_train, y_train)

print("가장 좋은 파라미터 : ", random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
가장 좋은 파라미터 :  {'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': 3}


In [5]:
# Predictions and evaluation metrics on the held-out test split
best_estimator = random_search.best_estimator_
y_pred = best_estimator.predict(X_test)

# Compute the four scores in a fixed order: accuracy, precision, recall, F1
accuracy, precision, recall, f1 = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]

for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1", f1),
):
    print(f"{label} : ", value)

Accuracy :  0.673469387755102
Precision :  0.6944444444444444
Recall :  0.8333333333333334
F1 :  0.7575757575757577


In [6]:
# Tabulate the per-candidate cross-validation records of the search for inspection
cv_records = random_search.cv_results_
results = pd.DataFrame(data=cv_records)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__min_samples_split,param_classifier__min_samples_leaf,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013201,0.001327,0.0068,0.0004,8,3,6,"{'classifier__min_samples_split': 8, 'classifi...",0.538462,0.538462,0.435897,0.692308,0.435897,0.528205,0.094002,6
1,0.0138,0.00075,0.006001,0.000632,2,1,6,"{'classifier__min_samples_split': 2, 'classifi...",0.538462,0.615385,0.487179,0.692308,0.487179,0.564103,0.079446,2
2,0.011203,0.001469,0.004599,0.0008,4,4,6,"{'classifier__min_samples_split': 4, 'classifi...",0.564103,0.487179,0.435897,0.615385,0.461538,0.512821,0.066864,9
3,0.011601,0.001855,0.007,0.001673,2,2,3,"{'classifier__min_samples_split': 2, 'classifi...",0.487179,0.666667,0.564103,0.641026,0.564103,0.584615,0.063639,1
4,0.0122,0.00172,0.006601,0.000488,2,1,8,"{'classifier__min_samples_split': 2, 'classifi...",0.512821,0.538462,0.435897,0.666667,0.512821,0.533333,0.075019,4
5,0.011001,0.000633,0.006,2e-06,2,3,6,"{'classifier__min_samples_split': 2, 'classifi...",0.538462,0.538462,0.435897,0.717949,0.461538,0.538462,0.098643,3
6,0.011803,0.000981,0.006197,0.000748,5,1,9,"{'classifier__min_samples_split': 5, 'classifi...",0.564103,0.538462,0.461538,0.692308,0.410256,0.533333,0.096487,4
7,0.0118,0.000401,0.006198,0.000401,9,2,8,"{'classifier__min_samples_split': 9, 'classifi...",0.564103,0.538462,0.384615,0.589744,0.564103,0.528205,0.073604,6
8,0.011798,0.001325,0.006201,0.000749,7,2,10,"{'classifier__min_samples_split': 7, 'classifi...",0.589744,0.564103,0.461538,0.564103,0.461538,0.528205,0.055232,6
9,0.0108,0.000749,0.0058,0.0004,2,1,7,"{'classifier__min_samples_split': 2, 'classifi...",0.538462,0.461538,0.538462,0.615385,0.384615,0.507692,0.078446,10
