## Task1_0725. 타이타닉 생존자 예측 데이터 세트 train.csv에 대하여 다음 사항을 수행하세요.
- 일괄 전처리 사용자 함수 transform_features(df) 작성
- 분류 모델 학습 및 평가 사용자 함수 작성
- dt, lr, rf 모델링 및 평가(roc auc 포함)
  
==========================================================
- GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
  - Decision Tree, Random Forest, Logistic Regression 모델별 수행
  - 선택한 모델에 적합한 parameter grid 적용
  - cv=5 적용

In [1]:
import warnings
import pandas as pd

# FutureWarning 경고 메시지를 무시하도록 설정
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

file_path = r"D:\kdt_240424\workspace\M5_ML\data\train.csv"
df = pd.read_csv(file_path)

### 일괄 전처리 사용자 함수 transform_features(df) 작성


In [3]:
# 일괄 전처리 사용자 함수 transform_features(df) 작성
from sklearn.model_selection import train_test_split


def transform_features(df):
    df.drop(columns=["PassengerId", "Ticket", "Cabin"], inplace=True)

    def get_title(name):
        if "Mr." in name:
            return "Mr"
        elif "Miss." in name:
            return "Miss"
        elif "Mrs." in name:
            return "Mrs"
        else:
            return "Other"

    # 타이틀 열 추가
    df["Title"] = df["Name"].apply(get_title)
    df["Age"].fillna(df["Age"].mean(), inplace=True)
    df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

    bins = [0, 5, 12, 18, 27, 33, 60, 100]
    # [0,5,12,18,35,60,100]
    labels = ["Infant", "Child", "Teenager", "y y adult", " Young Adult", "Adul", "Senior"]
    df["Age_group"] = pd.cut(df["Age"], bins=bins, labels=labels)
    df.drop(columns=["Age"], inplace=True)

    fare_bins = [0, 30, 100, 600]
    fare_labels = ["Low", "Medium", "High"]
    df["Fare_group"] = pd.cut(df["Fare"], bins=fare_bins, labels=fare_labels)
    df.drop(columns=["Fare"], inplace=True)

    df["Family_size"] = df["SibSp"] + df["Parch"] + 1
    df["family_male"] = ((df["Family_size"] > 6) & (df["Sex"] == "male")).astype(int)
    df["mr_male"] = ((df["Title"] == "Mr")).astype(int)
    # df['mrs_female'] = ((df['Title'] == 'Miss')).astype(int)
    df["others"] = ((df["Title"] == "Other")).astype(int)

    df["family_female"] = ((df["Family_size"] > 3) & (df["Sex"] == "female")).astype(int)

    df.drop(columns=["SibSp", "Parch"], inplace=True)
    df.drop(columns=["Name"], inplace=True)
    df.drop(columns=["Title"], inplace=True)
    categorical_columns = ["Age_group", "Fare_group", "Sex", "Embarked"]
    for column in categorical_columns:
        df = pd.get_dummies(df, columns=[column])
    X = df.drop("Survived", axis=1)
    y = df["Survived"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = transform_features(df)

### 분류 모델 학습 및 평가 사용자 함수 작성


In [12]:
# 분류 모델 학습 및 평가 사용자 함수 작성

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# 사용자 평가 함수 정의
def get_clf_eval(y_test, pred, pred_proba=0):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print("오차 행렬")
    print(confusion)
    # ROC-AUC print 추가
    print(
        f"평가 함수 결과 :\n정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f}, 재현율 : {recall:.4f}, F1 : {f1:.4f}, ROC AUC : {roc_auc:.4f}"
    )


# 분류 모델 학습
# 결정트리

dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
pred_proba = dt_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[4328  211]
 [ 737  727]]
평가 함수 결과 :
정확도 : 0.8421, 정밀도 : 0.7751, 재현율 : 0.4966, F1 : 0.6053, ROC AUC : 0.8449


In [28]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=7)
knn_clf.fit(X_train, y_train)
pred = knn_clf.predict(X_test)
pred_proba = knn_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[136  21]
 [ 32  79]]
평가 함수 결과 :
정확도 : 0.8022, 정밀도 : 0.7900, 재현율 : 0.7117, F1 : 0.7488, ROC AUC : 0.8736


In [35]:
# SVM

from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', C=1.0, random_state=42)
svm_clf.fit(X_train, y_train)
pred = svm_clf.predict(X_test)
pred_proba = svm_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[139  18]
 [ 34  77]]
평가 함수 결과 :
정확도 : 0.8060, 정밀도 : 0.8105, 재현율 : 0.6937, F1 : 0.7476, ROC AUC : 0.8859


In [36]:
# random forest

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf_clf.fit(X_test, y_test)
pred = rf_clf.predict(X_test)
pred_proba = rf_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[148   9]
 [ 10 101]]
평가 함수 결과 :
정확도 : 0.9291, 정밀도 : 0.9182, 재현율 : 0.9099, F1 : 0.9140, ROC AUC : 0.9678


In [37]:
# logistic regression

from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
lr_clf.fit(X_test, y_test)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[136  21]
 [ 26  85]]
평가 함수 결과 :
정확도 : 0.8246, 정밀도 : 0.8019, 재현율 : 0.7658, F1 : 0.7834, ROC AUC : 0.8914


## GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.

### Decision Tree


In [7]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grids = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7],
    "min_samples_split": [30, 50, 70],
    "min_samples_leaf": [3, 5, 10],
    "max_features": [3, 5, 10],
    "max_leaf_nodes": [3, 5, 10],
}

dt_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# 최적 모델로 예측 수행
best_dt = grid_search.best_estimator_
pred = best_dt.predict(X_test)
pred_proba = best_dt.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
best parameters found : {'criterion': 'entropy', 'max_depth': 5, 'max_features': 10, 'max_leaf_nodes': 10, 'min_samples_leaf': 10, 'min_samples_split': 30}
오차 행렬
[[144  13]
 [ 39  72]]
평가 함수 결과 :
정확도 : 0.8060, 정밀도 : 0.8471, 재현율 : 0.6486, F1 : 0.7347, ROC AUC : 0.8888


### Random Forest

In [11]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grids = {
    "n_estimators": [300, 400, 500,],
    "max_depth": [3, 5, 7 ],
    "min_samples_split": [2,3,5  ],
    "min_samples_leaf": [1, 2, 3 ],
    "max_features": [10, 15, 20, 'sqrt', 'log2'],
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_test, y_test)
print(f"best parameters found : {grid_search.best_params_}")

# 최적 모델로 예측 수행
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
best parameters found : {'max_depth': 7, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
오차 행렬
[[144  13]
 [ 10 101]]
평가 함수 결과 :
정확도 : 0.9142, 정밀도 : 0.8860, 재현율 : 0.9099, F1 : 0.8978, ROC AUC : 0.9624


In [26]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grids = {
    "n_estimators": [
        50,
        100,
        200,
    ],
    "max_depth": [3, 5, 7,9],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [ 10,15, 20, "sqrt", "log2"],
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_test, y_test)
print(f"best parameters found : {grid_search.best_params_}")

# 최적 모델로 예측 수행
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
best parameters found : {'max_depth': 7, 'max_features': 15, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
오차 행렬
[[144  13]
 [ 12  99]]
평가 함수 결과 :
정확도 : 0.9067, 정밀도 : 0.8839, 재현율 : 0.8919, F1 : 0.8879, ROC AUC : 0.9541


### Logistic Regression

In [23]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grids = {
    "max_iter": [3,5, 10,20, 30, 50, 100, 300, 500],
    "solver": ["lbfgs", "liblinear", "newton-cg"],
    "C": [3,5,7,9],
    "penalty": ["l1", "l2", "elasticnet"],
}

lr_clr = LogisticRegression(random_state=42)
grid_search = GridSearchCV(lr_clr, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_test, y_test)
print(f"best parameters found : {grid_search.best_params_}")

# 최적 모델로 예측 수행
best_lr = grid_search.best_estimator_
pred = best_lr.predict(X_test)
pred_proba = best_lr.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
best parameters found : {'C': 5, 'max_iter': 5, 'penalty': 'l2', 'solver': 'liblinear'}
오차 행렬
[[137  20]
 [ 25  86]]
평가 함수 결과 :
정확도 : 0.8321, 정밀도 : 0.8113, 재현율 : 0.7748, F1 : 0.7926, ROC AUC : 0.8946


## ex


In [None]:
from sklearn.preprocessing import LabelEncoder


# Null 처리 함수
def fillna(df):
    df["Age"].fillna(df["Age"].mean(), inplace=True)
    df["Cabin"].fillna("N", inplace=True)
    df["Embarked"].fillna("N", inplace=True)
    df["Fare"].fillna(0, inplace=True)
    return df


# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    df.drop(["PassengerId", "Name", "Ticket"], axis=1, inplace=True)
    return df


# 레이블 인코딩 수행.
def format_features(df):
    df["Cabin"] = df["Cabin"].str[:1]
    features = ["Cabin", "Sex", "Embarked"]
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df


# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [None]:
# 원본 데이터를 재로딩 하고, feature데이터 셋과 Label 데이터 셋 추출.

y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop("Survived", axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [None]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11, stratify=y_titanic_df
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 결정트리, Random Forest, 로지스틱 회귀를 위한 사이킷런 Classifier 클래스 생성
dt_clf = DecisionTreeClassifier(random_state=10)
rf_clf = RandomForestClassifier(random_state=10)
lr_clf = LogisticRegression(max_iter=2000, random_state=10)
print("dt_clf 학습")
print("=" * 12)
train_and_evaluate(dt_clf, X_train, X_test, y_train, y_test)
print("rf_clf 학습")
print("=" * 12)
train_and_evaluate(rf_clf, X_train, X_test, y_train, y_test)
print("lr_clf 학습")
print("=" * 12)
train_and_evaluate(lr_clf, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "max_depth": [2, 3, 5, 10, 12],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 5, 8, 10],
}

grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring="accuracy", cv=5)
grid_dclf.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터 :", grid_dclf.best_params_)
print("GridSearchCV 최고 정확도: {0:.4f}".format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

train_and_evaluate(best_dclf, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 1, 10, 50, 100]}
grid_lrclf = GridSearchCV(lr_clf, param_grid=param_grid, cv=5, verbose=0)
grid_lrclf.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터 :", grid_lrclf.best_params_)
print("GridSearchCV 최고 정확도: {0:.4f}".format(grid_lrclf.best_score_))
best_lrclf = grid_lrclf.best_estimator_

train_and_evaluate(best_lrclf, X_train, X_test, y_train, y_test)

Task3_0725. 데이터셋 개선, 오늘 배운 모델 적용, 탐색적분석을 통한 파생변수 적용하고 설명

In [15]:
# 데이터셋 개선
import pandas as pd

data = pd.read_csv(r"D:\kdt_240424\workspace\M5_ML\data\adult_incomes.csv")
data.dropna(inplace=True)
# 이상치 제거 data['capital-gain'] max값 제거
data = data[data["capital-gain"] < 99990]

# 파생변수 작성
data["capital_diff"] = data["capital-gain"] - data["capital-loss"]

In [16]:
ages = data.age.values
category = ["teenager", "young adult", "adult", "elderly"]
data["age_cat"] = pd.cut(ages, bins=[17, 28, 37, 47, 90], labels=category)

In [17]:
# 범주형 변수 인코딩
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "income",
    "age_cat",
]

data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [18]:
# 변수 선택및 독립변수 , 종속변수 분리
X = data.drop("income_>50K", axis=1)
y = data["income_>50K"]

In [19]:
# 데이터 표준화
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [20]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
print(X_test)

이전 결과가 가장 좋았던 랜덤 포레스트를 기준으로 하이퍼 파라미터 튜닝 진행

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
pred_proba = rf_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[4218  321]
 [ 555  909]]
평가 함수 결과 :
정확도 : 0.8541, 정밀도 : 0.7390, 재현율 : 0.6209, F1 : 0.6748, ROC AUC : 0.9031


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grids = {
    "n_estimators": [
        300,
        400,
        500,
    ],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [10, 15, 20, "sqrt", "log2"],
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# 최적 모델로 예측 수행
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)