In [1]:
# Environment check: print the installed LightGBM version so the recorded
# results can be tied to a specific release.
import lightgbm

print(lightgbm.__version__)

4.3.0


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Labelled training data. The target column is pulled out here, while the
# raw frame still contains 'Survived'.
titanic_df = pd.read_csv('./titanic/titanic_train.csv')
y = titanic_df["Survived"]
feature_name = titanic_df.columns

# Rows used by the final prediction cell, plus the accompanying submission
# file (presumably the submission template; it is not used in the visible cells).
X_predict = pd.read_csv('./titanic/test.csv')
gender_submission = pd.read_csv('./titanic/gender_submission.csv')

# The preview must be the last expression: Jupyter renders only the final
# expression of a cell, so a bare `.head()` in the middle is silently discarded.
titanic_df.head(3)

In [2]:
def fillna(df):
    """Fill the missing values in the Cabin, Embarked and Age columns.

    Missing Cabin and Embarked entries become the placeholder category 'N'.
    Missing Age entries become the mean of the ages observed in ``df`` itself.

    Args:
        df: DataFrame containing 'Cabin', 'Embarked' and 'Age' columns.

    Returns:
        The same DataFrame object that was passed in; the three columns are
        overwritten in place.
    """
    for column in ('Cabin', 'Embarked'):
        df[column] = df[column].fillna('N')

    # Series.mean() skips NaN, so this is the mean of the known ages. If the
    # column has no known ages the mean is NaN and the column is left as is.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    return df


def drop_features(df):
    """Return a copy of ``df`` without the identifier and target columns.

    'PassengerId', 'Name' and 'Ticket' are removed first and the first three
    rows of that intermediate frame are printed; 'Survived' is then removed
    from the returned frame.

    Columns that are already absent are skipped, so the function can be
    applied to a frame without a 'Survived' column and can be re-run on its
    own output. The caller's DataFrame is not modified.

    Args:
        df: DataFrame to trim.

    Returns:
        A new DataFrame holding only the remaining feature columns.
    """
    trimmed = df.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')
    print(trimmed.head(3), "\n\n")

    return trimmed.drop(columns='Survived', errors='ignore')


# Label encoding
def encode_features_label(df):
    """Replace the Cabin, Sex and Embarked columns with integer label codes.

    A single LabelEncoder is refitted on each column in turn, using that
    column's own values, and the encoded result is written back into ``df``.

    Args:
        df: DataFrame containing 'Cabin', 'Sex' and 'Embarked' columns.

    Returns:
        The same DataFrame object, with the three columns overwritten.
    """
    encoder = LabelEncoder()
    for column in ('Cabin', 'Sex', 'Embarked'):
        values = df[column]
        # fit() returns the encoder itself, so the two steps can be chained.
        df[column] = encoder.fit(values).transform(values)
    return df



# Standardization => requires 2-D input
def stscaler(df):
    """Standardise the Age and Fare columns of ``df``.

    Each column is scaled on its own with a StandardScaler fitted to that
    column's values in ``df``, and the scaled values are written back.

    Args:
        df: DataFrame containing 'Age' and 'Fare' columns.

    Returns:
        The same DataFrame object, with the two columns overwritten.
    """
    for column in ('Age', 'Fare'):
        # Double brackets keep the selection 2-D (n_rows x 1), which is the
        # shape the scaler works with.
        scaled = StandardScaler().fit_transform(df[[column]])
        df[[column]] = scaled
    return df
    

# # One-hot encoding
# # One-hot encoded data is mostly zeros, so storing it as a sparse matrix can greatly reduce memory usage.
# # ColumnTransformer passes the selected columns through OneHotEncoder and returns the result as a sparse matrix.
# def encode_features_onehot(df):
#     ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 6, 7])], remainder='passthrough')
#     df = ct.fit_transform(df)
#     # df = pd.DataFrame(df.toarray()) # 희소 행렬을 Dense 형태로 변환 후 DataFrame으로 변환
#     return df


# Preprocessing, innermost call first: impute -> drop columns -> scale -> label-encode.
# Disabled alternative to the label-encoding step:
#   titanic_df = encode_features_onehot(titanic_df)
# NOTE(review): scaling and encoding are fitted on every row before the split
# below, so the held-out rows influence the fitted statistics.
titanic_df = encode_features_label(
    stscaler(
        drop_features(
            fillna(titanic_df)
        )
    )
)
print(titanic_df.head(3))


# Hold out 20% of all rows as the test set; the other 80% are for training.
X_train, X_test, y_train, y_test = train_test_split(
    titanic_df, y, random_state=156, test_size=0.2
)

# Split X_train / y_train again: 90% for fitting, 10% for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, random_state=156, test_size=0.1
)

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250     N        S 


   Pclass  Sex       Age  SibSp  Parch      Fare  Cabin  Embarked
0       3    1 -0.592481      1      0 -0.502445    146         3
1       1    0  0.638789      1      0  0.786845     81         0
2       3    0 -0.284663      0      0 -0.488854    146         3


In [3]:
from lightgbm import early_stopping
from lightgbm import LGBMClassifier

In [4]:
# Allow up to 400 boosting rounds; the early-stopping callback below can end
# training sooner.
lgbm_wrapper = LGBMClassifier(learning_rate=0.05, n_estimators=400)

# Like XGBoost, LightGBM supports early stopping. Both the fitting split and
# the validation split are passed as evaluation sets.
evals = [(X_tr, y_tr), (X_val, y_val)]
lgbm_wrapper.fit(
    X_tr,
    y_tr,
    eval_set=evals,
    eval_metric="logloss",
    callbacks=[early_stopping(stopping_rounds=50)],
)

# Positive-class probabilities (column 1) and hard labels for the hold-out set.
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]
preds = lgbm_wrapper.predict(X_test)

[LightGBM] [Info] Number of positive: 246, number of negative: 394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384375 -> initscore=-0.471019
[LightGBM] [Info] Start training from score -0.471019
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[89]	training's binary_logloss: 0.252519	valid_1's binary_logloss: 0.343744


In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Prints the confusion matrix followed by one line with accuracy, precision,
    recall and F1, plus ROC-AUC when scores are supplied.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required, despite the ``None`` default
            kept for signature compatibility.
        pred_proba: Positive-class scores used for ROC-AUC. When ``None`` the
            AUC figure is left out of the summary line.

    Raises:
        ValueError: If ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError("get_clf_eval requires `pred` (predicted class labels)")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores rather than hard labels, so it is optional.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)

    # Built from separate pieces rather than a backslash-continued literal,
    # which would embed the source indentation in the printed line.
    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [7]:
# Score the model on the hold-out test split (y_test) using the hard labels
# and positive-class probabilities computed after training.
get_clf_eval(y_test, preds, pred_proba)

오차 행렬
[[96 11]
 [21 51]]
정확도: 0.8212, 정밀도: 0.8226, 재현율: 0.7083,    F1: 0.7612, AUC:0.8760


In [None]:
# NOTE(review): in the visible cells X_predict is still the raw frame read from
# './titanic/test.csv'. It has not been through the imputation, column-dropping,
# scaling or label-encoding steps applied to the training frame, so its columns
# do not match the features lgbm_wrapper was fitted on.
# TODO: preprocess X_predict with transforms fitted on the training data before
# predicting.
result = lgbm_wrapper.predict(X_predict)