<a href="https://colab.research.google.com/github/hyunicecream/ML-DL/blob/main/LGB_(for%20Classification)(titanic_data).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:

# Read the Titanic passenger table from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.head(n=5)

/content/drive/MyDrive/Private_room/data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Count the missing values in every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Fill missing ages with the mean age.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form acts on
# an intermediate object and, under pandas Copy-on-Write, does not update df.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Pull the honorific (Mr, Miss, Mrs, ...) out of each passenger name: the run
# of letters that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot rather than being parsed as an invalid Python string escape.
name = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
name.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Mme           1
Don           1
Lady          1
Capt          1
Jonkheer      1
Sir           1
Ms            1
Countess      1
Name: Name, dtype: int64

In [None]:
# Keep only the four most frequent honorifics; every other value is
# collapsed into 'Other'.
# The resulting 'Title' column is what the model uses in place of 'Name'.
title = ['Mr', 'Miss', 'Mrs', 'Master']
is_common_title = name.isin(title)
df['Title'] = name.where(is_common_title, 'Other')

In [None]:
# Label-encode the categorical columns, keeping one fitted encoder per column
# in `le` so the integer codes can be mapped back to labels later.
#
# 'Fare' is intentionally NOT encoded: it is a continuous numeric feature, and
# casting it to str before label-encoding would assign codes in lexicographic
# order (e.g. '10.5' sorts before '7.25'), destroying the numeric ordering the
# tree splits rely on.
le = {}
for feat in ['Cabin', 'Sex', 'Embarked', 'Title']:
    le[feat] = LabelEncoder()
    # astype(str) hands the encoder a uniform string column.
    # NOTE(review): missing Cabin/Embarked values appear to become their own
    # 'nan' category here - confirm on the pandas version in use.
    df[feat] = le[feat].fit_transform(df[feat].astype(str))
print(title)
# Position i in classes_ is the label that was encoded as integer i.
print(list(le['Title'].classes_))
print(df['Title'])

['Mr', 'Miss', 'Mrs', 'Master']
0      2
1      3
2      1
3      3
4      2
      ..
886    4
887    1
888    1
889    2
890    2
Name: Title, Length: 891, dtype: int64


In [None]:
# Remove the columns that are not used as model features, then preview the
# remaining table.
unused_columns = ['PassengerId', 'Name', 'Ticket']
df.drop(columns=unused_columns, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,1,22.0,1,0,174,147,2,2
1,1,1,0,38.0,1,0,199,81,0,3
2,1,3,0,26.0,0,0,197,147,2,1
3,1,1,0,35.0,1,0,142,55,2,3
4,0,3,1,35.0,0,0,211,147,2,2


In [None]:
# Separate the table into the feature matrix (everything except 'Survived')
# and the target column ('Survived').
feature_data = df.drop(columns=['Survived'])
target_data = df['Survived']

In [None]:
# Hold out 20% of the rows as the test set.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify keeps the proportion of
# survivors the same in the training and test parts.
trainX, testX, trainY, testY = train_test_split(
    feature_data,
    target_data,
    test_size=0.2,
    random_state=42,
    stratify=target_data,
)

In [None]:
# Train the LightGBM classifier on the training split.
# The hyper-parameters are collected in one dict so they are easy to read and
# tweak; they are forwarded to the constructor unchanged.
lgb_params = {
    'n_estimators': 100,
    'boosting': "goss",
    'top_rate': 0.2,
    'other_rate': 0.1,
}
model = LGBMClassifier(**lgb_params)
model.fit(trainX, trainY)

LGBMClassifier(boosting='goss', other_rate=0.1, top_rate=0.2)

In [None]:
# Evaluate the fitted LightGBM model and compare it with XGBoost.
from xgboost import XGBClassifier


def report_classifier(label, clf, X_train, X_test, y_train, y_test):
    """Print accuracy and ROC AUC for an already fitted binary classifier.

    Parameters
    ----------
    label : str
        Heading printed above the metrics.
    clf : estimator
        Fitted classifier exposing ``predict``, ``predict_proba`` and ``score``.
    X_train, X_test :
        Feature tables of the training and test rows.
    y_train, y_test :
        Target labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    dict
        The printed values under the keys ``'test_accuracy'``,
        ``'train_accuracy'``, ``'score'`` and ``'auc'``.
    """
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    # For scikit-learn style classifiers, score() is the mean accuracy on the
    # given data - it is not an error measure and not an R2 score.
    mean_accuracy = clf.score(X_test, y_test)
    # ROC AUC is computed from the predicted probability of the positive
    # class (column 1), not from the hard 0/1 predictions.
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    print(label)
    print()
    print("* 시험용 데이터로 측정한 정확도 = %.2f" % test_accuracy)
    print()
    print("* 학습용 데이터로 측정한 정확도 = %.2f" % train_accuracy)
    print()
    print('* 시험 데이터 평균 정확도 (model.score) = %.4f' % mean_accuracy)
    print()
    print("* ROC AUC = {0:.4f}".format(auc))

    return {
        'test_accuracy': test_accuracy,
        'train_accuracy': train_accuracy,
        'score': mean_accuracy,
        'auc': auc,
    }


lgb_metrics = report_classifier("LGBClassifier", model, trainX, testX, trainY, testY)

print()
print('='* 50)
print()

# Compare with XGBoost.
# It is trained and scored on the SAME split as LightGBM so the two sets of
# numbers are directly comparable. It is kept in its own variable so that
# `model` still refers to the LightGBM classifier and re-running this cell
# reports the same thing.
xgb_model = XGBClassifier(objective='binary:logistic')
xgb_model.fit(trainX, trainY)

xgb_metrics = report_classifier("XGBClassifier", xgb_model, trainX, testX, trainY, testY)

LGBClassifier

* 시험용 데이터로 측정한 정확도 = 0.88

* 학습용 데이터로 측정한 정확도 = 0.89

* 시험 데이터 전체 오류 (R2-score) = 0.8827

* ROC AUC = 0.9100


XGBClassifier

* 시험용 데이터로 측정한 정확도 = 0.83

* 학습용 데이터로 측정한 정확도 = 0.89

* 시험 데이터 전체 오류 (R2-score) = 0.8268

* ROC AUC = 0.8524
