# 구글 드라이브 연결

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 시드값 및 데이터 경로

In [3]:
SEED = 42
DATA_PATH = "/content/drive/MyDrive/01-python/data/" # 타이타닉 데이터 있는 본인 경로를 넣으시면 됩니다.

# 전처리

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

df = pd.read_csv(f"{DATA_PATH}titanic.csv")

# 결측치 미리 채우기
df.age = df.age.fillna(df.age.median()) # age 중앙값
df.fare = df.fare.fillna(df.fare.median()) # fare 중앙값
df.cabin = df.cabin.fillna("UNK") # cabin 임의의 문자열로 채우기
df.embarked = df.embarked.fillna(df.embarked.mode()[0]) # embarked 최빈값

# 학습에 바로 사용가능한 특성
cols = ["pclass","age","sibsp","parch","fare"]
features = df[cols]

# 범주형 one-hot encoding
cols = ["gender","embarked"]
enc = OneHotEncoder()
tmp = pd.DataFrame(
    enc.fit_transform(df[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
features = pd.concat([features,tmp],axis=1) # 특성
target = df["survived"] # 정답값
features.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


# 스케일링

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_data = scaler.fit_transform(features)

# holdout 방식으로 학습 및 검증 데이터 분리

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_data, target, random_state=SEED)

# lgbm 모델로 학습

In [7]:
from lightgbm import LGBMClassifier
model = LGBMClassifier(random_state=SEED)
model.fit(x_train,y_train)

LGBMClassifier(random_state=42)

# 검증 데이터에 대해 정확도 평가

In [8]:
from sklearn.metrics import accuracy_score
pred = model.predict(x_valid)
accuracy_score(y_valid,pred)

0.8536585365853658

# 비지도 학습인 군집화 모델을 이용해서 타이타닉 생존자 예측하기 
- KMeans 또는 MiniBatchKMeans 를 이용하여 학습데이터를 학습하고 검증데이터에 생존자를 예측해서 정확도를 출력해주세요

In [82]:
from sklearn.cluster import KMeans
kmeans =KMeans(n_clusters=2 ,random_state=SEED)
kmeans.fit(x_train,y_train)

KMeans(n_clusters=2, random_state=42)

In [77]:
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import silhouette_score

In [83]:
def print_score(x,y,pred):
    print(f"homogeneity: {homogeneity_score(y_train,pred)}")
    print(f"completeness: {completeness_score(y_train,pred)}")
    print(f"v_measure: {v_measure_score(y_train,pred)}")
    print(f"silhouette: {silhouette_score(x_train,pred)}")

In [84]:
print_score(x_train ,y_train ,kmeans.labels_)

homogeneity: 0.3810036402229152
completeness: 0.3886837795909278
v_measure: 0.3848053927023046
silhouette: 0.45135582432888405


In [85]:
from sklearn.cluster import MiniBatchKMeans
km = MiniBatchKMeans(n_clusters=2 , random_state=SEED,batch_size=512)
km.fit(x_valid ,y_valid) 

MiniBatchKMeans(batch_size=512, n_clusters=2, random_state=42)

In [86]:
print_score(x_train ,y_train ,kmeans.labels_)

homogeneity: 0.3810036402229152
completeness: 0.3886837795909278
v_measure: 0.3848053927023046
silhouette: 0.45135582432888405


In [90]:
from pandas.core.common import random_state
from sklearn.metrics import roc_auc_score
pred = kmeans.predict(x_valid)
roc_auc_score(y_valid,pred)

0.16270779245941025

In [91]:
accuracy_score(y_valid ,pred)

0.1524390243902439