In [None]:
!pip install pycaret==3.0.0

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pycaret.classification import *

In [None]:
# Read the passenger table from disk; `data_df2` keeps an independent copy of
# the frame as loaded, since later cells modify `data_df`.
data_df = pd.read_csv('titanic.csv')
data_df2 = data_df.copy(deep=True)
data_df.head(n=3)

In [None]:
# Split the 'Cabin' string (deck/num/side) into three separate columns.

# Missing cabins get the sentinel 'Z/9999/Z' so that they also split into
# three parts (deck 'Z', num '9999', side 'Z').
data_df['Cabin'] = data_df['Cabin'].fillna('Z/9999/Z')

# Split on '/' into one column per component and name them deck/num/side.
split_data = data_df['Cabin'].str.split('/', expand=True)
split_data.columns = ['Deck', 'Num', 'Side']

# Drop any Deck/Num/Side columns left over from a previous run of this cell
# before attaching the fresh ones. Without this, re-running the cell appends
# duplicate columns and data_df['Deck'] would return a DataFrame.
data_df = pd.concat(
    [data_df.drop(columns=list(split_data.columns), errors='ignore'), split_data],
    axis=1,
)

data_df.head()

In [None]:
# Reduce PassengerId ("gggg_xx") to its four-character group id "gggg".
data_df['PassengerId'] = data_df['PassengerId'].str[:4]

# Group ids that occur on two or more rows.
grouped = data_df.groupby('PassengerId')
grouped_count = grouped.size()
index_list = grouped_count.loc[grouped_count.ge(2)].index.tolist()
condition = data_df['PassengerId'].isin(index_list)

# Rows belonging to a multi-passenger group.
shared_rows = data_df[condition]

# Same group but HomePlanet is missing (original note: 92/201).
print(shared_rows[shared_rows.HomePlanet.isnull()], '\n')
# Same group but Destination is missing (original note: 79/182).
print(shared_rows[shared_rows.Destination.isnull()])

In [None]:
# Check whether any PassengerId group contains more than one distinct HomePlanet.
# NOTE(review): the original note said every group was consistent; confirm
# against the printed result.

# passenger_homeplanet maps group id -> list of the distinct non-null
# HomePlanet values seen for that group, in order of first appearance.
passenger_homeplanet = {}
known_rows = data_df.loc[data_df['HomePlanet'].notna(), ['PassengerId', 'HomePlanet']]
for group_id, planet in zip(known_rows['PassengerId'], known_rows['HomePlanet']):
    # setdefault creates the list only on first sight of the group, so a
    # repeated planet never resets what was already recorded.
    planets = passenger_homeplanet.setdefault(group_id, [])
    if planet not in planets:
        planets.append(planet)

# Groups with two or more different home planets.
result = {key: value for key, value in passenger_homeplanet.items() if len(value) >= 2}
print("homeplanet 다른 경우 =>",result)

In [None]:
# Deck letter -> home planet, used to infer a missing HomePlanet from the
# deck component of the cabin string.
deck_hoemplanet_dict = {
    'A': 'Europa',
    'B': 'Europa',
    'C': 'Europa',
    'D': 'Mars',
    'E': 'Earth',
    'F': 'Earth',
    'G': 'Earth',
}


def hoemplanet_pre_processing(df):
    """Return the HomePlanet for one passenger row, inferring it when missing.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        df: a single row exposing ``HomePlanet``, ``PassengerId`` and
            ``Cabin`` attributes (``Cabin`` formatted as deck/num/side).

    Returns:
        The row's own HomePlanet when present; otherwise, in priority order:
        1. the first planet recorded for the row's PassengerId in the
           module-level ``passenger_homeplanet`` mapping,
        2. the planet mapped to the cabin's deck in ``deck_hoemplanet_dict``,
        3. the fallback ``'Earth'``.
    """
    if not pd.isna(df.HomePlanet):
        return df.HomePlanet

    # Priority 1: another member of the same group has a known planet.
    known_planets = passenger_homeplanet.get(df.PassengerId)
    if known_planets:
        return known_planets[0]

    # Priority 2: infer from the deck, which is the FIRST component of the
    # cabin string (index 0); index 2 is the side and never matches the dict.
    if not pd.isna(df.Cabin):
        deck = df.Cabin.split('/')[0]
        if deck in deck_hoemplanet_dict:
            return deck_hoemplanet_dict[deck]

    # Priority 3: hard-coded fallback.
    return 'Earth'


# Recompute HomePlanet row by row with hoemplanet_pre_processing.
data_df['HomePlanet'] = data_df.apply(hoemplanet_pre_processing, axis='columns')


In [None]:
# Fill missing destinations with 'TRAPPIST-1e' (hard-coded here; the original
# note describes it as the most frequent value).
data_df['Destination'] = data_df['Destination'].fillna('TRAPPIST-1e')

In [None]:
# Spending columns: fill missing amounts with 0, then add a row total.
# (The original heading mentions CryoSleep, but this cell only touches the
# five spending columns.)
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Assign the filled columns back explicitly. Calling
# `data_df.<col>.fillna(0, inplace=True)` mutates through a chained accessor,
# which does not reliably update the parent frame under pandas copy-on-write.
data_df[spend_cols] = data_df[spend_cols].fillna(0)
data_df.info()

# 'sum_5' is the per-row total of the five spending columns.
data_df['sum_5'] = data_df[spend_cols].sum(axis=1)
data_df.head()


In [None]:
# Column groupings by role.
drop_cols = [
    "PassengerId",
    "Cabin",
    "Name",
]
# Numeric features; the KNN imputation cell reads this list.
numeric_cols = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
cat_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation (5 neighbours) over the numeric feature columns.
imp = KNNImputer(n_neighbors=5)
data = imp.fit_transform(data_df[numeric_cols])

# fit_transform returns a NumPy array, so wrap it in a DataFrame again.
# Reuse data_df's index: column assignment aligns on index labels, so a
# default 0..n-1 index would misalign rows whenever data_df's index is not
# the default one.
_data_df = pd.DataFrame(data=data, columns=numeric_cols, index=data_df.index)

# Overwrite each numeric column with its imputed version.
for num_col in numeric_cols:
    data_df[num_col] = _data_df[num_col]

In [None]:
# Add 'AgeGroup': left-closed decade buckets [0, 10), [10, 20), ..., [70, 80),
# each labelled with its lower bound. Ages outside [0, 80) become NaN.
age_bin_edges = list(range(0, 81, 10))
age_bin_labels = age_bin_edges[:-1]
data_df['AgeGroup'] = pd.cut(
    data_df['Age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
)
# NOTE: this count is computed but not shown, because only the last
# expression of a cell is displayed.
data_df.AgeGroup.value_counts().sort_index()
data_df.AgeGroup

In [None]:
# Build the modelling frame: remove the listed columns and cast CryoSleep to
# str (any missing value becomes the literal string 'nan').
columns_to_remove = ["PassengerId", "Cabin", "Name", "Num", "sum_5", 'Age', 'VIP']
new_df = (
    data_df
    .drop(columns=columns_to_remove)
    .assign(CryoSleep=lambda frame: frame['CryoSleep'].astype(str))
)

In [None]:
# PyCaret experiment setup: 'Transported' is the target and the listed
# columns are declared categorical.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'AgeGroup']
model = setup(
    data=new_df,
    target='Transported',
    categorical_features=categorical_columns,
    train_size=0.7,  # default value (per the original note)
    session_id=9,  # random seed
)

In [None]:
# After the intermediate preprocessing, compare candidate models sorted by
# accuracy and keep six of them (n_select=6).
top_6_models = compare_models(n_select=6, sort='Accuracy')