In [45]:
import seaborn as sns
import pandas as pd

titanic = sns.load_dataset("titanic")
df = pd.DataFrame(data=titanic)
# # 데이터프레임 내의 모든 bool 타입의 원소를 찾아 숫자형으로 변환
# df = df.map(lambda x: int(x) if type(x) == bool else x)
print(df.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


# 칼럼 설명

1. survived
- 생존 여부
- 0이면 사망, 1이면 생존

2. pclass
- 객실 등급
- 1이면 1등급, 2이면 2등급, 3이면 3등급

3. sex
- 성별
- male이면 남자, female이면 여자

4. age
- 나이

5. sibsp
- 함께 탑승한 형제 및 배우자 수

6. parch
- 함께 탑승한 자녀 및 부모 수

7. fare
- 요금

8. embarked
- 탑승지 이름 앞글자
- C는 Cherbourg, Q는 Queenstown, S는 Southampton

9. class
- 객실 등급
- First면 1등급, Second면 2등급, Third면 3등급

10. who
- 남자, 여자, 아이
- man, woman, child

11. adult_male
- 성인 남자인지 여부
- True면 성인 남자, False면 그외

12. deck
- 선실 번호 첫 알파벳
- A, B, C, D, E, F, G

13. embark_town
- 탑승지 이름
- Cherbourg, Queenstown, Southampton

14. alive
- 생존여부
- no면 사망, yes면 생존

15. alone
- 혼자 탑승했는지 여부
- True면 혼자 탑승, False면 가족과 함께 탑승

In [46]:
# 데이터 타입 확인
print(df.dtypes)

# 범주형 데이터만 선택하기
cat_df = df.select_dtypes(include=['object', 'category'])

# 수치형 데이터만 선택하기
num_df = df.select_dtypes(include=['int64', 'float64'])

In [47]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def dummy(data, col):
    lab = LabelEncoder()  # 범주형 자료를 수치화
    aa = lab.fit_transform(data[col]).reshape(-1, 1)
    one = OneHotEncoder(sparse_output=False)  # 가변수화
    column_names = [col + '_' + str(i) for i in lab.classes_]
    return pd.DataFrame(one.fit_transform(aa), columns=column_names)

In [60]:
# 모든 범주형 데이터에 대해 더미 변수 생성 후 합치기
temp_dfs = [num_df]  # 수치형 데이터 프레임을 먼저 추가
for column in cat_df:
    temp_dfs.append(dummy(df, column))  # 각 범주형 변수에 대해 생성된 더미 변수 데이터프레임을 추가

new_df = pd.concat(temp_dfs, axis=1)  # 모든 데이터프레임을 합침
new_df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,...,deck_E,deck_F,deck_G,deck_nan,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_nan,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,3,26.0,0,0,7.925,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0,3,35.0,0,0,8.05,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [63]:
from sklearn.model_selection import train_test_split

target = ['alive_no', 'alive_yes']  # 타겟 열 이름을 리스트로 지정 # df['alive_no', 'alive_yes'] X
features = new_df.drop(columns = target).columns

train, test = train_test_split(new_df, test_size = 0.3, random_state = 2)

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [64]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2

# Null값의 처리를 위해 SimpleImputer 처리를 먼저 한다.
sim = SimpleImputer()
X_train_sim = sim.fit_transform(X_train, y_train)
X_test_sim = sim.transform(X_test)

# 특성 선택을 위한 SelectKBest를 진행 - K는 특성 수
selector = SelectKBest(score_func = chi2, k = 10)
X_train_selected = selector.fit_transform(X_train_sim, y_train)
X_test_selected = selector.transform(X_test_sim)

# 선택된 column의 이름을 확인하는 작업
all_names = X_train.columns
selected_mask = selector.get_support()
selected_name = all_names[selected_mask]
selected_name

Index(['survived', 'pclass', 'fare', 'sex_female', 'sex_male', 'class_First',
       'class_Third', 'who_man', 'who_woman', 'deck_B'],
      dtype='object')