In [130]:
# 라이브러리 import
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [142]:
# Load the datasets
def get_data(data_dir: str = '.') -> tuple:
    """Read the train and test CSV files and return them as DataFrames.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
            Defaults to the current working directory, which matches the
            previously hard-coded ``./train.csv`` / ``./test.csv`` paths.

    Returns:
        A ``(train_df, test_df)`` tuple of freshly read DataFrames.
    """
    train_df = pd.read_csv(f'{data_dir}/train.csv')
    test_df = pd.read_csv(f'{data_dir}/test.csv')
    return train_df, test_df

# Read both CSV files into the notebook namespace.
train_df, test_df = get_data()

In [132]:
# Preview the first rows of the training data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [133]:
# Column dtypes and non-null counts of the training data
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [134]:
# Summary statistics of the training data
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### 1. 기본 전처리

In [111]:
# Embarked has only two missing values, so they will be filled with the most frequent value.
# Check which Embarked value occurs most often.
train_df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [112]:
# Baseline preprocessing function.
# Cabin is dropped because it has too many missing values; Ticket and PassengerId
# are dropped because they are assumed to have little relation to survival.
# Missing Embarked values become 'S' and missing Age values become the mean Age.
def titanic_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a Titanic frame.

    Removes the Cabin, PassengerId and Ticket columns, then fills missing
    Embarked entries with 'S' and missing Age entries with the mean Age of
    the frame that was passed in.
    """
    return (
        df
        .drop(columns=['Cabin', 'PassengerId', 'Ticket'])
        .assign(
            Embarked=lambda frame: frame['Embarked'].fillna('S'),
            Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()),
        )
    )

In [113]:
# Apply the preprocessing to the training data and inspect the resulting columns
train_preprocess_df = titanic_preprocessing(train_df)
train_preprocess_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [114]:
# Name is assumed to carry no signal at this stage, so it is dropped.
# The remaining object-typed columns are one-hot encoded.
def titanic_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a model-ready copy of a Titanic frame.

    Drops Cabin, PassengerId, Ticket and Name, fills missing Embarked values
    with 'S' and missing Age values with the mean Age of the given frame, and
    one-hot encodes Sex and Embarked (the first level of each is dropped).
    """
    kept = df.drop(columns=['Cabin', 'PassengerId', 'Ticket', 'Name'])
    kept = kept.assign(
        Embarked=kept['Embarked'].fillna('S'),
        Age=kept['Age'].fillna(kept['Age'].mean()),
    )
    return pd.get_dummies(kept, columns=['Sex', 'Embarked'], drop_first=True)

In [115]:
# Preprocess both the training and test frames, then display the training result
train_preprocess_df = titanic_preprocessing(train_df)
test_preprocess_df = titanic_preprocessing(test_df)
train_preprocess_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,True,False,True
1,1,1,38.000000,1,0,71.2833,False,False,False
2,1,3,26.000000,0,0,7.9250,False,False,True
3,1,1,35.000000,1,0,53.1000,False,False,True
4,0,3,35.000000,0,0,8.0500,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,False,True
887,1,1,19.000000,0,0,30.0000,False,False,True
888,0,3,29.699118,1,2,23.4500,False,False,True
889,1,1,26.000000,0,0,30.0000,True,False,False


In [116]:
# First experiment
# Separate the label column from the feature columns
y_labels = train_preprocess_df['Survived']
X_features = train_preprocess_df.drop('Survived', axis=1)

In [117]:
# Split into training and hold-out sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.2,
    random_state=55
)

In [118]:
# Create the classifier
xgb_clf = XGBClassifier(
    n_estimators=100
)

In [119]:
# Fit on the training split
xgb_clf.fit(X_train, y_train)

# Predict on the hold-out split and report accuracy
predict = xgb_clf.predict(X_test)
score = accuracy_score(y_test, predict)
print(f'Accuracy : {score:.4f}')

Accuracy : 0.8156


### 2. 이름에서 Mr. Mrs. Miss. Master.등 구분 추출

In [120]:
# 2. Extract the title (Mr., Mrs., Miss., Master., ...) from each name
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped
# dot; in a plain string literal `\.` is an invalid escape sequence and triggers
# a SyntaxWarning on recent Python versions.
train_df['ExtractName'] = train_df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Cross-tabulate the extracted title against Sex
pd.crosstab(train_df['ExtractName'], train_df['Sex'])

Sex,female,male
ExtractName,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [121]:
# Check the titles in the test data as well
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped
# dot; in a plain string literal `\.` is an invalid escape sequence and triggers
# a SyntaxWarning on recent Python versions.
test_df['ExtractName'] = test_df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
pd.crosstab(test_df['ExtractName'], test_df['Sex'])

Sex,female,male
ExtractName,Unnamed: 1_level_1,Unnamed: 2_level_1
Col,0,2
Dona,1,0
Dr,0,1
Master,0,21
Miss,78,0
Mr,0,240
Mrs,72,0
Ms,1,0
Rev,0,2


In [122]:
# Preview the training frame, which now carries the ExtractName column
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ExtractName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [123]:
# Titles other than Master, Miss, Mr and Mrs are rare, so they are grouped as Others.
# Equivalent titles are merged: Mlle -> Miss, Ms -> Miss, Mme -> Mrs.
train_df['ExtractName'] = train_df['ExtractName'].replace(
    ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir', 'Dona'], 'Others'
    )
train_df['ExtractName'] = train_df['ExtractName'].replace(['Mlle', 'Ms'], 'Miss')
train_df['ExtractName'] = train_df['ExtractName'].replace('Mme', 'Mrs')

# Survival rate for each extracted title
train_df[['ExtractName', 'Survived']].groupby('ExtractName', as_index=False).mean()

Unnamed: 0,ExtractName,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Others,0.347826


In [152]:
# Preprocessing pipeline
def titanic_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a model-ready copy of a Titanic frame, including a title feature.

    Steps:
      1. Fill missing Embarked values with 'S' and missing Age values with the
         mean Age of the frame that was passed in.
      2. Extract the title that precedes the first '.' in Name into ExtractName,
         group rare titles as 'Others', and merge Mlle/Ms into Miss and Mme
         into Mrs.
      3. Drop Cabin, PassengerId, Ticket and Name.
      4. One-hot encode Sex, Embarked and ExtractName, dropping the first level
         of each.

    The input frame is left untouched; all work happens on a copy.

    Note: the dummy columns are derived from the values present in ``df``, so
    two frames with different category sets produce different column sets.

    Args:
        df: Raw Titanic frame containing at least the columns Name, Sex, Age,
            Embarked, Cabin, PassengerId and Ticket.

    Returns:
        A new, preprocessed DataFrame.
    """
    # Copy first: the column assignments below would otherwise write the filled
    # values and the ExtractName column into the caller's frame.
    df = df.copy()
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    # Raw string so `\.` is a regex-escaped dot rather than an invalid string escape.
    df['ExtractName'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    df['ExtractName'] = df['ExtractName'].replace(
        ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir', 'Dona'], 'Others'
    )
    df['ExtractName'] = df['ExtractName'].replace(['Mlle', 'Ms'], 'Miss')
    df['ExtractName'] = df['ExtractName'].replace('Mme', 'Mrs')
    df = df.drop(['Cabin', 'PassengerId', 'Ticket', 'Name'], axis=1)
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'ExtractName'], drop_first=True)
    return df

In [153]:
# Reload the raw CSV files
train_df, test_df = get_data()

# Preprocess both frames and display the training result
train_preprocess_df = titanic_preprocessing(train_df)
test_preprocess_df = titanic_preprocessing(test_df)
train_preprocess_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,ExtractName_Miss,ExtractName_Mr,ExtractName_Mrs,ExtractName_Others
0,0,3,22.000000,1,0,7.2500,True,False,True,False,True,False,False
1,1,1,38.000000,1,0,71.2833,False,False,False,False,False,True,False
2,1,3,26.000000,0,0,7.9250,False,False,True,True,False,False,False
3,1,1,35.000000,1,0,53.1000,False,False,True,False,False,True,False
4,0,3,35.000000,0,0,8.0500,True,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,False,True,False,False,False,True
887,1,1,19.000000,0,0,30.0000,False,False,True,True,False,False,False
888,0,3,29.699118,1,2,23.4500,False,False,True,True,False,False,False
889,1,1,26.000000,0,0,30.0000,True,False,False,False,True,False,False


In [154]:
# Split the data, build/fit the classifier, and score it => returns the fitted classifier
def process(preprocess_df: pd.DataFrame):
    """Train an XGBoost classifier on a preprocessed frame and print its accuracy.

    The 'Survived' column is used as the label and every other column as a
    feature. 20% of the rows are held out (random_state=55) for scoring, and
    the hold-out accuracy is printed.

    Args:
        preprocess_df: Preprocessed frame that contains a 'Survived' column.

    Returns:
        The fitted XGBClassifier.
    """
    target = preprocess_df['Survived']
    features = preprocess_df.drop(columns='Survived')

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=55
    )

    classifier = XGBClassifier(n_estimators=100)
    classifier.fit(X_train, y_train)

    # Score on the hold-out split
    score = accuracy_score(y_test, classifier.predict(X_test))
    print(f'Accuracy : {score:.4f}')

    return classifier

In [158]:
# Second experiment
model = process(train_preprocess_df)

Accuracy : 0.8324


### 3. 나이 분류 전처리

In [163]:
# Reload the data
train_df, test_df = get_data()

In [164]:
# Preprocess the training frame again and display it
train_preprocess_df = titanic_preprocessing(train_df)
train_preprocess_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,ExtractName_Miss,ExtractName_Mr,ExtractName_Mrs,ExtractName_Others
0,0,3,22.000000,1,0,7.2500,True,False,True,False,True,False,False
1,1,1,38.000000,1,0,71.2833,False,False,False,False,False,True,False
2,1,3,26.000000,0,0,7.9250,False,False,True,True,False,False,False
3,1,1,35.000000,1,0,53.1000,False,False,True,False,False,True,False
4,0,3,35.000000,0,0,8.0500,True,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,False,True,False,False,False,True
887,1,1,19.000000,0,0,30.0000,False,False,True,True,False,False,False
888,0,3,29.699118,1,2,23.4500,False,False,True,True,False,False,False
889,1,1,26.000000,0,0,30.0000,True,False,False,False,True,False,False


In [None]:
# TODO: Age binning preprocessing (this cell was left unfinished)

In [159]:
# Predict on the preprocessed test frame for the Kaggle submission
predict_test = model.predict(test_preprocess_df)

In [160]:
# Load the sample submission file, overwrite its Survived column with the
# predictions, and save the result to a new CSV
result = pd.read_csv('./gender_submission.csv')
result['Survived'] = predict_test

result.to_csv('./result.csv', index=False)