In [1]:
# 데이터 분석에 사용할 라이브러리
import re

import numpy as np
import pandas as pd

In [2]:
# Directory (relative to this notebook) that holds the Titanic CSV files.
DATA_PATH = "../../data/titanic/"

# Read the training CSV into the working frame.
df = pd.read_csv(f"{DATA_PATH}train.csv")

In [3]:
# Normalize every column name to lowercase, then show the resulting index.
df.columns = list(map(str.lower, df.columns))
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

# Train, Test 분리

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Fixed seed so the split below is reproducible.
SEED = 42

# Hold out 20% of the rows and renumber each part from 0,
# discarding the row labels inherited from df.
X_tr, X_te = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=SEED)
)

X_tr.shape, X_te.shape

((712, 12), (179, 12))

# Data Cleansing

In [6]:
# passengerid is unique for every row, so drop it from both splits.
X_tr = X_tr.drop(columns='passengerid')
X_te = X_te.drop(columns='passengerid')

In [7]:
# Most frequent cabin in the training split (first one if several tie).
cabin_mode = X_tr['cabin'].mode().iloc[0]

# Fill missing cabins in both splits with the training-split value.
X_tr = X_tr.fillna({'cabin': cabin_mode})
X_te = X_te.fillna({'cabin': cabin_mode})

In [8]:
# Median age of the training split, computed once and reused for both splits.
# Filling a column with its own median leaves the median unchanged, so this
# is the same value the test split would otherwise receive.
age_median = X_tr['age'].median()

X_tr['age'] = X_tr['age'].fillna(age_median)
X_te['age'] = X_te['age'].fillna(age_median)

In [9]:
# Most frequent embarkation value in the training split (first one if several tie).
embarked_mode = X_tr['embarked'].mode().iloc[0]

# Fill missing embarkation values in both splits with the training-split value.
X_tr = X_tr.fillna({'embarked': embarked_mode})
X_te = X_te.fillna({'embarked': embarked_mode})

In [10]:
# Total number of missing cells remaining in each split (train, test).
tuple(frame.isna().sum().sum() for frame in (X_tr, X_te))

(0, 0)

# 데이터 타입 변환

In [11]:
# survived: cast to 32-bit integers in both splits.
X_tr = X_tr.astype({"survived": "int32"})
X_te = X_te.astype({"survived": "int32"})

In [12]:
# pclass: cast to the pandas categorical dtype in both splits.
# Each split is converted on its own, so its categories come from its own values.
X_tr = X_tr.assign(pclass=X_tr["pclass"].astype("category"))
X_te = X_te.assign(pclass=X_te["pclass"].astype("category"))

In [13]:
# age
X_tr["age"] = X_tr["age"].astype("int32")
X_te["age"] = X_te["age"].astype("int32")

In [14]:
# sibsp: cast to the pandas categorical dtype in both splits.
X_tr = X_tr.assign(sibsp=X_tr["sibsp"].astype("category"))
X_te = X_te.assign(sibsp=X_te["sibsp"].astype("category"))

In [15]:
# parch: cast to the pandas categorical dtype in both splits.
X_tr = X_tr.astype({"parch": "category"})
X_te = X_te.astype({"parch": "category"})

In [16]:
# fare
X_tr["fare"] = X_tr["fare"].astype("float32")
X_te["fare"] = X_te["fare"].astype("float32")

In [17]:
# sex: cast to the pandas categorical dtype in both splits.
X_tr = X_tr.assign(sex=X_tr["sex"].astype("category"))
X_te = X_te.assign(sex=X_te["sex"].astype("category"))

In [18]:
# embarked: cast to the pandas categorical dtype in both splits.
X_tr = X_tr.astype({"embarked": "category"})
X_te = X_te.astype({"embarked": "category"})

# Make Features!!

In [19]:
# (rows, columns) of the train and test splits.
tuple(frame.shape for frame in (X_tr, X_te))

((712, 11), (179, 11))

In [20]:
# Keep independent copies of both splits as they are at this point;
# later cells compare X_tr against `train` after adding columns.
train, test = X_tr.copy(), X_te.copy()

tuple(frame.shape for frame in (train, test))

((712, 11), (179, 11))

In [21]:
# Honorifics recognised in passenger names. add_designation() only looks at
# the keys; the values are Korean descriptions of each honorific.
dict_designation = {
    'Mr.': '남성',
    'Master.': '남성',
    'Sir.': '남성',
    'Miss.': '미혼 여성',
    'Mrs.': '기혼 여성',
    'Ms.': '미혼/기혼 여성',
    'Lady.': '숙녀',
    'Mlle.': '아가씨',
    # occupations
    'Dr.': '의사',
    'Rev.': '목사',
    'Major.': '계급',
    'Don.': '교수',
    'Col.': '군인',
    'Capt.': '군인',
    # nobility
    'Mme.': '영부인',
    'Countess.': '백작부인',
    'Jonkheer.': '귀족'
}

def add_designation(name):
    """Return the honorific contained in a passenger name.

    The name is scanned left to right for whole words that end in a period
    (e.g. "Mr.", "Countess."); the first such word that is a key of
    ``dict_designation`` is returned, including its trailing period.

    Matching whole words avoids false hits inside longer words: a plain
    substring test would find "Ms." inside "Williams." and report it instead
    of the real honorific.

    Args:
        name: Passenger name string, e.g. "Braund, Mr. Owen Harris".

    Returns:
        The matching key of ``dict_designation``, or "unknown" when no key
        occurs as a whole word or ``name`` is not a string.
    """
    if not isinstance(name, str):
        return "unknown"
    # Each candidate is a run of letters immediately followed by a period.
    for token in re.findall(r"[A-Za-z]+\.", name):
        if token in dict_designation:
            return token
    return "unknown"

# Derive the honorific column for both splits from the passenger name.
X_tr['designation'] = X_tr['name'].map(add_designation)
X_te['designation'] = X_te['name'].map(add_designation)

# Compare against the snapshot to confirm one column was added.
tuple(frame.shape for frame in (X_tr, train))

((712, 12), (712, 11))

In [22]:
# last_name is everything before the first comma of the name.
X_tr['last_name'] = X_tr['name'].map(lambda full_name: full_name.partition(',')[0])
X_te['last_name'] = X_te['name'].map(lambda full_name: full_name.partition(',')[0])

tuple(frame.shape for frame in (X_tr, train))

((712, 13), (712, 11))

In [23]:
def add_ticket_number(ticket):
    """Extract the trailing number from a ticket string.

    The ticket is split on single spaces and the last piece is parsed as an
    integer, e.g. "A/5 21171" -> 21171 and "113803" -> 113803.

    Args:
        ticket: Ticket value, normally a string.

    Returns:
        The integer value of the last space-separated piece, or 0 when that
        piece is not an integer (e.g. the ticket "LINE") or ``ticket`` has no
        ``split`` method (e.g. a missing value).
    """
    try:
        return int(ticket.split(' ')[-1])
    except (AttributeError, ValueError):
        # AttributeError: ticket is not a string.
        # ValueError: the last piece is not numeric, e.g. the ticket "LINE".
        # Anything else (KeyboardInterrupt, ...) is deliberately not swallowed.
        return 0

# Numeric part of each ticket, stored as 32-bit integers in both splits.
X_tr['ticket_number'] = X_tr['ticket'].map(add_ticket_number).astype("int32")
X_te['ticket_number'] = X_te['ticket'].map(add_ticket_number).astype("int32")

tuple(frame.shape for frame in (X_tr, train))

((712, 14), (712, 11))

In [24]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [27]:
def add_agg_cols(col_list, train_df=None, test_df=None):
    """Add constant summary-statistic columns for each listed column.

    For every column in ``col_list`` the statistics are computed from the
    training frame only and the same scalar is written to a new column
    ``<col>_<stat>`` in both the training and the test frame:

    * numeric dtype: ``mean``, ``median``, ``min``, ``max``
    * any other dtype: ``mode`` (first mode if several tie) and ``nunique``

    Both frames are modified in place; nothing is returned.

    Args:
        col_list: Iterable of column names present in the training frame.
        train_df: Frame the statistics are computed from and written to.
            Defaults to the notebook-level ``X_tr``.
        test_df: Second frame that receives the same values.
            Defaults to the notebook-level ``X_te``.
    """
    if train_df is None:
        train_df = X_tr
    if test_df is None:
        test_df = X_te

    numeric_aggs = ('mean', 'median', 'min', 'max')

    # Snapshot the names first: the loop adds columns to train_df, and callers
    # commonly pass train_df.columns itself.
    for col in list(col_list):
        source = train_df[col]
        if is_numeric_dtype(source.dtype):
            stats = {agg: source.agg(agg) for agg in numeric_aggs}
        else:
            # mode() returns a Series (possibly empty when every value is
            # missing) while nunique() returns a scalar, so each is handled
            # explicitly instead of probing with try/except.
            modes = source.mode()
            stats = {
                'mode': modes.iloc[0] if len(modes) else float('nan'),
                'nunique': source.nunique(),
            }

        for agg, value in stats.items():
            new_key = col + '_' + agg  # e.g. age_mean
            train_df[new_key] = value
            test_df[new_key] = value

In [28]:
# Generate the summary-statistic columns for every column currently in X_tr.
add_agg_cols(col_list=X_tr.columns)

In [29]:
# (rows, columns) of the snapshot versus the feature-enriched training frame.
tuple(frame.shape for frame in (train, X_tr))

((712, 11), (712, 50))

In [31]:
# Show every column now present in the training frame.
X_tr.columns 

Index(['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'designation', 'last_name',
       'ticket_number', 'survived_mean', 'survived_median', 'survived_min',
       'survived_max', 'pclass_mode', 'pclass_nunique', 'name_mode',
       'name_nunique', 'sex_mode', 'sex_nunique', 'age_mean', 'age_median',
       'age_min', 'age_max', 'sibsp_mode', 'sibsp_nunique', 'parch_mode',
       'parch_nunique', 'ticket_mode', 'ticket_nunique', 'fare_mean',
       'fare_median', 'fare_min', 'fare_max', 'cabin_mode', 'cabin_nunique',
       'embarked_mode', 'embarked_nunique', 'designation_mode',
       'designation_nunique', 'last_name_mode', 'last_name_nunique',
       'ticket_number_mean', 'ticket_number_median', 'ticket_number_min',
       'ticket_number_max'],
      dtype='object')