In [1]:
import pandas as pd
import numpy as np

# Load the Titanic training and test sets from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/titanic_train.csv', 'data/titanic_test.csv')
)

# Report (rows, columns) for each frame, one per line.
print(train.shape, test.shape, sep="\n")

(891, 12)
(418, 11)


In [2]:
# Inspect the column labels of the training set.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
test.columns  # the test set has no Survived column

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
# Show summary statistics for the numeric columns.
# Lets us check the count, max, min, mean, median, and so on.
train.describe()

# Age has many nulls: its count is lower than that of the other columns.

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Extract only the object-dtype columns into their own frame.
# Use it to see which of them are categorical and need to be encoded.
obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [6]:
# Keep the rows of obj_df that have a null in at least one column; show up to 20 of them.
obj_df[obj_df.isnull().any(axis=1)].head(20)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
4,"Allen, Mr. William Henry",male,373450,,S
5,"Moran, Mr. James",male,330877,,Q
7,"Palsson, Master. Gosta Leonard",male,349909,,S
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,347742,,S
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,237736,,C
12,"Saundercock, Mr. William Henry",male,A/5. 2151,,S
13,"Andersson, Mr. Anders Johan",male,347082,,S
14,"Vestrom, Miss. Hulda Amanda Adolfina",female,350406,,S


In [7]:
# First five entries of the Cabin value counts.
obj_df["Cabin"].value_counts().head(5)

B96 B98        4
G6             4
C23 C25 C27    4
E101           3
F33            3
Name: Cabin, dtype: int64

In [2]:
# Work on copies so the encoding steps below leave the loaded frames untouched.
train_c_df = train.copy()
# Copy the *test* frame here: copying train would make test_c_df a duplicate
# of the training data rather than a working copy of the test set.
test_c_df = test.copy()

In [9]:
# Count how many passengers fall into each Sex category.
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
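# Sex could also be encoded by hand like this.
# Note: this maps male=0 / female=1, the reverse of the LabelEncoder result used below (female=0 / male=1).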

# train.loc[train["Sex"] == "male", "Sex"] = 0
# train.loc[train["Sex"] == "female", "Sex"] = 1

# test.loc[test["Sex"] == "male", "Sex"] = 0
# test.loc[test["Sex"] == "female", "Sex"] = 1

In [3]:
# 카테고리 데이터를 인코딩 해준다.
from sklearn.preprocessing import LabelEncoder

# Encode gender as 0 and 1.
def gender_to_int(data):
    """Return a copy of ``data`` whose ``Sex`` column is label-encoded.

    The encoder is fitted on the fixed vocabulary ``["male", "female"]``.
    LabelEncoder assigns codes in sorted class order, so "female" becomes 0
    and "male" becomes 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Sex`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Sex`` replaced by its integer codes. The frame
        passed in is not modified, so the call can be repeated on the same
        raw frame without failing on already-encoded values.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when ``Sex`` holds a label
        outside the fitted vocabulary (per the scikit-learn documentation).
    """
    le = LabelEncoder()
    le.fit(["male", "female"])
    # Encode on a copy: assigning into ``data`` would alter the caller's frame.
    encoded = data.copy()
    encoded["Sex"] = le.transform(encoded["Sex"])
    return encoded

# Encode Sex in both working copies (training copy first), then preview the training copy.
train_c_df, test_c_df = (
    gender_to_int(working_copy) for working_copy in (train_c_df, test_c_df)
)
train_c_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [4]:
# Count how many passengers boarded at each Embarked port code.
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
# Hand-rolled one-hot encoding: one boolean indicator column per port code.
train_c_df["Embarked_C"] = train_c_df["Embarked"].eq("C")
train_c_df["Embarked_S"] = train_c_df["Embarked"].eq("S")
train_c_df["Embarked_Q"] = train_c_df["Embarked"].eq("Q")

# Shapes before and after adding the three indicator columns.
print(train.shape, train_c_df.shape, sep="\n")

# Original column next to its indicators, first ten rows.
train_c_df.loc[:, ["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head(10)

(891, 12)
(891, 15)


Unnamed: 0,Embarked,Embarked_C,Embarked_S,Embarked_Q
0,S,False,True,False
1,C,True,False,False
2,S,False,True,False
3,S,False,True,False
4,S,False,True,False
5,Q,False,False,True
6,S,False,True,False
7,S,False,True,False
8,S,False,True,False
9,C,True,False,False


판다스의 get_dummies로 원핫인코딩

In [6]:
# One-hot encode the categorical columns so that
# a model can work with them as numeric features.
def dummy_data(data, columns):
    """Replace each column named in ``columns`` with its one-hot indicator columns.

    Columns are processed in the order given. For each one, the indicators
    produced by ``pd.get_dummies`` (prefixed with the column name) are appended
    on the right and the source column is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of column labels
        Columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``data`` itself is returned when ``columns`` is empty.
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # Remove the source column first, then append its indicators on the right.
        encoded = pd.concat([encoded.drop(name, axis=1), indicators], axis=1)
    return encoded


# Columns to replace with one-hot indicators, applied to train first and then test.
dummy_columns = ["Sex", "Pclass", "Embarked"]
train_dummy, test_dummy = (
    dummy_data(frame, dummy_columns) for frame in (train, test)
)

# Shapes before encoding.
print('원핫인코딩 전 shape')
print(train.shape, test.shape, sep="\n")

# Shapes after encoding.
print('get_dummies로 원핫인코딩 후 shape')
print(train_dummy.shape, test_dummy.shape, sep="\n")

원핫인코딩 전 shape
(891, 12)
(418, 11)
get_dummies로 원핫인코딩 후 shape
(891, 17)
(418, 16)


In [7]:
# Remove the columns that will not be used, keeping only the feature columns.
def drop_not_concerned(data, columns):
    """Return ``data`` without the given column label(s); ``data`` is not modified."""
    return data.drop(columns=columns)

# Identifier and free-text columns that are dropped from both encoded frames.
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
X_test = drop_not_concerned(test_dummy, not_concerned_columns)
# The training frame additionally drops the Survived column.
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop('Survived', axis=1)

In [8]:
# Preview the first rows of the training feature frame.
X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,35.0,1,0,53.1,1,0,1,0,0,0,0,1
4,35.0,0,0,8.05,0,1,0,0,1,0,0,1
