In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 데이터 시각화 관련
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid') # matplotlib의 스타일에 관련한 함수
%matplotlib inline 

## Load the Scikit-Learn machine-learning estimators used in this notebook.
## Classifiers to try: logistic regression, support vector machines, random forest, and k-nearest neighbors.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the Titanic training and test splits from the local titanic/ folder.
df_train, df_test = pd.read_csv("titanic/train.csv"), pd.read_csv("titanic/test.csv")

In [5]:
# Preview the first rows of the freshly loaded training split.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preview the first rows of the freshly loaded test split.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# Number of training passengers in each ticket class, largest class first.
pclass_counts = df_train["Pclass"].value_counts()
pclass_counts.sort_values(ascending=False)

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [8]:
# One-hot encode the ticket class of each split; the resulting indicator
# columns are labelled with the class values themselves (1, 2, 3).
# NOTE(review): `pclass_tranin_dummies` looks like a typo for "train"; the name
# is kept unchanged in case other cells refer to it.
pclass_tranin_dummies = pd.get_dummies(df_train["Pclass"])
pclass_test_dummies = pd.get_dummies(df_test["Pclass"])

# Swap the ordinal Pclass column for its indicator columns.
# join() lines the two frames up on their row index.
df_train = df_train.drop(columns="Pclass").join(pclass_tranin_dummies)
df_test = df_test.drop(columns="Pclass").join(pclass_test_dummies)

In [9]:
# Check the training frame after Pclass was replaced by its indicator columns.
df_train.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,1,2,3
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,False,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False,False,True
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True,False,False
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,False,False,True


In [10]:
# One-hot encode Sex for both splits.
# The indicator columns are renamed through an explicit mapping instead of
# assigning a positional list to `.columns`: a positional assignment depends on
# the order and number of columns get_dummies happens to produce, so it would
# mislabel columns if that order changed and raise if a split contained only
# one of the two values.
sex_column_names = {"female": "Female", "male": "Male"}
sex_train_dummies = pd.get_dummies(df_train["Sex"]).rename(columns=sex_column_names)
sex_test_dummies = pd.get_dummies(df_test["Sex"]).rename(columns=sex_column_names)

# drop() and join() both return new frames, so the result has to be assigned
# back to the variable; otherwise the existing DataFrame stays unchanged.
df_train = df_train.drop(columns="Sex").join(sex_train_dummies)
df_test = df_test.drop(columns="Sex").join(sex_test_dummies)

In [11]:
# Check the training frame after Sex was replaced by the Female/Male columns.
df_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,1,2,3,Female,Male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,False,False,True,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,True,False,False,True,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,False,False,True,True,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,True,False,False,True,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,False,False,True,False,True


In [12]:
# Fill missing ages with the mean age of the same split.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df["Age"]: that form is a chained in-place update on
# an intermediate Series, which newer pandas versions warn about and which does
# not modify the parent DataFrame when Copy-on-Write is enabled.
# NOTE(review): the test split is imputed with its own mean; consider using the
# training mean instead so no test-set statistic is used in preprocessing.
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())
df_test["Age"] = df_test["Age"].fillna(df_test["Age"].mean())

In [13]:
# Count the null values left in the Age column of df_test.
df_test["Age"].isnull().sum()

0

In [14]:
# Count the null values left in the Age column of df_train.
df_train["Age"].isnull().sum()

0

In [15]:
# Replace missing fares in the test split with 0.
# Assigned back to the column instead of fillna(inplace=True) on df["Fare"],
# which is a chained in-place update that newer pandas versions warn about and
# that does not reach the parent DataFrame under Copy-on-Write.
df_test["Fare"] = df_test["Fare"].fillna(0)

In [16]:
# Count the null values left in the Fare column of df_test.
df_test["Fare"].isnull().sum()

0

In [17]:
# Discard the Cabin column from both splits.
df_train = df_train.drop(columns=["Cabin"])
df_test = df_test.drop(columns=["Cabin"])

In [18]:
# Check the training frame after the Cabin column was removed.
df_train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,1,2,3,Female,Male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,False,False,True,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,True,False,False,True,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,S,False,False,True,True,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,S,True,False,False,True,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,S,False,False,True,False,True


In [19]:
# Target: the Survived column of the training data (a Series).
Y_train = df_train["Survived"]

# Training features: every column of df_train except Survived.
# drop() returns a new frame, so df_train itself is left untouched.
X_train = df_train.drop(columns="Survived")

# Test features: df_test without its PassengerId column, stored as a separate copy.
# NOTE(review): X_train still contains PassengerId while X_test does not, so the
# two feature sets have different columns -- confirm this is reconciled before
# fitting or predicting with a model.
X_test = df_test.drop(columns="PassengerId").copy()

In [20]:
# Object identities of the training frame and the target Series.
id(df_train), id(Y_train)

(2554473968544, 2554456352896)

In [21]:
# Object identities of df_test and X_test; different values mean they are separate objects.
id(df_test), id(X_test)

(2554473971136, 2554456354384)

In [22]:
# Display the training feature frame (df_train without the Survived column).
X_train


Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,1,2,3,Female,Male
0,1,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,S,False,False,True,False,True
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C,True,False,False,True,False
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,S,False,False,True,True,False
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,S,True,False,False,True,False
4,5,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,S,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",27.000000,0,0,211536,13.0000,S,False,True,False,False,True
887,888,"Graham, Miss. Margaret Edith",19.000000,0,0,112053,30.0000,S,True,False,False,True,False
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,W./C. 6607,23.4500,S,False,False,True,True,False
889,890,"Behr, Mr. Karl Howell",26.000000,0,0,111369,30.0000,C,True,False,False,False,True


In [24]:
# Survival outcomes of the passengers whose Female indicator is True.
# Rows and column are selected in a single .loc call rather than chaining
# .loc[mask]["Survived"].
women = df_train.loc[df_train["Female"], "Survived"]

# Mean of the Survived values = survivors / passengers in the group
# (same value as sum/len). Computed with the vectorized Series.mean() instead
# of iterating the Series with the built-in sum().
rate_women = women.mean()

# The value printed is a fraction, not a percentage, despite the "%" label.
print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [25]:
# Survival outcomes of the passengers whose Male indicator is True.
# Rows and column are selected in a single .loc call rather than chaining
# .loc[mask]["Survived"].
men = df_train.loc[df_train["Male"], "Survived"]

# Mean of the Survived values = survivors / passengers in the group
# (same value as sum/len). Computed with the vectorized Series.mean() instead
# of iterating the Series with the built-in sum().
rate_men = men.mean()

# The value printed is a fraction, not a percentage, despite the "%" label.
print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924
