## 使用Nearest Centroid Classifier預測Titanic乘客的存活機率
吳紀瑩

## 1. 讀入資料

In [77]:
import pandas as pd
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_train.csv"
titanic_data = pd.read_csv(url)
titanic_train = pd.read_csv(url)

In [78]:
# 用head()預設讀取前五筆資料
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [79]:
#describe()發現Age欄位有多筆遺漏值
titanic_train.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 2. 填補Age遺漏值
用中位數

In [80]:
import numpy as np

age_median = np.nanmedian(titanic_train["Age"]) # 要忽略 NaN
new_Age = np.where(titanic_train["Age"].isnull(), age_median, titanic_train["Age"])
titanic_train["Age"] = new_Age
print(titanic_train.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


## 將類別型變數轉為dummy variables

In [81]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train["Sex"])
Pclass = titanic_train["Pclass"] #Pclass不用轉

## 建立X和y矩陣

In [82]:
titanic_X = pd.DataFrame([Pclass,
                          encoded_Sex,
                          new_Age
]).T

titanic_y = titanic_train["Survived"]

## 建立測試樣本和訓練樣本

In [83]:
#測試設30%, 訓練設70%
from sklearn.cross_validation import train_test_split

train_X, test_X, train_y, test_y = train_test_split(titanic_X, titanic_y, test_size = 0.3)

 ## 使用Nearest Centroid Classifier預測Titanic乘客的存活機率

In [84]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
NN_clf = NearestCentroid()
titanic_NN_clf = NN_clf.fit(train_X,train_y)

test_y_predicted = NN_clf.predict(test_X)

from sklearn.cross_validation import cross_val_score
scores = cross_val_score(titanic_NN_clf,titanic_x,titanic_y,cv=10,scoring='accuracy')
print(scores.mean())

0.766610770628


In [85]:
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_train.csv"
to_submit = pd.read_csv(url)

encoded_Sex_to_submit = label_encoder.fit_transform(to_submit["Sex"])
Pclass_to_submit = to_submit["Pclass"]
age_median = np.nanmedian(to_submit["Age"]) 
imputed_Age = np.where(to_submit["Age"].isnull(), age_median, to_submit["Age"])

to_submit_X = pd.DataFrame([encoded_Sex_to_submit,
                            Pclass_to_submit,
                            imputed_Age
]).T

to_submit_y = NN_clf.predict(to_submit_X)
print(to_submit_y)

to_submit_dict = {
    "PassengerId": to_submit["PassengerId"],
    "Survived": to_submit_y
}
to_submit_df = pd.DataFrame(to_submit_dict)

to_submit_df.to_csv("to_submit.csv", index = False)

[1 0 1 0 0 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 0 1 0
 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 1
 1 1 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 1
 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0
 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 1 1 0 0
 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 1 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0
 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 1
 1 1 1 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1
 1 0 0 1 1 0 0 0 1 1 1 0 