In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import pandas as pd

In [35]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv('gender_submission.csv')

## 觀察資料缺失值情況

In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [37]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## 查看資料基本統計數據

In [38]:
train.describe()

  interpolation=interpolation)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [39]:
test.describe()

  interpolation=interpolation)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,,0.0,0.0,
50%,1100.5,3.0,,0.0,0.0,
75%,1204.75,3.0,,1.0,0.0,
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## 選出想要的特徵

In [40]:
selected_features = ['Pclass', 'Age', 'Fare', 'Sex']

In [41]:
X_train = train[selected_features]

In [42]:
y_train = train['Survived']

In [43]:
X_test = test[selected_features]

## 處理缺失值

In [44]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Pclass    891 non-null int64
Age       714 non-null float64
Fare      891 non-null float64
Sex       891 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 27.9+ KB


In [45]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
Pclass    418 non-null int64
Age       332 non-null float64
Fare      417 non-null float64
Sex       418 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 13.1+ KB


In [46]:
X_train['Age'].fillna(X_train['Age'].mean()+5, inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


## 檢查補值狀況

In [47]:
X_train.dtypes

Pclass      int64
Age       float64
Fare      float64
Sex        object
dtype: object

In [48]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
Pclass    418 non-null int64
Age       418 non-null float64
Fare      418 non-null float64
Sex       418 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 13.1+ KB


In [49]:
X_train.head()

Unnamed: 0,Pclass,Age,Fare,Sex
0,3,22.0,7.25,male
1,1,38.0,71.2833,female
2,3,26.0,7.925,female
3,1,35.0,53.1,female
4,3,35.0,8.05,male


In [50]:
X_test.head()

Unnamed: 0,Pclass,Age,Fare,Sex
0,3,34.5,7.8292,male
1,3,47.0,7.0,female
2,2,62.0,9.6875,male
3,3,27.0,8.6625,male
4,3,22.0,12.2875,female


## 把類別變數轉成Dummy

In [51]:
X_train = pd.get_dummies(X_train)

## 針對Sex的column進行轉換

In [52]:
X_test_Sex = pd.get_dummies(X_test["Sex"])

In [53]:
X_test_Sex.head()

Unnamed: 0,female,male
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


## 將X_test 和 X_test_Sex 兩個dataframe合併

In [54]:
X_test = X_test.join(X_test_Sex)

In [55]:
X_test.head()

Unnamed: 0,Pclass,Age,Fare,Sex,female,male
0,3,34.5,7.8292,male,0.0,1.0
1,3,47.0,7.0,female,1.0,0.0
2,2,62.0,9.6875,male,0.0,1.0
3,3,27.0,8.6625,male,0.0,1.0
4,3,22.0,12.2875,female,1.0,0.0


In [56]:
X_test = X_test.drop("Sex", axis=1)

In [57]:
X_test.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,34.5,7.8292,0.0,1.0
1,3,47.0,7.0,1.0,0.0
2,2,62.0,9.6875,0.0,1.0
3,3,27.0,8.6625,0.0,1.0
4,3,22.0,12.2875,1.0,0.0


## 利用SVM 分類

In [58]:
clf = SVC()

## 使用 K-Fold Cross Validation 看一下分類的準確度

In [59]:
cross_val_score(clf, X_train, y_train, cv=10).mean()

0.69930626489615255

## 訓練模型

In [61]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## 將訓練好的模型拿來做預測

In [63]:
survived_predict = clf.predict(X_test)

## 結果輸出

In [64]:
submission=pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':survived_predict})

In [65]:
submission.to_csv('submission.csv', index=False)