In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the train and test CSVs; `combine` lets later cells apply one step to both frames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))
combine = [train_df, test_df]

首先检查一下数据的基本情况

In [4]:
# Preview the first 10 rows of the training frame.
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Print each column's non-null count and dtype for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


# info()显示Age、Cabin、Embarked有null值

In [6]:
# Count how often each distinct Cabin value occurs in the training frame.
train_df.Cabin.value_counts()

C23 C25 C27        4
G6                 4
B96 B98            4
D                  3
C22 C26            3
E101               3
F2                 3
F33                3
B57 B59 B63 B66    2
C68                2
B58 B60            2
E121               2
D20                2
E8                 2
E44                2
B77                2
C65                2
D26                2
E24                2
E25                2
B20                2
C93                2
D33                2
E67                2
D35                2
D36                2
C52                2
F4                 2
C125               2
C124               2
                  ..
F G63              1
A6                 1
D45                1
D6                 1
D56                1
C101               1
C54                1
D28                1
D37                1
B102               1
D30                1
E17                1
E58                1
F E69              1
D10 D12            1
E50                1
A14          

# Cabin数据太少（891条记录中仅204条非空），没有明显相关性，暂不作为特征。

In [None]:
train_df.describe() # summary statistics of the numeric columns

- 生存率38.4%
- 平均年龄29.7

In [None]:
train_df.describe(include=['O']) # summary of the object-dtype columns

In [None]:
train_df.Pclass.value_counts() # number of rows for each distinct Pclass value

In [None]:
# Mean of Survived for each Pclass value.
train_df[['Pclass', 'Survived']].groupby('Pclass').mean()

- Pclass=1的乘客生存率最高
- Pclass数值越大（即舱位等级越低），乘客生存率越低
- Pclass与生存率相关，可以作为一个特征

In [None]:
# Mean of Survived for each Sex value.
train_df[['Sex', 'Survived']].groupby('Sex').mean()

- 女性乘客生存率很高
- Sex与生存率相关，可以作为一个特征

In [None]:
# Encode Sex as an integer flag (female -> 1, male -> 0) in both frames.
# Values outside the mapping become NaN, so this cell must run only once per load.
sex_codes = {'female': 1, 'male': 0}
for frame in combine:
    frame['Sex'] = frame['Sex'].map(sex_codes)
train_df.head()

In [None]:
# Look at names next to the outcome. Only the first rows are shown so the
# notebook does not render the entire frame.
train_df[['Name', 'Survived']].head(10)

In [None]:
# Extract the word that directly precedes the first '.' in Name into a new
# Title column, for both frames.
# The pattern is a raw string: in a normal string literal `\w` and `\.` are
# invalid escape sequences, which newer Python versions warn about.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r'(\w*)\.', expand=False)
train_df.head()

In [None]:
# Count how many training rows carry each extracted Title.
train_df.Title.value_counts()

In [None]:
# Mean of Survived for each extracted Title.
train_df[['Title', 'Survived']].groupby('Title').mean()

In [None]:
# Cross-tabulate Title against Sex (row = Title, column = Sex, cell = row count).
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Collapse spelling variants of the same title in both frames.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_synonyms)

# Build the integer encoding ONCE, from the training frame: the four most
# frequent titles get codes 0..3 (most frequent first) and everything else
# shares one "rare" code. Deriving the codes separately per frame (as before)
# lets the same integer mean different titles in train and test whenever the
# frequency order differs.
top_titles = train_df['Title'].value_counts().index[:4]
title_codes = {title: code for code, title in enumerate(top_titles)}
rare_code = len(title_codes)

for dataset in combine:
    # Titles missing from the mapping (including NaN when no title was
    # extracted) come back from map() as NaN and are assigned the rare code.
    dataset['Title'] = dataset['Title'].map(title_codes).fillna(rare_code).astype(int)

train_df.Title.value_counts()

In [None]:
# Mean of Survived for each encoded Title; as_index=False keeps Title as a regular column.
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Histogram of the known (non-null) ages in the training frame, 80 bins.
fig, ax = plt.subplots()
known_ages = train_df.Age.dropna()
ax.hist(known_ages, bins=80)

In [None]:
# NOTE(review): the commented-out lines are a manual two-panel histogram of Age
# split by Survived. They are disabled, so only plt.show() executes in this cell.
# The FacetGrid cell further down draws the same comparison.
#fig = plt.figure()
#ax = fig.add_subplot(121)
#ax.hist(train_df.loc[train_df['Survived']==1].Age.dropna(), bins=20)
#ax = fig.add_subplot(122)
#ax.hist(train_df.loc[train_df['Survived']==0].Age.dropna(), bins=20)
plt.show()

In [None]:
# One Age histogram (20 bins) per Survived value, laid out as columns.
g = sns.FacetGrid(train_df, col='Survived').map(plt.hist, 'Age', bins=20)
plt.show()

In [None]:
# Scatter plot with Fare on the x axis and Age on the y axis.
fig, ax = plt.subplots()
ax.scatter(x=train_df.Fare, y=train_df.Age)
plt.show()

Fare与Age相关性貌似不高

In [None]:
# One Age histogram (20 bins) per Sex value, laid out as columns.
g = sns.FacetGrid(train_df, col='Sex').map(plt.hist, 'Age', bins=20)
plt.show()

In [None]:
# One Age histogram (20 bins) per Pclass value, laid out as columns.
g = sns.FacetGrid(train_df, col='Pclass').map(plt.hist, 'Age', bins=20)
plt.show()

In [None]:
# One Age histogram (20 bins) per Title value, laid out as columns.
g = sns.FacetGrid(train_df, col='Title').map(plt.hist, 'Age', bins=20)
plt.show()

In [None]:
# Fill missing Age values using Title: each missing age becomes the median age
# of the rows with the same Title in the SAME frame.
# The previous version selected rows with `train_df['Title'] == title` even
# while processing test_df, so the test medians came from unrelated rows.
for dataset in combine:
    median_age_by_title = dataset.groupby('Title')['Age'].transform('median')
    dataset['Age'] = dataset['Age'].fillna(median_age_by_title)
    # A Title group with no known age at all still yields NaN above; fall back
    # to the frame-wide median so no missing ages remain.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

In [None]:
# Side-by-side histograms: SibSp on the left, Parch on the right.
fig, axes = plt.subplots(1, 2)
for ax, column in zip(axes, ['SibSp', 'Parch']):
    ax.hist(train_df[column])
plt.show()

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves;
# IsAlone is 1 exactly when FamilySize equals 1, otherwise 0.
for frame in combine:
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

# Mean of Survived for IsAlone = 0 and IsAlone = 1.
train_df[['IsAlone', 'Survived']].groupby('IsAlone', as_index=False).mean()

In [None]:
# Fill missing Embarked values with the training frame's most frequent port,
# then encode the port letters as integers (S -> 0, C -> 1, Q -> 2).
freq_port = train_df['Embarked'].mode().iloc[0]
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in combine:
    frame['Embarked'] = frame['Embarked'].fillna(freq_port).map(port_codes).astype(int)

In [None]:
# Fill missing Fare values with each frame's own median fare.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# triggers warnings in recent pandas and does not update the frame at all
# under copy-on-write.
for dataset in combine:
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Split the training fares into four quantile bands and show the mean of
# Survived per band, ordered by band.
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

In [None]:
# Replace Fare with an ordinal band in both frames:
#   0: Fare <= 7.91, 1: (7.91, 14.454], 2: (14.454, 31], 3: Fare > 31.
# Intervals are closed on the right, matching pd.cut's default.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for frame in combine:
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

# FareBand is no longer needed; drop() returns a new frame, so `combine` is
# rebuilt to point at it.
train_df = train_df.drop(columns=['FareBand'])
combine = [train_df, test_df]

In [None]:
# Remove columns that are not used as model inputs.
# PassengerId is dropped from the training frame only; test_df keeps it.
unused_columns = ['Cabin', 'Ticket', 'Name', 'SibSp', 'Parch', 'FamilySize']
train_df = train_df.drop(columns=unused_columns + ['PassengerId'])
test_df = test_df.drop(columns=unused_columns)
combine = [train_df, test_df]

In [None]:
# Preview the first rows of the test frame after the column drop.
test_df.head()

In [None]:
# Print non-null counts and dtypes of the remaining test columns.
test_df.info()

In [None]:
# Preview the first rows of the training frame after the column drop.
train_df.head()

In [None]:
# Print non-null counts and dtypes of the remaining training columns.
train_df.info()

# 开始预测

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Target is the Survived column; training features are every other column.
# Test features are the test frame without PassengerId.
y_train = train_df['Survived']
x_train = train_df.drop(columns=['Survived'])
x_test = test_df.drop(columns=['PassengerId']).copy()
x_train.shape, y_train.shape, x_test.shape

In [None]:
# Logistic regression. acc_log is the accuracy on the training data itself
# (percent, 2 decimals), not a held-out estimate.
logReg = LogisticRegression().fit(x_train, y_train)
log_pred = logReg.predict(x_test)
train_accuracy = logReg.score(x_train, y_train)
acc_log = round(train_accuracy * 100, 2)
acc_log

In [None]:
# Support vector classifier with default settings. acc_svc is the accuracy on
# the training data itself (percent, 2 decimals).
svc = SVC().fit(x_train, y_train)
svc_pred = svc.predict(x_test)
train_accuracy = svc.score(x_train, y_train)
acc_svc = round(train_accuracy * 100, 2)
acc_svc

In [None]:
# 3-nearest-neighbours classifier. acc_knn is the accuracy on the training
# data itself (percent, 2 decimals).
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
knn_pred = knn.predict(x_test)
train_accuracy = knn.score(x_train, y_train)
acc_knn = round(train_accuracy * 100, 2)
acc_knn

In [None]:
# Gaussian naive Bayes. acc_gaussian is the accuracy on the training data
# itself (percent, 2 decimals).
gaussian = GaussianNB().fit(x_train, y_train)
gaussian_pred = gaussian.predict(x_test)
train_accuracy = gaussian.score(x_train, y_train)
acc_gaussian = round(train_accuracy * 100, 2)
acc_gaussian

In [None]:
# Linear support vector classifier.
# random_state fixes the estimator's pseudo-random number generation so that
# re-running the notebook reproduces the same fit and score.
# acc_linear_svc is the accuracy on the training data itself (percent,
# 2 decimals), not a held-out estimate.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
linear_svc_pred = linear_svc.predict(x_test)
acc_linear_svc = round(linear_svc.score(x_train, y_train) * 100, 2)
acc_linear_svc

In [None]:
# Perceptron.
# random_state fixes the estimator's pseudo-random number generation so that
# re-running the notebook reproduces the same fit and score.
# acc_perceptron is the accuracy on the training data itself (percent,
# 2 decimals), not a held-out estimate.
perceptron = Perceptron(random_state=0)
perceptron.fit(x_train, y_train)
prec_pred = perceptron.predict(x_test)
acc_perceptron = round(perceptron.score(x_train, y_train) * 100, 2)
acc_perceptron

In [None]:
# Linear classifier trained with stochastic gradient descent.
# random_state fixes the estimator's pseudo-random number generation so that
# re-running the notebook reproduces the same fit and score.
# acc_sgd is the accuracy on the training data itself (percent, 2 decimals),
# not a held-out estimate.
sgd = SGDClassifier(random_state=0)
sgd.fit(x_train, y_train)
sgd_pred = sgd.predict(x_test)
acc_sgd = round(sgd.score(x_train, y_train) * 100, 2)
acc_sgd

In [None]:
# Decision tree.
# random_state fixes the estimator's pseudo-random number generation so that
# re-running the notebook reproduces the same tree and predictions.
# acc_decision_tree is the accuracy on the training data itself (percent,
# 2 decimals); an unconstrained tree can memorise its training rows, so this
# number is an optimistic estimate of real performance.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(x_train, y_train)
dt_pred = decision_tree.predict(x_test)
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
acc_decision_tree

In [None]:
# Random forest with 100 trees.
# random_state fixes the estimator's pseudo-random number generation so that
# re-running the notebook reproduces the same forest and predictions.
# acc_random_forest is the accuracy on the training data itself (percent,
# 2 decimals), not a held-out estimate.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(x_train, y_train)
rf_pred = random_forest.predict(x_test)
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
acc_random_forest

In [None]:
# Table of the scores computed in the cells above, best first.
# Each name is paired directly with its score so the two cannot drift out of
# alignment the way parallel lists can. The scores are accuracies measured on
# the training data, so they rank fit to the training set rather than
# generalisation.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Descent', acc_sgd),  # label typo "Decent" fixed
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

In [None]:
# Submission frame: PassengerId from the test frame plus the decision-tree
# predictions as the Survived column.
submission = test_df[['PassengerId']].assign(Survived=dt_pred)