In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the training and test sets; both are also kept in one list so that the
# same transformation can be applied to each of them in a loop
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
combine = [train_df, test_df]

# Data Preparation

## Dropping features
Based on our assumptions, we want to drop the Cabin and Ticket features

In [3]:
# Discard the Ticket and Cabin columns from both frames, then rebuild the shared
# list so that it points at the trimmed frames
train_df, test_df = (
    frame.drop(columns=['Ticket', 'Cabin']) for frame in (train_df, test_df)
)
combine = [train_df, test_df]

The Name and PassengerId columns must be dropped too, but first we have to extract the Title feature from Name

## Creating new features

### Title feature

In [4]:
# Derive a Title column from each passenger's Name: the pattern captures the first
# run of word characters that is immediately followed by a period (e.g. "Mr.").
# The pattern is written as a raw string so that \w and \. reach the regex engine
# unchanged instead of being parsed as (invalid) Python string escapes.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r'(\w+\.)', expand=False)

In [5]:
# Frequency of every extracted title: training frame first, then test frame
display(*(frame["Title"].value_counts() for frame in combine[:2]))

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
Ms.            1
Lady.          1
Sir.           1
Countess.      1
Jonkheer.      1
Capt.          1
Mme.           1
Don.           1
Name: Title, dtype: int64

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Dona.        1
Ms.          1
Dr.          1
Name: Title, dtype: int64

- We can group the rare titles (Lady, Countess, Capt, Col, Don, Dr, Major, Rev, Sir, Jonkheer, Dona) under a single "Rare" label
- Replace Mlle by Miss
- Replace Ms by Miss
- Replace Mme by Mrs

In [6]:
# Titles that occur too rarely to be useful on their own
rare_replacements = [
    'Lady.', 'Countess.', 'Capt.', 'Col.', 'Don.', 'Dr.',
    'Major.', 'Rev.', 'Sir.', 'Jonkheer.', 'Dona.',
]

# Collapse the rare titles into one "Rare" bucket, then fold the remaining
# spelling variants into the common title they correspond to
for dataset in combine:
    dataset["Title"] = (
        dataset["Title"]
        .replace(rare_replacements, "Rare")
        .replace({"Mlle.": "Miss.", "Ms.": "Miss.", "Mme.": "Mrs."})
    )

In [7]:
# Title frequencies after consolidation: training frame first, then test frame
display(*(frame["Title"].value_counts() for frame in combine[:2]))

Mr.        517
Miss.      185
Mrs.       126
Master.     40
Rare        23
Name: Title, dtype: int64

Mr.        240
Miss.       79
Mrs.        72
Master.     21
Rare         6
Name: Title, dtype: int64

In [8]:
# Survival rate per title: the mean of Survived within each Title group
train_df.groupby('Title', as_index=False)['Survived'].mean()

Unnamed: 0,Title,Survived
0,Master.,0.575
1,Miss.,0.702703
2,Mr.,0.156673
3,Mrs.,0.793651
4,Rare,0.347826


- Passengers with the Miss or Mrs title are more likely to survive. Both are female titles
- Males with the Mr title are more likely to die: only 15.67% of them survived

**Now we can drop the Name feature from both datasets, and the PassengerId feature from the training set**

In [9]:
# Take the Title-augmented frames back out of `combine` and strip the identifier
# columns. Only Name is removed from the test frame; its PassengerId column stays
# (presumably needed later to label predictions - confirm against later cells).
train_df = combine[0].drop(columns=['Name', 'PassengerId'])
test_df = combine[1].drop(columns=['Name'])

combine = [train_df, test_df]

train_df.shape, test_df.shape

((891, 9), (418, 9))

## Converting categorical features to numerical

In [10]:
# Integer codes for the two text-valued features.
# Series.map looks every value up in the given dict; a value that is not one of
# the dict's keys becomes NaN.
sex_mapping = {'female': 1, 'male': 0}
title_mapping = {
    "Mr.": 1,
    "Miss.": 2,
    "Mrs.": 3,
    "Master.": 4,
    "Rare": 5,
}

for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)
    dataset['Title'] = dataset['Title'].map(title_mapping)

# Re-bind the working names to the frames held in `combine`
train_df, test_df = combine[0], combine[1]

In [11]:
# Preview the first five rows of the encoded training frame
train_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


# Handle missing values
We will perform these operations:
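- Impute the missing "Age" values with the mean age of each dataset
- Drop the rows that still contain missing values afterwards (for example, rows with no "Embarked" value)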

In [12]:
# Two-step clean-up applied to each frame in `combine`
for dataset in combine:
    # Step 1: fill the gaps in Age with the mean age of that same frame
    dataset['Age'] = dataset['Age'].fillna(value=dataset['Age'].mean())
    # Step 2: remove every row that still has a missing value in any column;
    # done in place so the frame objects held in `combine` are the ones modified
    dataset.dropna(axis='index', how='any', inplace=True)

# Resulting (rows, columns) of each frame
tuple(frame.shape for frame in combine)

((891, 9), (418, 9))