In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

# Show every column when a DataFrame is displayed (no truncation with "...").
# Call the option API on the package itself rather than via the accidental
# `pd.pandas` self-reference attribute.
pd.set_option('display.max_columns', None)

In [2]:
# Load the labelled training data (contains the 'Survived' target column).
dataset = pd.read_csv('../input/titanic/train.csv')

In [3]:
# Preview the first rows of the training data.
dataset.head()

In [4]:
# (rows, columns) of the training data.
dataset.shape

In [6]:
# Column dtypes and non-null counts.
dataset.info()

In [7]:
# Summary statistics of the numeric columns.
dataset.describe()

In [8]:
# Summary (count / unique / top / freq) of the object-typed columns.
dataset.describe(include=['O'])

In [9]:
# Report, column by column, how many training values are missing.
n_rows = dataset.shape[0]
for column, n_missing in dataset.isnull().sum().items():
    print(column, n_missing, 'of', n_rows, 'values are missing')

# Test data

In [11]:
# Load the test data that predictions will be made for.
test = pd.read_csv('../input/titanic/test.csv')

In [12]:
# (rows, columns) of the test data.
test.shape

In [13]:
# Report, column by column, how many test values are missing.
n_test_rows = test.shape[0]
for column, n_missing in test.isnull().sum().items():
    print(column, n_missing, 'of', n_test_rows, 'values are missing')

In [14]:
# Summary (count / unique / top / freq) of the object-typed test columns.
test.describe(include = 'O')

## Relationship between survival and Features

In [15]:
# Split the training data by outcome and print the class balance.
survived = dataset[dataset['Survived'] == 1]
died = dataset[dataset['Survived'] == 0]

n_total = len(dataset)
n_survived = len(survived)
n_died = len(died)

print('Survived %i (%.1f%%)'%(n_survived, n_survived/n_total*100))
print('Died %i (%.1f%%)'%(n_died, n_died/n_total*100))
print('Total %i '%(n_total))

In [16]:
# Pclass dependence: number of passengers in each ticket class.
dataset['Pclass'].value_counts()

In [17]:
# Ticket-class counts within each outcome group (Survived = 0 / 1).
dataset.groupby('Survived')['Pclass'].value_counts()

Survival seems to be correlated with the Pclass

In [18]:
# Mean of the 0/1 Survived flag per class, i.e. the survival rate per Pclass.
dataset[['Pclass', 'Survived']].groupby(['Pclass']).mean()

In [21]:
# Bar chart of the survival rate per Pclass.
sns.barplot(x='Pclass', y='Survived', data=dataset)

In [22]:
# Correlation heatmap of the numeric columns only. The frame still holds
# string columns (Name, Sex, Ticket, ...) at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2, so select numeric dtypes first.
sns.heatmap(dataset.select_dtypes(include='number').corr())

## Relation between sex and survival

In [23]:
# Number of passengers of each sex.
dataset.Sex.value_counts()

In [24]:
# Sex counts within each outcome group (Survived = 0 / 1).
dataset.groupby('Survived').Sex.value_counts()

In [25]:
# Survival rate per sex (mean of the 0/1 Survived flag).
dataset[['Sex','Survived']].groupby('Sex').mean()

In [26]:
# Bar chart of the survival rate per sex.
sns.barplot(x = 'Sex', y = 'Survived', data = dataset)

In [29]:
# Contingency table: passenger counts by ticket class (rows) and sex (columns).
tab = pd.crosstab(dataset['Pclass'], dataset['Sex'])
print (tab)

In [32]:
# Normalise each Pclass row of the crosstab by its total so the stacked bars
# show the share of each sex within a class.
class_totals = tab.sum(axis=1).astype(float)
sex_share_by_class = tab.div(class_totals, axis=0)
sex_share_by_class.plot(kind="bar", stacked=True)
plt.xlabel('Pclass')
plt.ylabel('Percentage')

In [33]:
# Survival rate by sex, one line per ticket class.
# `factorplot` was renamed to `catplot` and its `size` argument to `height`;
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Sex', y='Survived', hue='Pclass', kind='point', height=4, aspect=2, data=dataset)

In [34]:
# Survival rate by ticket class and sex, one panel per port of embarkation.
# `factorplot` was renamed to `catplot`; kind='point' keeps the point plot
# that factorplot drew by default.
sns.catplot(x='Pclass', y='Survived', hue='Sex', col='Embarked', kind='point', data=dataset)

Almost all females in the first and second classes survived, while most of the female deaths occurred in the third class. Males in first class do not have a much higher survival probability than males in the other classes.


## Embarked vs Survival

In [35]:
# Number of passengers per port of embarkation.
dataset.Embarked.value_counts()

In [36]:
# Outcome counts (Survived = 0 / 1) within each port of embarkation.
dataset.groupby('Embarked').Survived.value_counts()

In [37]:
# Bar chart of the survival rate per port of embarkation.
sns.barplot(x = 'Embarked',y = 'Survived', data = dataset)

In [38]:
# Survival rate per port of embarkation (mean of the 0/1 Survived flag).
dataset[['Embarked','Survived']].groupby('Embarked').mean()

## Parch vs Survived

In [39]:
# Number of passengers for each Parch value.
dataset['Parch'].value_counts()

In [40]:
# Parch counts within each outcome group (Survived = 0 / 1).
dataset.groupby('Survived').Parch.value_counts()

In [41]:
# Bar chart of the survival rate per Parch value; ci=None turns the error bars off.
sns.barplot(x = 'Parch',y = 'Survived', ci = None,data = dataset)

In [42]:
# Survival rate per Parch value (mean of the 0/1 Survived flag).
dataset[['Parch','Survived']].groupby('Parch').mean()

## SibSp vs Survived

In [43]:
# Number of passengers for each SibSp value.
dataset.SibSp.value_counts()

In [44]:
# SibSp counts within each outcome group (Survived = 0 / 1).
dataset.groupby('Survived').SibSp.value_counts()

In [45]:
# Survival rate per SibSp value (mean of the 0/1 Survived flag).
dataset[['SibSp','Survived']].groupby('SibSp').mean()

In [46]:
# Bar chart of the survival rate per SibSp value; ci=None turns the error bars off.
sns.barplot(x = 'SibSp', y = 'Survived', ci = None, data = dataset)

## Age vs Survival

In [47]:
# One row of three panels. Each panel shows the age distribution split by
# outcome (hue = Survived), grouped by Embarked, Pclass and Sex respectively.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

sns.violinplot(data=dataset, x="Embarked", y="Age", hue="Survived", split=True, ax=ax1)
sns.violinplot(data=dataset, x="Pclass", y="Age", hue="Survived", split=True, ax=ax2)
sns.violinplot(data=dataset, x="Sex", y="Age", hue="Survived", split=True, ax=ax3)

Almost all children in Pclass = 2 survived. Young people in Pclass = 1 survived more often than those in the other classes. First class has very few children compared to the other classes, and many older people. Most children in 3rd class survived.

Most male children (between age 0 to 14) survived. Females with age between 18 to 40 have better survival chance.


In [48]:
# Subsets of the training data by outcome and by sex.
is_survivor = dataset['Survived'] == 1
is_casualty = dataset['Survived'] == 0
is_male = dataset['Sex'] == "male"
is_female = dataset['Sex'] == "female"

total_survived = dataset[is_survivor]
total_not_survived = dataset[is_casualty]
male_survived = dataset[is_survivor & is_male]
female_survived = dataset[is_survivor & is_female]
male_not_survived = dataset[is_casualty & is_male]
female_not_survived = dataset[is_casualty & is_female]

age_bins = range(0, 81, 1)  # one bin per year of age, 0-80


def _age_hist_pair(survivors, casualties, axlabel):
    """Overlay two age histograms on the current axes: survivors in blue,
    non-survivors in red. Rows with a missing Age are skipped."""
    sns.distplot(survivors['Age'].dropna().values, bins=age_bins, kde=False, color='blue')
    return sns.distplot(casualties['Age'].dropna().values, bins=age_bins, kde=False, color='red', axlabel=axlabel)


# Figure 1: all passengers.
plt.figure(figsize=[15,5])
plt.subplot(111)
_age_hist_pair(total_survived, total_not_survived, 'Age')

# Figure 2: females (left) and males (right).
plt.figure(figsize=[15,5])

plt.subplot(121)
_age_hist_pair(female_survived, female_not_survived, 'Female Age')

plt.subplot(122)
_age_hist_pair(male_survived, male_not_survived, 'Male Age')

Combining both male and female, we can see that children with age between 0 to 5 have better chance of survival. Females with age between "18 to 40" and "50 and above" have higher chance of survival. Males with age between 0 to 14 have better chance of survival.


## Correlation

In [49]:
# Annotated correlation heatmap of the numeric features (PassengerId is an
# identifier, so it is left out). String columns are excluded explicitly
# because DataFrame.corr() raises on non-numeric data in pandas >= 2.
plt.figure(figsize=(15,6))
numeric_features = dataset.drop('PassengerId',axis=1).select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), vmax=0.6, square= True, annot=True)

# Feature Extraction

## Name Feature

Let's first extract titles from Name column.


In [50]:
# Keep references to both frames in a list (they are not merged) so that every
# feature-engineering step below is applied to train and test alike.
train_test_data = [dataset, test]

for data in train_test_data:
    # The title is the word directly before a period, e.g. "Smith, Mr. John" -> "Mr".
    # A raw string keeps `\.` a regex escape instead of an invalid Python string
    # escape, and expand=False returns a Series rather than a one-column frame.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [52]:
# Check the extracted titles on the first rows of the test data.
test['Title'].head()

As we see, we added a new column with a title of a person


In [53]:
# How often each title occurs, split by sex.
pd.crosstab(dataset['Title'], dataset['Sex'])

We now replace some less common titles with the name "Other".


In [54]:
# Titles collapsed into the single category 'Other'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings folded into their common equivalent.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for data in train_test_data:
    data['Title'] = data['Title'].replace(rare_titles, 'Other').replace(title_synonyms)

# Survival rate for each of the remaining titles.
dataset[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

After that, we convert the categorical Title values into numeric form.


In [55]:
# Integer code per title; a title missing from the mapping becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for data in train_test_data:
    data['Title'] = data['Title'].map(title_mapping).fillna(0)

In [56]:
# Inspect the training data after the Title encoding.
dataset.head()

## Sex Feature

In [67]:
# Encode Sex as a number: female -> 1, male -> 0.
# The mapping has its own name so it no longer overwrites `title_mapping`.
# NOTE: this cell is not idempotent; run a second time, the already-numeric
# values are absent from the mapping and would become NaN.
sex_mapping = {'female':1,'male':0}
for data in train_test_data:
    data['Sex'] = data['Sex'].map(sex_mapping)

## Embarked Feature

In [59]:
# Embarked: for each frame, print the count per port and then the number of
# missing values, to pick the value used to fill the gaps.
for data in train_test_data:
    embarked = data['Embarked']
    print(embarked.value_counts())
    print(embarked.isnull().sum())

In [60]:
# Fill missing ports with 'S' (chosen as the most frequent value, per the
# counts printed in the previous cell).
for data in train_test_data:
    data['Embarked'] = data['Embarked'].fillna('S')

In [61]:
# Integer code for each port of embarkation.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for data in train_test_data:
    data['Embarked'] = data['Embarked'].map(embarked_codes)

In [68]:
# Inspect the training data after the Sex and Embarked encodings.
dataset.head()

## Age feature

Replace each missing value in the Age column with a random integer drawn from within one standard deviation of the mean age.


In [70]:
# Impute missing ages with random integers within one standard deviation of
# the mean age. Train and test are handled independently, each with its own
# mean and standard deviation.
age_rng = np.random.RandomState(42)  # fixed seed so that Restart & Run All reproduces the same ages
for data in train_test_data:
    age_mean = data['Age'].mean()
    age_std = data['Age'].std()
    age_missing = data['Age'].isnull()

    # randint expects integer bounds (the upper bound is exclusive), so the
    # float limits are truncated explicitly.
    age_fill = age_rng.randint(int(age_mean - age_std), int(age_mean + age_std),
                               size=age_missing.sum())

    # Assign through .loc. The chained form data['Age'][mask] = ... writes to a
    # temporary object under pandas copy-on-write (leaving the NaNs in place)
    # and raises SettingWithCopyWarning on older versions.
    data.loc[age_missing, 'Age'] = age_fill
    data['Age'] = data['Age'].astype(int)

In [71]:
# Cut Age into 5 equal-width bands and print the survival rate of each band.
dataset['AgeBand'] = pd.cut(dataset['Age'], 5)
age_band_survival = dataset[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()
print (age_band_survival)

In [72]:
# Replace Age by the index (0-4) of its band, with hard-coded cut points at
# 16, 32, 48 and 64.
inner_age_bands = [(16, 32, 1), (32, 48, 2), (48, 64, 3)]  # (lower, upper, code): lower < Age <= upper
for data in train_test_data:
    age = data['Age'].copy()  # compare against the original ages, not the codes being written
    data.loc[age <= 16, 'Age'] = 0
    for lower, upper, code in inner_age_bands:
        data.loc[(age > lower) & (age <= upper), 'Age'] = code
    data.loc[age > 64, 'Age'] = 4

In [73]:
# Inspect the training data after Age was replaced by its band index.
dataset.head()

## Fare Feature

In [74]:
# Fill missing fares with the mean fare of the same frame.
# fillna + assignment replaces the chained form data.Fare[mask] = ..., which
# writes to a temporary object under pandas copy-on-write and raises
# SettingWithCopyWarning on older versions.
for data in train_test_data:
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [75]:
# Cut Fare into 4 quantile-based bands (quartiles) and print the survival rate
# of each band.
dataset['FareBand'] = pd.qcut(dataset['Fare'], 4)
fare_band_survival = dataset[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean()
print (fare_band_survival)

In [76]:
# Replace Fare by the index (0-3) of its band, with hard-coded cut points at
# 7.91, 14.454 and 31.
inner_fare_bands = [(7.91, 14.454, 1), (14.454, 31, 2)]  # (lower, upper, code): lower < Fare <= upper
for data in train_test_data:
    fare = data['Fare'].copy()  # compare against the original fares, not the codes being written
    data.loc[fare <= 7.91, 'Fare'] = 0
    for lower, upper, code in inner_fare_bands:
        data.loc[(fare > lower) & (fare <= upper), 'Fare'] = code
    data.loc[fare > 31, 'Fare'] = 3
    data['Fare'] = data['Fare'].astype(int)

In [77]:
# Inspect the training data after Fare was replaced by its band index.
dataset.head()

## SibSp and Parch

In [78]:
# Family = SibSp + Parch + 1 (the trailing 1 counts the passenger).
for data in train_test_data:
    data['Family'] = data['SibSp'].add(data['Parch']).add(1)

# Survival rate for each family size.
family_survival = dataset[['Family','Survived']].groupby(['Family'],as_index = False).mean()
family_survival

As we can see, the survival probability is highest for families with 2-4 members.


In [79]:
# IsAlone is 1 when the family size is exactly 1, otherwise 0.
for data in train_test_data:
    data['IsAlone'] = (data['Family'] == 1).astype(int)

In [80]:
# Survival rate for passengers with (IsAlone = 0) and without (IsAlone = 1) family aboard.
dataset[['IsAlone','Survived']].groupby(['IsAlone'],as_index = False).mean()

With the IsAlone column added, we can see that people who were on the ship alone had a lower probability of survival.

# Feature Selection

We drop the columns that have been replaced by better engineered features, as well as the auxiliary columns. We also drop the Cabin column because of its numerous missing values.


In [81]:
# Columns removed from both frames: raw inputs of engineered features plus
# columns that are not used as model inputs.
shared_drop = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Family']

# The training frame also loses its id and the helper band columns. The test
# frame keeps PassengerId, which the submission file needs.
dataset = dataset.drop(columns=shared_drop + ['PassengerId', 'AgeBand', 'FareBand'])
test = test.drop(columns=shared_drop)

# Modeling

In [82]:
# Target vector and feature matrices. PassengerId is an identifier, not a
# feature, so it is removed from the test features.
y_train = dataset['Survived']
X_train = dataset.drop(columns='Survived')
X_test = test.drop(columns='PassengerId')

We are going to use classification algorithms, as we need to classify whether a passenger survived or not. 
Used algorithms:
<br>Logistic Regression
<br>Support Vector Machines
<br>Linear Support Vector Machines
<br>Stochastic Gradient Descent
<br>k-Nearest Neighbours
<br>Decision Tree
<br>Random Forest
<br>Perceptron
<br>Naive Bayes

In [84]:
# Candidate classifiers. The order must match `models_list`, because the
# results table pairs names and accuracies by position.
# random_state is fixed on the estimators that use randomness so that a
# Restart & Run All reproduces the same scores.
models = [
    LogisticRegression(),
    SVC(),
    LinearSVC(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
    KNeighborsClassifier(10),
    SGDClassifier(max_iter=100, tol=None, random_state=0),
    Perceptron(max_iter=100, tol=None, random_state=0),
    GaussianNB(),
]

In [85]:
# Display names of the classifiers, in the same order as the `models` list:
# names and accuracies are later paired by position in the results table.
models_list = ['Logistic Regression','Support Vector Machines','Linear Support Vector Machines',\
               'Decision Tree','Random Forest','k-Nearest Neighbours', 'Stochastic Gradient Descent',\
               'Perceptron','Naive Bayes']
# Filled with one accuracy per model by the training loop.
accuracy_list = []

In [86]:
# Fit every candidate model and record its accuracy.
# The list is reset here so that re-running this cell does not append a second
# set of scores, which would break the length match with `models_list`.
accuracy_list = []
for clf in models:
    clf.fit(X_train, y_train)
    # score() is evaluated on the data the model was fitted on, so this is the
    # training accuracy and not an estimate of performance on unseen data.
    accuracy = round(100*clf.score(X_train, y_train), 2)
    print(str(clf) + ' Training accuracy {}%'.format(accuracy))
    accuracy_list.append(accuracy)

In [87]:
# Results table indexed by model name, best accuracy first.
models_dataframe = pd.DataFrame({'Models': models_list, 'Accuracy': accuracy_list}).set_index('Models')
models_dataframe.sort_values(by='Accuracy', ascending=False)

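The highest accuracies are achieved by the Decision Tree and Random Forest models. Note that these scores are computed on the training data, so they overstate how well the models generalise. We'll use Random Forest, as it is less prone to overfitting than a single decision tree.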

### Confusion matrix

In [88]:
# Final model: a random forest fitted on the full training set.
# random_state is fixed so the fitted forest, and therefore the submission, is
# reproducible across runs.
model = RandomForestClassifier(n_estimators = 100, random_state = 0)
model.fit(X_train,y_train)
# Predictions on the training data itself; they feed the confusion matrix.
y_pred_log = model.predict(X_train)
# Accuracy on the training data, not an estimate for unseen passengers.
accuracy = round(100*model.score(X_train,y_train),2)
print(str(model) + ' Training accuracy {}%'.format(accuracy))

In [89]:
# Confusion matrix of the final model on the training data.
# confusion_matrix orders classes by sorted label ([0, 1]) by default, which
# would put "Not Survived" first and contradict the labels below. Passing
# labels=[1, 0] makes row/column 0 "Survived" and row/column 1 "Not Survived".
conf_mat = confusion_matrix(y_train, y_pred_log, labels=[1, 0])
rows = ['Survived','Not Survived']
cols = ['Predicted Survived','Predicted Not Survived']
conf_mat_frame = pd.DataFrame(conf_mat,index = rows, columns = cols)

np.set_printoptions(precision = 2)

print('Confusion Matrix in numbers')
print(conf_mat)
print('')
print('')

print('Confusion Matrix in percents')
# Divide each row by its total: the fraction of each true class that received
# each prediction.
conf_mat_perc = conf_mat.astype(float)/conf_mat.sum(axis = 1)[:,np.newaxis]
print(conf_mat_perc)
print('')
print('')

conf_mat_perc_frame = pd.DataFrame(conf_mat_perc,index = rows, columns = cols)

# Create the figure with the requested size. The previous
# `plt.figure.figsize = (15,5)` only set an attribute on the function object
# and had no effect on the plot.
plt.figure(figsize = (15,5))

plt.subplot(121)
sns.heatmap(conf_mat_frame, annot = True, fmt='d')

plt.subplot(122)
sns.heatmap(conf_mat_perc_frame, annot = True)

#### Create submission file to Kaggle

In [90]:
# Preview the preprocessed test data (PassengerId is still present here).
test.head()

In [91]:
# Predict the outcome for every row of the test features with the final model.
y_pred = model.predict(X_test)

In [92]:
# Pair each test PassengerId with its predicted Survived value.
answer = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':y_pred})

In [95]:
# Preview the submission table.
answer.head()

In [93]:
# Write the submission file; index=False keeps the row index out of the CSV.
answer.to_csv('submission.csv',index = False)

In [94]:
# Load the gender_submission.csv file shipped with the input data and preview it.
# NOTE(review): presumably used to compare the expected submission format with
# submission.csv; confirm.
c = pd.read_csv('../input/titanic/gender_submission.csv')
c.head()