In [1]:
# This is the sincere effort of Subhodeep, kindly don't copy.

In [2]:
# Data analysis
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt

# ML tools
from sklearn.ensemble import RandomForestClassifier

In [3]:
#using pandas
#training data
train_ds = pd.read_csv('train.csv')

#testing data
test_ds = pd.read_csv('test.csv')

In [4]:
#combine the above two pandas object into a list to do certain operations on both of them together.
combined = [train_ds, test_ds]

In [5]:
train_ds.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
#INFERENCE
# Survived, Sex, Embarked are categorical data.
# Pclass is categorical too but has 3 classes.
# Rest are continuous data.

# Ticket and Cabin is a mix of letters and numbers.
# All passengers have name.
# All the values of the column Sex are filled.
# Age contains missing values and are of type float64.

#In train dataset
# Cabin, Age, Embarked are empty in some cases.

In [8]:
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# In test dataset
# Cabin, Age are empty in some cases.
# we need to deal with the missing values.

In [10]:
train_ds.describe() #gives stats of dataset

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# Need to fill-in the missing values of Age, Embarked.

# PassengerId, Name is not related to survival, so we'll be dropping them.
# We're dropping cabin feature because it contains high amount of missing data (only 204 out of 891 filled in training dataset),
# Also it's not directly related to survival.

In [12]:
#We need to create features.

#People with same Parch and SibSp must be of same family, so creating a new feature called Family.
#Titles of people can be a new Feature.
#Age range can be mapped into category.
#Same as Age, Fare range can be created.

In [13]:
#Women & Children were more likely to survive.
#Upper-class members survived more.

In [14]:
#Now time to confirm some of our assumptions.
#We choose only the categorical data.
#We choose on the basis of Pclass who survived and who did not.

In [15]:
train_ds[['Pclass', 'Survived']].groupby(['Pclass']).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [16]:
#Our assumption was right, Pclass = 1 survived the most.

In [17]:
train_ds[['Sex','Survived']].groupby(['Sex']).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [18]:
#Female survived the most.

In [19]:
train_ds[['SibSp','Survived']].groupby(['SibSp']).mean().sort_values(by='Survived', ascending = False)

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
1,0.535885
2,0.464286
0,0.345395
3,0.25
4,0.166667
5,0.0
8,0.0


In [20]:
train_ds[['Parch','Survived']].groupby(['Parch']).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
3,0.6
1,0.550847
2,0.5
0,0.343658
5,0.2
4,0.0
6,0.0


In [21]:
# SibSp Id = 1 & Parch = 3 survived the most.

In [22]:
#Enough of all the assumptions, now we actually pre-process the data.

In [23]:
#An idea of how the dataset looks before we drop some features.
print('Shape of datasets:',train_ds.shape,test_ds.shape)

Shape of datasets: (891, 12) (418, 11)


In [24]:
#dropping columns(Ticket and Cabin) in training and testing data.
train_ds = train_ds.drop(['Ticket','Cabin'], axis='columns')
test_ds = test_ds.drop(['Ticket','Cabin'], axis = 'columns')
#here instead of axis='columns'(axis= 1) could be used.
# Axis=1 here means it will affect all the rows for the specified columns.
# Axis=0 means it will affect all the columns for the specified rows.

In [25]:
combined = [train_ds, test_ds]
#this combined is the new list containing the modified train_ds and test_ds.

In [26]:
#Lets verify this.
print('Shape of datasets:', train_ds.shape, test_ds.shape)

Shape of datasets: (891, 10) (418, 9)


In [27]:
# At this point we may want to add a new feature namely 'Title', but we might use that if we don't get good accuracy.

In [28]:
#We are dropping columns(Name and PassengerId) similarly as above.

In [29]:
train_ds = train_ds.drop(['Name','PassengerId'],axis=1)
test_ds = test_ds.drop(['Name','PassengerId'],axis=1)
#we combine them into 'combined'
combined = [train_ds, test_ds]

In [30]:
print('Shape of datasets:', train_ds.shape, test_ds.shape)

Shape of datasets: (891, 8) (418, 7)


In [31]:
#Next up we will be mapping the column 'Sex' into numerical categorical feature.
#We will do this in both datasets in one go.
for ds in combined:
    ds['Sex'] = ds['Sex'].map({'male':0, 'female':1}).astype(int)

In [32]:
train_ds.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [33]:
test_ds.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.5,0,0,7.8292,Q
1,3,1,47.0,1,0,7.0,S
2,2,0,62.0,0,0,9.6875,Q
3,3,0,27.0,0,0,8.6625,S
4,3,1,22.0,1,1,12.2875,S


In [34]:
#We complete the missing Embarked features with the maximum occuring data.

In [35]:
mode = (train_ds.Embarked.dropna().mode())[0] #mode returns an object and we choose the first element hence 0

In [36]:
for ds in combined:
    ds['Embarked'] = ds['Embarked'].fillna(mode)

In [37]:
train_ds.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [38]:
# TASK HERE TO MAP EMBARKED TO VALUES----->

In [39]:
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#using pandas it's simpler using pandas.get_dummies()

In [40]:
train_ds = pd.get_dummies(train_ds,columns=['Embarked'])
test_ds = pd.get_dummies(test_ds, columns=['Embarked'])
combined = [train_ds, test_ds]

In [41]:
combined[0].head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1


In [42]:
combined[1].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1
4,3,1,22.0,1,1,12.2875,0,0,1


In [43]:
#To solve the missing age values(works for to fill in the column with numerical continuous features)
# We have 3 approaches listed below.

In [44]:
#1 Generating mean of the existing data and filling.
#2 Try to find correlation b/w Age,Sex,Pclass. Then we guess Age value using median for various combinations of Sex & Pclass (total 6 for 3 Pclass(es) and 2 Sex(es)).
#3 Combination of #1 and #2 i.e. predict a number b/w mean & s.d., using combinations from Sex & Pclass.

In [45]:
# We will choose #1 because it is a simpler approach.

In [46]:
train_ds['Age'].fillna(train_ds['Age'].dropna().mean() ,inplace=True)
test_ds['Age'].fillna(test_ds['Age'].dropna().mean() ,inplace=True)
combined= [train_ds, test_ds]

In [47]:
train_ds.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1


In [48]:
test_ds.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1
4,3,1,22.0,1,1,12.2875,0,0,1


In [49]:
# FILLING MISSING FARE VALUES

In [50]:
train_ds.count()

Survived      891
Pclass        891
Sex           891
Age           891
SibSp         891
Parch         891
Fare          891
Embarked_C    891
Embarked_Q    891
Embarked_S    891
dtype: int64

In [51]:
test_ds.count()

Pclass        418
Sex           418
Age           418
SibSp         418
Parch         418
Fare          417
Embarked_C    418
Embarked_Q    418
Embarked_S    418
dtype: int64

In [52]:
#Fare in train_ds is complete, we just need to fill in the test_ds.

In [53]:
test_ds['Fare'].fillna(test_ds['Fare'].dropna().median() ,inplace= True)

In [54]:
test_ds.count()

Pclass        418
Sex           418
Age           418
SibSp         418
Parch         418
Fare          418
Embarked_C    418
Embarked_Q    418
Embarked_S    418
dtype: int64

In [55]:
test_ds.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
413,3,0,30.27259,0,0,8.05,0,0,1
414,1,1,39.0,0,0,108.9,1,0,0
415,3,0,38.5,0,0,7.25,0,0,1
416,3,0,30.27259,0,0,8.05,0,0,1
417,3,0,30.27259,1,1,22.3583,1,0,0


In [56]:
train_ds.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,0,0,1


In [57]:
x = train_ds.iloc[:,1:] #All categories except survived (Training Features)
y = train_ds.iloc[:,0] #The survived category (Training Labels)

In [58]:
x.shape

(891, 9)

In [59]:
y.shape

(891,)

In [60]:
from sklearn.cross_validation import train_test_split



In [61]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
#This makes sure there is 80% training data and 20% testing data from the total existing training data.

In [62]:
x_train.shape

(712, 9)

In [63]:
x_test.shape

(179, 9)

In [64]:
y_train.shape

(712,)

In [65]:
y_test.shape

(179,)

In [66]:
#Initial testing on a small level using RandomForestClassifier.
from sklearn.ensemble import RandomForestClassifier

In [67]:
rf = RandomForestClassifier(n_estimators=100)

In [68]:
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
rf.score(x_test, y_test)

0.8100558659217877

In [70]:
#Fairly a good start because the accuracy lies between somewhat 80~83%.

In [71]:
#Let's try with Decision Tree

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
dt = DecisionTreeClassifier(max_leaf_nodes = 50)
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=50,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [74]:
dt.score(x_test, y_test)

0.8324022346368715

In [75]:
#similar results as above(80~83%).

In [76]:
#Creating a Neural Network for the above problem

In [77]:
#x_train = np.array(x_train)

In [78]:
#y_train = np.array(y_train).T # For transposing

In [79]:
#Create a Neural Network model

In [80]:
from keras.models import Sequential

Using TensorFlow backend.


In [81]:
model = Sequential()

In [82]:
from keras.layers import Dense, Activation

In [83]:
x_train.shape[0] #This is the number of columns of our dataset.

712

In [84]:
x_train.shape

(712, 9)

In [85]:
#creating input layer and add only one hidden layer
model.add(Dense(x_train.shape[1], init='uniform', input_dim= x_train.shape[1], activation='relu'))
#input_dim is the number of features in input layer
#just after dense(x,...), x is the number of features in the first hidden layer

  


In [86]:
#adding second hidden layer
model.add(Dense(x_train.shape[1], init='uniform', input_dim=x_train.shape[1], activation='relu'))

  


In [87]:
#creating output layer
model.add(Dense(1, activation='sigmoid'))

In [88]:
# compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [89]:
#train model
model.fit(x_train, y_train, epochs=100, batch_size=8)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x202f2f37278>

In [94]:
#Accuracy of training data.
model.evaluate(x_train, y_train)[1]



0.8258426966292135

In [96]:
model.predict(x_test)

array([[0.068538  ],
       [0.11688984],
       [0.137901  ],
       [0.7781327 ],
       [0.5848113 ],
       [0.94274074],
       [0.5886358 ],
       [0.080379  ],
       [0.6315119 ],
       [0.95254594],
       [0.14725514],
       [0.03332769],
       [0.1538578 ],
       [0.12347234],
       [0.09071075],
       [0.94487214],
       [0.12494083],
       [0.58850485],
       [0.15074131],
       [0.1490915 ],
       [0.11013584],
       [0.18269481],
       [0.36547342],
       [0.14022537],
       [0.08662258],
       [0.12660632],
       [0.21933052],
       [0.12079932],
       [0.23431887],
       [0.39559612],
       [0.14563575],
       [0.4582373 ],
       [0.34583747],
       [0.39901072],
       [0.1548722 ],
       [0.09640799],
       [0.27281088],
       [0.5886358 ],
       [0.98491377],
       [0.07913068],
       [0.1807613 ],
       [0.04742509],
       [0.07929864],
       [0.08621623],
       [0.40790847],
       [0.22709478],
       [0.1461242 ],
       [0.118

In [95]:
#Accuracy of testing data.
model.evaluate(x_test, y_test)[1]



0.832402233637911

In [None]:
#A fair accuracy of about 83.24% with 2 Hidden layer NN.
# Accuracy can be further increased with normalisation.