In [1]:
# This is the sincere effort of Subhodeep, kindly don't copy.

In [2]:
# Data analysis
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt

# ML tools
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the training and the test split from CSV into pandas DataFrames
# (training data first, then testing data).
train_ds, test_ds = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Keep both DataFrames in one list so the same operation can be applied to each of them.
# The list only stores references to the current frames: whenever train_ds or test_ds
# is rebound to a new frame, this list has to be rebuilt to see the change.
combined = [train_ds, test_ds]

In [5]:
# Preview the first five rows of the training data.
train_ds.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Column dtypes and non-null counts of the training data (shows which columns have gaps).
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
#INFERENCE
# Survived, Sex, Embarked are categorical data.
# Pclass is categorical too (ordinal, with 3 classes).
# Age and Fare are continuous; SibSp and Parch are integer counts.

# Ticket and Cabin are a mix of letters and numbers.
# All passengers have a name.
# All the values of the column Sex are filled.
# Age contains missing values and is of type float64.

#In train dataset
# Cabin, Age, Embarked have missing values.

In [8]:
# Column dtypes and non-null counts of the test data (shows which columns have gaps).
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# In test dataset
# Cabin, Age and Fare (1 row) have missing values.
# We need to deal with the missing values.

In [10]:
train_ds.describe() # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# Need to fill in the missing values of Age and Embarked.

# PassengerId and Name are not related to survival, so we'll be dropping them.
# We're dropping the Cabin feature because most of its values are missing (only 204 out of 891 are filled in the training dataset);
# it's also not directly related to survival.

In [12]:
#We need to create features.

#SibSp (siblings/spouses aboard) and Parch (parents/children aboard) both count relatives travelling with a passenger, so they can be combined into a new feature called Family.
#Titles of people (e.g. Mr., Mrs., Miss. in Name) can be a new feature.
#Age ranges can be mapped into categories.
#As with Age, Fare ranges can be created.

In [13]:
#Women & Children were more likely to survive.
#Upper-class members survived more.

In [14]:
#Now time to confirm some of our assumptions.
#We look only at the categorical data.
#First we compare, for each Pclass, how many survived and how many did not.

In [15]:
# Mean of Survived (i.e. survival rate) per passenger class, highest first.
(
    train_ds
    .groupby(['Pclass'])[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [16]:
#Our assumption was right, Pclass = 1 survived the most.

In [17]:
# Mean of Survived (i.e. survival rate) per sex, highest first.
(
    train_ds
    .groupby(['Sex'])[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [18]:
#Female survived the most.

In [19]:
# Mean of Survived (i.e. survival rate) per SibSp value, highest first.
(
    train_ds
    .groupby(['SibSp'])[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
1,0.535885
2,0.464286
0,0.345395
3,0.25
4,0.166667
5,0.0
8,0.0


In [20]:
# Mean of Survived (i.e. survival rate) per Parch value, highest first.
(
    train_ds
    .groupby(['Parch'])[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
3,0.6
1,0.550847
2,0.5
0,0.343658
5,0.2
4,0.0
6,0.0


In [21]:
# Passengers with SibSp = 1 and passengers with Parch = 3 had the highest survival rates.

In [22]:
#Enough of all the assumptions, now we actually pre-process the data.

In [23]:
# Record the (rows, columns) shape of both datasets before any feature is dropped.
print('Shape of datasets:', *(frame.shape for frame in (train_ds, test_ds)))

Shape of datasets:  (891, 12) (418, 11)


In [24]:
# Drop the Ticket and Cabin columns from the training and the testing data.
# errors='ignore' keeps this cell re-runnable: train_ds/test_ds are rebound to the
# reduced frames, so on a second run the columns are already gone and the default
# errors='raise' would stop the cell with a KeyError.
train_ds = train_ds.drop(['Ticket','Cabin'], axis='columns', errors='ignore')
test_ds = test_ds.drop(['Ticket','Cabin'], axis='columns', errors='ignore')
# axis='columns' can also be written as axis=1: the labels are looked up among the
# column names, so the named columns are removed and every row is kept.
# axis=0 (axis='index') would look the labels up in the row index and remove rows instead.

In [25]:
# Rebuild the list: dropping Ticket and Cabin rebound train_ds and test_ds to new frames,
# so a 'combined' built earlier would still refer to the frames that contain those columns.
combined = [train_ds, test_ds]

In [26]:
# Verify the drop: each dataset should now have two columns fewer.
shapes = tuple(frame.shape for frame in (train_ds, test_ds))
print('Shape of datasets:', *shapes)

Shape of datasets: (891, 10) (418, 9)


In [27]:
# At this point we may want to add a new feature namely 'Title', but we might use that if we don't get good accuracy.

In [28]:
#We are dropping columns(Name and PassengerId) similarly as above.

In [29]:
# Drop the Name and PassengerId columns from both datasets.
# errors='ignore' keeps this cell re-runnable: after the first run the columns are
# already gone and the default errors='raise' would stop the cell with a KeyError.
# NOTE(review): test_ds loses PassengerId here; if a later cell needs it (e.g. to label
# predictions), it has to be read from test.csv again — confirm.
train_ds = train_ds.drop(['Name','PassengerId'], axis=1, errors='ignore')
test_ds = test_ds.drop(['Name','PassengerId'], axis=1, errors='ignore')
# Rebuild 'combined' so it refers to the reduced frames rather than the previous ones.
combined = [train_ds, test_ds]

In [30]:
# Shapes after Name and PassengerId have been removed as well.
print('Shape of datasets:', *[frame.shape for frame in (train_ds, test_ds)])

Shape of datasets: (891, 8) (418, 7)
