#### Data Dictionary

* Survival	0 = No, 1 = Yes
* pclass =>	Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
* sex = Sex
* Age => Age in years
* sibsp => # of siblings / spouses aboard the Titanic
* parch => # of parents / children aboard the Titanic
* ticket => Ticket number	
* fare => Passenger fare	
* cabin => Cabin number
* embarked => Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)



#### Variable Notes
#### pclass: A proxy for socio-economic status (SES)
* 1st = Upper
* 2nd = Middle
* 3rd = Lower



#### age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5



#### sibsp: The dataset defines family relations in this way...
* Sibling = brother, sister, stepbrother, stepsister
* Spouse = husband, wife (mistresses and fiancés were ignored)



#### parch: The dataset defines family relations in this way...
* Parent = mother, father
* Child = daughter, son, stepdaughter, stepson
* Some children travelled only with a nanny, therefore parch=0 for them.




In [1]:
# read csv files

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# -- Feature Engineering --

In [2]:
df = pd.read_csv('train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
surv_col = df.iloc[:,1]
surv_col.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [5]:
train_df = df.iloc[:,2:]
train_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
pessengerId = test_df.iloc[:,0]
pessengerId.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [7]:
test_df.drop(['PassengerId'],axis=1,inplace=True)

test_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
train_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
concated_df = pd.concat([train_df,test_df])

concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
concated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [11]:
from sklearn import preprocessing as prep

# Label Encoding for Sex Column

In [12]:
le = prep.LabelEncoder()

concated_df.Sex =le.fit_transform(concated_df.Sex)

df.Sex[0:10]

0      male
1    female
2    female
3    female
4      male
5      male
6      male
7      male
8    female
9    female
Name: Sex, dtype: object

In [13]:
concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [14]:
concated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null int32
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int32(1), int64(3), object(4)
memory usage: 107.4+ KB


### On the some columns, there are some missing values. Firstly I need to fill that columns

In [15]:
embarked = concated_df['Embarked'].fillna('0')

embarked.unique()

array(['S', 'C', 'Q', '0'], dtype=object)

# Label Encoding for Embarked Column

In [16]:
concated_df.Embarked = le.fit_transform(embarked)

concated_df.Embarked.unique()

array([3, 1, 2, 0], dtype=int64)

In [17]:
concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,3
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,3
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,3
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,3


In [18]:
concated_df.tail()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,3,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.05,,3
414,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9,C105,1
415,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.25,,3
416,3,"Ware, Mr. Frederick",1,,0,0,359309,8.05,,3
417,3,"Peter, Master. Michael J",1,,1,1,2668,22.3583,,1


In [19]:
concated_df.dtypes

Pclass        int64
Name         object
Sex           int32
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked      int32
dtype: object

In [20]:
print( 'Pclass:' ,concated_df.Pclass.unique())
print( 'Sex:' ,concated_df.Sex.unique())
print( 'SibSp:' ,concated_df.SibSp.unique())
print( 'Parch:' ,concated_df.Parch.unique())
print( 'Embarked:' ,concated_df.Embarked.unique())

Pclass: [3 1 2]
Sex: [1 0]
SibSp: [1 0 3 4 2 5 8]
Parch: [0 1 2 5 3 4 6 9]
Embarked: [3 1 2 0]


### I will remove Cabin columns

In [21]:
concated_df.drop(['Cabin'],axis=1,inplace=True)

concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3


In [22]:
NameSplit = concated_df.Name.str.split('[,.]')

NameSplit.head()

0                          [Braund,  Mr,  Owen Harris]
1    [Cumings,  Mrs,  John Bradley (Florence Briggs...
2                           [Heikkinen,  Miss,  Laina]
3     [Futrelle,  Mrs,  Jacques Heath (Lily May Peel)]
4                         [Allen,  Mr,  William Henry]
Name: Name, dtype: object

In [23]:
titles = [str.strip(name[1]) for name in NameSplit.values]
titles[:10]

['Mr', 'Mrs', 'Miss', 'Mrs', 'Mr', 'Mr', 'Mr', 'Master', 'Mrs', 'Mrs']

In [24]:
# new feature

concated_df['Title'] = titles

concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,Mrs
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3,Mrs
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3,Mr


In [25]:
concated_df.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [26]:
# useless words: I will combine Mademoiselle and Madame into a single type

concated_df.Title.values[concated_df.Title.isin(['Mme', 'Mmle'])] = 'Mmle'

In [27]:
# keep reducing

concated_df.Title.values[concated_df.Title.isin(['Capt', 'Don', 'Major', 'Sir'])] = 'Sir'
concated_df.Title.values[concated_df.Title.isin(['Dona', 'Lady', 'the Countess', 'Jonkheer'])] = 'Lady'

In [28]:
concated_df.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Sir', 'Rev', 'Dr', 'Mmle', 'Ms',
       'Lady', 'Mlle', 'Col'], dtype=object)

In [29]:
# label encode new feature too

concated_df.Title = le.fit_transform(concated_df.Title)
concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3,7
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,8
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3,4
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3,8
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3,7


In [30]:
# new feature is family size
# number of spouses and siblings and oneself is family size

concated_df['FamilySize'] = concated_df.SibSp.values + concated_df.Parch.values + 1

In [31]:
concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,FamilySize
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3,7,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,8,2
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3,4,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3,8,2
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3,7,1


In [32]:
surnames = [str.strip(name[0]) for name in NameSplit]
surnames[:10]

['Braund',
 'Cumings',
 'Heikkinen',
 'Futrelle',
 'Allen',
 'Moran',
 'McCarthy',
 'Palsson',
 'Johnson',
 'Nasser']

In [33]:
concated_df['Surname'] = surnames
concated_df['FamilyID'] = concated_df.Surname.str.cat(concated_df.FamilySize.astype(str),sep='')
concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,FamilySize,Surname,FamilyID
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3,7,2,Braund,Braund2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,8,2,Cumings,Cumings2
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3,4,1,Heikkinen,Heikkinen1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3,8,2,Futrelle,Futrelle2
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3,7,1,Allen,Allen1


In [34]:
# I will mark if any family id as small if family size is less than or equal to 2

concated_df.FamilyID.values[concated_df.FamilySize.values <= 2] = 'Small'

concated_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,FamilySize,Surname,FamilyID
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3,7,2,Braund,Small
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,8,2,Cumings,Small
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3,4,1,Heikkinen,Small
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3,8,2,Futrelle,Small
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3,7,1,Allen,Small


In [35]:
# check up the frequency of family ids
concated_df.FamilyID.value_counts()

Small                1025
Sage11                 11
Andersson7              9
Goodwin8                8
Asplund7                7
Skoog6                  6
Rice6                   6
Panula6                 6
Fortune6                6
Palsson5                5
Lefebre5                5
Ryerson5                5
Ford5                   5
Davies3                 5
Johnston4               4
Becker4                 4
Laroche4                4
West4                   4
Herman4                 4
Dean4                   4
Allison4                4
Baclini4                4
Carter4                 4
Brown3                  4
Caldwell3               3
Danbom3                 3
Peter3                  3
McCoy3                  3
Rosblom3                3
Klasen3                 3
                     ... 
Lahtinen3               2
Hocking4                2
Frolicher-Stehli3       2
Kink3                   2
Kink-Heilmann3          2
Hamalainen3             2
Gustafsson3             2
Appleton3   

## There are too many family ids with few family members. maybe some families had different last names. I'll clean this.

In [36]:
freq = list(dict(zip(concated_df.FamilyID.value_counts().index.tolist(), concated_df.FamilyID.value_counts().values)).items())

type(freq)

list

In [37]:
freq = np.array(freq)

freq[:10]

array([['Small', '1025'],
       ['Sage11', '11'],
       ['Andersson7', '9'],
       ['Goodwin8', '8'],
       ['Asplund7', '7'],
       ['Skoog6', '6'],
       ['Rice6', '6'],
       ['Panula6', '6'],
       ['Fortune6', '6'],
       ['Palsson5', '5']], dtype='<U17')

In [38]:
freq.shape

(97, 2)

In [39]:
# select the family ids with frequency of 2 or less
freq[freq[:,1].astype(int) <= 2].shape

(36, 2)

In [40]:
freq = freq[freq[:,1].astype(int) <= 2]

In [41]:
# I'll assign 'Small' for those
concated_df.FamilyID.values[concated_df.FamilyID.isin(freq[:,0])] = 'Small'
concated_df.FamilyID.value_counts()

Small            1074
Sage11             11
Andersson7          9
Goodwin8            8
Asplund7            7
Rice6               6
Skoog6              6
Fortune6            6
Panula6             6
Lefebre5            5
Palsson5            5
Ryerson5            5
Ford5               5
Davies3             5
Brown3              4
Becker4             4
Baclini4            4
Dean4               4
Laroche4            4
West4               4
Allison4            4
Johnston4           4
Herman4             4
Carter4             4
Sandstrom3          3
Quick3              3
Compton3            3
Samaan3             3
Thayer3             3
Peacock3            3
                 ... 
McCoy3              3
Klasen3             3
Dodge3              3
Caldwell3           3
Boulos3             3
Wick3               3
Bourke3             3
Elias3              3
Crosby3             3
van Billiard3       3
Hart3               3
Taussig3            3
Abbott3             3
Touma3              3
Coutts3   

In [42]:
# label encoding for family id

concated_df.FamilyID = le.fit_transform(concated_df.FamilyID)
concated_df.FamilyID.unique()

array([50, 38, 28, 48,  2, 43,  3, 22, 31, 47, 39, 57, 24, 49, 35,  9, 21,
       17, 27, 41,  6, 37, 60, 46, 23, 30, 32,  5,  7, 11, 44, 54, 52,  0,
        1, 34, 45, 25, 58, 51, 13, 20, 59, 36, 10, 19, 55, 15, 18,  4, 42,
       14, 16, 53, 26,  8, 56, 29, 33, 12, 40], dtype=int64)

In [43]:
# I will choose usefull features
concated_reduce = concated_df[[
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Title', 'Embarked', 'FamilySize',
    'FamilyID']]

concated_reduce.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID
0,3,1,22.0,1,0,7.25,7,3,2,50
1,1,0,38.0,1,0,71.2833,8,1,2,50
2,3,0,26.0,0,0,7.925,4,3,1,50
3,1,0,35.0,1,0,53.1,8,3,2,50
4,3,1,35.0,0,0,8.05,7,3,1,50


In [44]:
concated_reduce.Age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  , 22.5 ,
       18.5 , 67.  , 76.  , 26.5 , 60.5 , 11.5 ,  0.33,  0.17, 38.5 ])

In [45]:
concated_reduce.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass        1309 non-null int64
Sex           1309 non-null int32
Age           1046 non-null float64
SibSp         1309 non-null int64
Parch         1309 non-null int64
Fare          1308 non-null float64
Title         1309 non-null int32
Embarked      1309 non-null int32
FamilySize    1309 non-null int64
FamilyID      1309 non-null int32
dtypes: float64(2), int32(4), int64(4)
memory usage: 92.0 KB


## There are missing values on Age Column. Therefore I will fill taking Median

In [46]:
concated_reduce['Age'].fillna(concated_reduce['Age'].median(), inplace=True)
concated_reduce['Fare'].fillna(concated_reduce['Fare'].median(), inplace=True)

In [47]:
concated_reduce.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass        1309 non-null int64
Sex           1309 non-null int32
Age           1309 non-null float64
SibSp         1309 non-null int64
Parch         1309 non-null int64
Fare          1309 non-null float64
Title         1309 non-null int32
Embarked      1309 non-null int32
FamilySize    1309 non-null int64
FamilyID      1309 non-null int32
dtypes: float64(2), int32(4), int64(4)
memory usage: 92.0 KB


# So, That Dataset (concated_reduce) is ready for spliting as Train and Test values.

In [48]:
train_final = concated_reduce.iloc[:891].copy()
test_final = concated_reduce.iloc[891:].copy()

In [49]:
train_final.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID
0,3,1,22.0,1,0,7.25,7,3,2,50
1,1,0,38.0,1,0,71.2833,8,1,2,50
2,3,0,26.0,0,0,7.925,4,3,1,50
3,1,0,35.0,1,0,53.1,8,3,2,50
4,3,1,35.0,0,0,8.05,7,3,1,50


In [50]:
test_final.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Embarked,FamilySize,FamilyID
0,3,1,34.5,0,0,7.8292,7,2,1,50
1,3,0,47.0,1,0,7.0,8,3,2,50
2,2,1,62.0,0,0,9.6875,7,2,1,50
3,3,1,27.0,0,0,8.6625,7,3,1,50
4,3,0,22.0,1,1,12.2875,8,3,3,50


## At the outset, I splited some columns on the dataset. Now I will use that columns for creating Train dataset

In [51]:
X = train_final.values

X

array([[ 3.,  1., 22., ...,  3.,  2., 50.],
       [ 1.,  0., 38., ...,  1.,  2., 50.],
       [ 3.,  0., 26., ...,  3.,  1., 50.],
       ...,
       [ 3.,  0., 28., ...,  3.,  4., 29.],
       [ 1.,  1., 26., ...,  1.,  1., 50.],
       [ 3.,  1., 32., ...,  2.,  1., 50.]])

In [52]:
y = surv_col.values

y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [53]:
X.shape

(891, 10)

In [54]:
y.shape

(891,)

In [55]:
test_data = test_final.values

test_data

array([[ 3. ,  1. , 34.5, ...,  2. ,  1. , 50. ],
       [ 3. ,  0. , 47. , ...,  3. ,  2. , 50. ],
       [ 2. ,  1. , 62. , ...,  2. ,  1. , 50. ],
       ...,
       [ 3. ,  1. , 38.5, ...,  3. ,  1. , 50. ],
       [ 3. ,  1. , 28. , ...,  3. ,  1. , 50. ],
       [ 3. ,  1. , 28. , ...,  1. ,  3. , 41. ]])

# Creating Neural Network with Keras

In [56]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.


In [57]:
model = Sequential()

model.add(Dense(32, init = 'uniform', activation='relu', input_dim = 10))
model.add(Dense(64, init = 'uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, init = 'uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(12, init = 'uniform', activation='relu'))
model.add(Dense(1, init = 'uniform', activation='sigmoid'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [58]:
model.fit(X,y, epochs=500, batch_size = 64, verbose = 1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 1

Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 

Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 

Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 

Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 

Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.History at 0x17a990513c8>

In [59]:
pred = model.predict(test_data)

In [60]:
pred

array([[1.11169495e-01],
       [1.21524632e-01],
       [3.66419740e-02],
       [9.47236493e-02],
       [9.35877085e-01],
       [1.20214939e-01],
       [6.39111698e-01],
       [1.28485104e-02],
       [9.92011130e-01],
       [1.10142365e-01],
       [1.06151208e-01],
       [3.53682101e-01],
       [1.00000000e+00],
       [3.13808210e-02],
       [9.99999523e-01],
       [9.97650206e-01],
       [1.19112805e-01],
       [2.40320548e-01],
       [1.97116047e-01],
       [5.73110245e-02],
       [4.29393977e-01],
       [5.04933596e-01],
       [9.99948025e-01],
       [9.27370846e-01],
       [9.99999881e-01],
       [5.65859750e-02],
       [1.00000000e+00],
       [1.85420528e-01],
       [4.94262874e-01],
       [4.02560690e-03],
       [3.37668844e-02],
       [3.48023488e-04],
       [3.45645621e-02],
       [4.48268056e-02],
       [5.72877944e-01],
       [3.79957587e-01],
       [3.66254687e-01],
       [5.07590294e-01],
       [9.34331790e-02],
       [5.16609907e-01],


In [61]:
# convert to integer
outputBin = np.zeros(0)
for i in pred:
    
    if i <= .5:
        
        outputBin = np.append(outputBin, 0)
    else:
        
        outputBin = np.append(outputBin, 1)
output = np.array(outputBin).astype(int)

In [62]:
output

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [63]:
d = {'PassengerId':pessengerId, 'Survived':output}

In [64]:
final_df = pd.DataFrame(data=d)

In [65]:
final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [66]:
final = final_df.to_csv('new_result.csv',index=False) #convert to csv file

final

# Random Forest Classifier

In [67]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=350, max_depth=15, random_state=42)

print("train accuracy: {} ".format(rf.fit(X, y).score(X, y)))


train accuracy: 0.9741863075196409 


In [68]:
rf_pred = rf.predict(test_data)

In [69]:
rf_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [70]:
r = {'PassengerId':pessengerId, 'Survived':rf_pred}

In [71]:
final_rf = pd.DataFrame(data=r)

In [72]:
final_rf.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [73]:
final_rf = final_df.to_csv('random_forest_result.csv',index=False) #convert to csv file

final_rf