 # *Feature Construction*

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [31]:
# Load only the target column and the four predictor columns used below.
data = pd.read_csv(
    'data/titanic/train.csv',
    usecols=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch'],
)
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [32]:
# Replace missing ages with the mean age. The result is assigned back to the
# column rather than calling fillna(inplace=True) on `data['Age']`: that form
# mutates an intermediate Series, which pandas warns about and which leaves
# `data` untouched once Copy-on-Write is active.
# NOTE(review): the mean is taken over every row, so the cross-validation
# folds scored later share imputation statistics with their held-out rows.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [33]:
# Baseline: score a default logistic regression on the columns as loaded.
y = data['Survived']
X = data.drop(columns='Survived')
lr = LogisticRegression()
# 10-fold cross-validated accuracy; the mean over folds is reported as a percentage.
cv_score = cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=10)
print('Accuracy without feature construction ', 100 * cv_score.mean())

Accuracy without feature construction  69.59176029962548


In [34]:
# Head-count per passenger: SibSp + Parch, plus one
# (presumably the passenger themself -- confirm against the data dictionary).
data['Family_No.'] = data['SibSp'].add(data['Parch']).add(1)
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_No.
0,0,3,22.0,1,0,2
1,1,1,38.0,1,0,2
2,1,3,26.0,0,0,1
3,1,1,35.0,1,0,2
4,0,3,35.0,0,0,1


In [35]:
def checksize(num):
    """Map a family head-count to an ordinal size bucket.

    Returns:
        2 when ``num`` is greater than 4,
        1 when ``num`` is greater than 1 and at most 4,
        0 when ``num`` equals 1,
        None for any other value (e.g. anything below 1, or NaN).
    """
    if num > 4:
        return 2
    if num > 1:
        return 1
    if num == 1:
        return 0
    # No bucket matched; make the implicit fall-through explicit.
    return None

In [36]:
# Bucket every head-count with checksize, one element at a time.
data['Family_Size'] = data['Family_No.'].map(checksize)
# Spot-check ten random rows (unseeded, so the rows differ on each run).
data.sample(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Family_No.,Family_Size
561,0,3,40.0,0,0,1,0
189,0,3,36.0,0,0,1,0
772,0,2,57.0,0,0,1,0
362,0,3,45.0,0,1,2,1
625,0,1,61.0,0,0,1,0
133,1,2,29.0,1,0,2,1
113,0,3,20.0,1,0,2,1
19,1,3,29.699118,0,0,1,0
871,1,1,47.0,1,1,3,1
319,1,1,40.0,1,1,3,1


In [37]:
# Keep Family_Size in place of the three columns it was derived from.
data = data.drop(columns=['SibSp', 'Parch', 'Family_No.'])
data.head()

Unnamed: 0,Survived,Pclass,Age,Family_Size
0,0,3,22.0,1
1,1,1,38.0,1
2,1,3,26.0,0
3,1,1,35.0,1
4,0,3,35.0,0


In [38]:
# Re-score the same default model on the frame that now carries Family_Size.
y = data['Survived']
X = data.drop(columns='Survived')
lr = LogisticRegression()
# 10-fold cross-validated accuracy; the mean over folds is reported as a percentage.
cv_score_fc = cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=10)
print('Accuracy with feature construction ', 100 * cv_score_fc.mean())

Accuracy with feature construction  70.04494382022472


In [40]:
# Print both cross-validated accuracies, baseline first, for comparison.
for label, scores in (
    ('Accuracy without feature construction ', cv_score),
    ('Accuracy with feature construction ', cv_score_fc),
):
    print(label, 100 * scores.mean())

## Improvement in accuracy by feature construction

Accuracy without feature construction  69.59176029962548
Accuracy with feature construction  70.04494382022472


# *Feature Splitting*

In [65]:
# Reload the training file with every column. This rebinds `data`, replacing
# the reduced frame that the feature-construction cells worked on.
data = pd.read_csv('data/titanic/train.csv')
# Preview the first rows of the reloaded frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
# Names are formatted as "<surname>, <title>. <given names>".
# Split once on the first comma, then once on the first period after it, so
# that later periods (e.g. a middle initial) stay inside the given names.
after_surname = data['Name'].str.split(',', n=1, expand=True)[1]
title_and_given = after_surname.str.split('.', n=1, expand=True)

# Strip the blank left behind by each delimiter; otherwise the values carry a
# leading space (" Mr" instead of "Mr") and so do any group labels built on them.
data['Salutation'] = title_and_given[0].str.strip()
# NOTE: this overwrites the original Name column, so the cell cannot be re-run
# without first reloading `data`.
data['Name'] = title_and_given[1].str.strip()
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Salutation
0,1,0,3,Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,William Henry,male,35.0,0,0,373450,8.05,,S,Mr


In [67]:
# Show the two columns produced by the split side by side.
data.loc[:, ['Salutation', 'Name']]

Unnamed: 0,Salutation,Name
0,Mr,Owen Harris
1,Mrs,John Bradley (Florence Briggs Thayer)
2,Miss,Laina
3,Mrs,Jacques Heath (Lily May Peel)
4,Mr,William Henry
...,...,...
886,Rev,Juozas
887,Miss,Margaret Edith
888,Miss,"Catherine Helen ""Carrie"""
889,Mr,Karl Howell


In [53]:
# Mean of Survived per salutation, highest first.
# Select the column before aggregating: averaging the whole frame first would
# also try to average the string columns (Name, Sex, Ticket, ...), which newer
# pandas versions reject with a TypeError, and does needless work in any case.
data.groupby('Salutation')['Survived'].mean().sort_values(ascending=False)

Salutation
 the Countess    1.000000
 Mlle            1.000000
 Sir             1.000000
 Ms              1.000000
 Lady            1.000000
 Mme             1.000000
 Mrs             0.792000
 Miss            0.697802
 Master          0.575000
 Col             0.500000
 Major           0.500000
 Dr              0.428571
 Mr              0.156673
 Jonkheer        0.000000
 Rev             0.000000
 Don             0.000000
 Capt            0.000000
Name: Survived, dtype: float64