In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [14]:
df = pd.read_csv('train.csv',usecols=['Age','Pclass','SibSp','Parch','Survived'])
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0
...,...,...,...,...,...
886,0,2,27.0,0,0
887,1,1,19.0,0,0
888,0,3,,1,2
889,1,1,26.0,0,0


In [15]:
df.dropna(inplace=True)
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0
...,...,...,...,...,...
885,0,3,39.0,0,5
886,0,2,27.0,0,0
887,1,1,19.0,0,0
889,1,1,26.0,0,0


In [16]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [17]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,22.0,1,0
1,1,38.0,1,0
2,3,26.0,0,0
3,1,35.0,1,0
4,3,35.0,0,0
...,...,...,...,...
885,3,39.0,0,5
886,2,27.0,0,0
887,1,19.0,0,0
889,1,26.0,0,0


In [18]:
y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64

# Features Construction

### before feature construction checking validation

In [19]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

0.6933333333333332

In [20]:
X['Family_size'] = X['SibSp'] + X['Parch']
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,1
1,1,38.0,1,0,1
2,3,26.0,0,0,0
3,1,35.0,1,0,1
4,3,35.0,0,0,0
...,...,...,...,...,...
885,3,39.0,0,5,5
886,2,27.0,0,0,0
887,1,19.0,0,0,0
889,1,26.0,0,0,0


In [21]:
def myfunc(num):
    if num == 1:
        #alone
        return 0
    elif 1 < num <= 4:
        # small family
        return 1
    else:
        # large family
        return 2

In [22]:
X['Family_type'] = X['Family_size'].apply(myfunc)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,1,0
1,1,38.0,1,0,1,0
2,3,26.0,0,0,0,2
3,1,35.0,1,0,1,0
4,3,35.0,0,0,0,2
...,...,...,...,...,...,...
885,3,39.0,0,5,5,2
886,2,27.0,0,0,0,2
887,1,19.0,0,0,0,2
889,1,26.0,0,0,0,2


In [23]:
X.drop(columns=['SibSp','Parch','Family_size'],inplace=True)
X

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,0
1,1,38.0,0
2,3,26.0,2
3,1,35.0,0
4,3,35.0,2
...,...,...,...
885,3,39.0,2
886,2,27.0,2
887,1,19.0,2
889,1,26.0,2


### After feature construction checking validation

In [24]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

0.7173015873015872

# Features Splitting

In [26]:
df = pd.read_csv('train.csv')
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [28]:
df['Name_Title'] = df['Name'].str.split(', ',expand=True)[1].str.split('.',expand=True)[0]
df[['Name','Name_Title']]

Unnamed: 0,Name,Name_Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr
...,...,...
886,"Montvila, Rev. Juozas",Rev
887,"Graham, Miss. Margaret Edith",Miss
888,"Johnston, Miss. Catherine Helen ""Carrie""",Miss
889,"Behr, Mr. Karl Howell",Mr


In [29]:
df[['Name','Name_Title']].isnull().mean()

Name          0.0
Name_Title    0.0
dtype: float64

In [30]:
df['Is_Married'] = 0
df['Is_Married'].loc[df['Name_Title'] == 'Mrs'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Is_Married'].loc[df['Name_Title'] == 'Mrs'] = 1


In [32]:
df['Name_Title'].value_counts()

Name_Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64