In [24]:
import pandas as pd
import numpy as np
import seaborn as sns

# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# for feature engineering
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler

# Importing and preprocessing of the data sets
(required for Feature Engineering)

In [25]:
df = pd.read_csv('../data/train.csv', index_col=0)

df_test = pd.read_csv('../data/test.csv', index_col=0)

## Cleaning training data

In [26]:
titles = df['Name'].str.extract(',\s(\w+)\.')
titles.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Capt          1
Don           1
Jonkheer      1
Lady          1
Mme           1
Ms            1
Sir           1
dtype: int64

Because there are a lot of different titles, I want to reduce them to five. This includes Mr, Miss, Mrs and Master, since there are already a lot observations. This means I have to match the other titles into the existing groups. I map them like this:
* `Mlle` and `Ms` to `Miss` 
* `Mme` to `Mrs`
* `Dr`, `Rev`, `Col`, `Major`, `Capt`, `Jonkheer`, `Lady` and `Sir` to `rest`

In [27]:
df['title'] = titles.replace({
     'Mlle': 'Miss', 
     'Ms': 'Miss', 
     'Mme': 'Mrs', 
     'Dr': 'rest', 
     'Don': 'rest',
     'Dona': 'rest', # found in test.csv
     'Rev': 'rest', 
     'Col': 'rest', 
     'Major': 'rest', 
     'Capt': 'rest', 
     'Jonkheer': 'rest', 
     'Lady': 'rest', 
     'Sir': 'rest'
})
df['title'].iloc[759] = 'rest'  # regex doenst find that title :(
df['title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Mr        517
Miss      185
Mrs       126
Master     40
rest       23
Name: title, dtype: int64

In [28]:
df['family_size'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


## Cleaning of test data

In [29]:
titles_test = df_test['Name'].str.extract(',\s(\w+)\.')
titles_test.value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Dr          1
Ms          1
dtype: int64

In [37]:
df_test['title'] = titles_test.replace({
     'Mlle': 'Miss', 
     'Ms': 'Miss', 
     'Mme': 'Mrs', 
     'Dr': 'rest', 
     'Don': 'rest',
     'Dona': 'rest', # found in test.csv
     'Rev': 'rest', 
     'Col': 'rest', 
     'Major': 'rest', 
     'Capt': 'rest', 
     'Jonkheer': 'rest', 
     'Lady': 'rest', 
     'Sir': 'rest'
})
df_test['title'].value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
rest        6
Name: title, dtype: int64

In [38]:
df_test['family_size'] = df_test['SibSp'] + df_test['Parch']
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


# Feature Engineering

In [39]:
X_train = df[['Pclass', 'Sex', 'Age', 'Fare', 'title', 'family_size']]
y_train = df['Survived']
X_test = df_test[['Pclass', 'Sex', 'Age', 'Fare', 'title', 'family_size']]

In [40]:
impute_and_bin_age = make_pipeline(
    SimpleImputer(strategy = 'mean'),
    KBinsDiscretizer(n_bins = 5, encode = 'onehot-dense', strategy = 'quantile')
)

impute_and_scale_fare = make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    MinMaxScaler()
)

In [41]:
feat_eng = ColumnTransformer(
    transformers = [
        ('age_transformation', impute_and_bin_age, ['Age']),
        ('familiy_scale', MinMaxScaler(), ['family_size']),
        ('hot_titles', OneHotEncoder(), ['title', 'Sex']),
        ('impute_fare', impute_and_scale_fare, ['Fare'])
    ],
    remainder = 'passthrough'
)

In [42]:
print(X_train.isna().any())
print(X_test.isna().any())

Pclass         False
Sex            False
Age             True
Fare           False
title          False
family_size    False
dtype: bool
Pclass         False
Sex            False
Age             True
Fare            True
title          False
family_size    False
dtype: bool


In [44]:
feat_eng.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('age_transformation',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('kbinsdiscretizer',
                                                  KBinsDiscretizer(encode='onehot-dense'))]),
                                 ['Age']),
                                ('familiy_scale', MinMaxScaler(),
                                 ['family_size']),
                                ('hot_titles', OneHotEncoder(),
                                 ['title', 'Sex']),
                                ('impute_fare',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinM

In [45]:
X_train_trans = feat_eng.transform(X_train)

In [46]:
X_test_trans = feat_eng.transform(X_test)

In [47]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train_trans, y_train)

y_pred = model.predict(X_test_trans)

In [54]:
X_test['Survived'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Survived'] = y_pred


In [56]:
X_test['Survived'].to_csv('../data/my_predictions.csv')