## 1.Data Cleaning



In [2]:
import re
import numpy as np
import pandas as pd

In [3]:
URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# Loading data from specific url
df = pd.read_csv(URL)

In [4]:
# Uncovering missing data
df.replace('?', np.nan, inplace=True)
df['age'] = df['age'].astype('float')
df['fare'] = df['fare'].astype('float')

In [5]:
# helper function 1
def get_first_cabin(row):
    try:
        return row.split()[0]
    except:
        return np.nan
# Keep only one cabin
df['cabin'] = df['cabin'].apply(get_first_cabin)

In [6]:
# helper function 2
def get_title(passenger):
    line = passenger
    if re.search('Mrs', line):
        return 'Mrs'
    elif re.search('Mr', line):
        return 'Mr'
    elif re.search('Miss', line):
        return 'Miss'
    elif re.search('Master', line):
        return 'Master'
    else:
        return 'Other'
# Extract the title from 'name'
df['title'] = df['name'].apply(get_title)

In [7]:
# Droping irrelevant columns
DROP_COLS = ['boat','body','home.dest','ticket','name']
df.drop(DROP_COLS, axis=1, inplace=True)

In [8]:
target = 'survived'
num_vars = [col for col in df.columns if df[col].dtype != object and col != target]
cat_vars = [col for col in df.columns if df[col].dtype == object and col != target]

## **2. Data Segregation API**

In here, the (reproducible) train/test split occurs and both data sets are stored. The reason to persist these data is the following: in the future, the <ins>Model Evaluation Service</ins> will send a request (to the <ins>Data Segregation API</ins>) to get the test set and apply the appropriate metrics for each mode in the <ins>Model Candidate Data Store</ins>.

In [9]:
from sklearn.model_selection import train_test_split

SEED_SPLIT = 404

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(target, axis=1), df[target], test_size=0.2,
    random_state=SEED_SPLIT)

TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE  = 'test.csv'

X_train.to_csv(TRAIN_DATA_FILE, index=False)
X_test.to_csv(TEST_DATA_FILE, index=False)

X_train.shape, X_test.shape


((1047, 9), (262, 9))

In [10]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,title
1162,3,male,,0,0,7.7500,,Q,Mr
899,3,female,27.0,0,2,11.1333,,S,Mrs
1006,3,female,,0,0,7.8792,,Q,Miss
228,1,male,18.0,1,0,108.9000,C65,C,Mr
573,2,female,27.0,0,0,10.5000,E101,S,Miss
...,...,...,...,...,...,...,...,...,...
71,1,male,27.0,1,0,136.7792,C89,C,Mr
609,3,male,26.0,0,0,8.0500,,S,Mr
625,3,female,17.0,4,2,7.9250,,S,Miss
1012,3,female,,0,0,7.7500,,Q,Miss


## **3. Feature engineering & Training pipeline**

- In this part, we divide the features into two groups: features without persisting information and features with persisting information. The difference is that in the latter, we learn certain values from the train set in order to be applied to the test set.
- In this case, we decided not to store the individual features into a <ins>Feature Data Store</ins> due to the simple nature of our example.

### **3.1. Without persisting information**

#### **Numerical variables**

- Create missing value indicator: only for numeric variables

In [11]:
def missing_indicator(data, col_name):
    data[col_name+'_nan'] = data[col_name].isnull().astype(int)


In [12]:
for var in num_vars:
    missing_indicator(X_train, var)
    missing_indicator(X_test, var)

In [13]:
X_train.cabin.value_counts()

F      7
C23    5
F33    4
F4     4
C78    4
      ..
A19    1
A36    1
B86    1
B80    1
B22    1
Name: cabin, Length: 155, dtype: int64

#### **Categorical variables**

We are only interested in the letter of the cabin, let's get it

In [14]:
def extract_letter_from_cabin(x):
    if type(x)==str:    
        return ''.join(re.findall("[a-zA-Z]+", x))  
    return x
        
X_train['cabin'] = X_train['cabin'].apply(extract_letter_from_cabin)
X_test['cabin'] = X_test['cabin'].apply(extract_letter_from_cabin)


In [15]:
X_train['cabin'].unique(), X_test['cabin'].unique()

(array([nan, 'C', 'E', 'B', 'F', 'D', 'A', 'G'], dtype=object),
 array(['D', nan, 'A', 'B', 'C', 'E', 'G', 'T', 'F'], dtype=object))

**Remark:** Notice that we have a cabin in the test set that was not observed in the train set.

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List

class CategoricalImputerEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, variables: List[str]):   
        self.variables = variables
    
    def fit(self, X: pd.DataFrame):
        return self
    
    def transform(self, X: pd.DataFrame):
        X[self.variables] = X[self.variables].fillna('missing')
        return X


In [17]:
categ_imputer = CategoricalImputerEncoder(variables=cat_vars)
X_train = categ_imputer.transform(X_train)
X_test = categ_imputer.transform(X_test)

In [18]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,title,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan
1162,3,male,,0,0,7.75,missing,Q,Mr,0,1,0,0,0
899,3,female,27.0,0,2,11.1333,missing,S,Mrs,0,0,0,0,0
1006,3,female,,0,0,7.8792,missing,Q,Miss,0,1,0,0,0
228,1,male,18.0,1,0,108.9,C,C,Mr,0,0,0,0,0
573,2,female,27.0,0,0,10.5,E,S,Miss,0,0,0,0,0


### **3.2. With persisting information**

#### **3.2.1 Categorical variables**

- Remove rare labels
- One hot encoding
- Fix one-hot-encoded features not in test set

##### Rare Label Encoding

In [19]:
X_train.cabin.value_counts()

missing    810
C           74
B           53
E           36
D           35
F           20
A           16
G            3
Name: cabin, dtype: int64

In [20]:
class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, tol=0.02, variables: List[str] = None):
        self.tol =tol
        self.variables = variables
    
    def fit(self, X: pd.DataFrame):
        self.valid_labels_dict = {}
        for var in self.variables:
            t = X[var].value_counts() / X.shape[0]
            self.valid_labels_dict[var] = t[t>self.tol].index.tolist()

    def transform(self, X:pd.DataFrame):
        for var in self.variables:
            tmp = [col for col in X[var].unique() if col not in self.valid_labels_dict[var]]
            X[var] = X[var].replace(to_replace=tmp, value=len(tmp) * ['Rare'])
        return X

In [21]:
rare_labels = RareLabelCategoricalEncoder(tol=0.02, variables=cat_vars)
rare_labels.fit(X_train)
X_train = rare_labels.transform(X_train)
X_test  = rare_labels.transform(X_test)

In [22]:
X_train.cabin.value_counts()

missing    810
C           74
B           53
Rare        39
E           36
D           35
Name: cabin, dtype: int64

##### One Hot Encoding

In [23]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', drop='first')
enc.fit(X_train[cat_vars])
X_train[enc.get_feature_names_out(cat_vars)] = enc.transform(X_train[cat_vars]).toarray()
X_test[enc.get_feature_names_out(cat_vars)] = enc.transform(X_test[cat_vars]).toarray()

X_train.drop(cat_vars, axis=1, inplace=True)
X_test.drop(cat_vars, axis=1, inplace=True)

In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 1162 to 1206
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pclass         1047 non-null   int64  
 1   age            827 non-null    float64
 2   sibsp          1047 non-null   int64  
 3   parch          1047 non-null   int64  
 4   fare           1046 non-null   float64
 5   pclass_nan     1047 non-null   int32  
 6   age_nan        1047 non-null   int32  
 7   sibsp_nan      1047 non-null   int32  
 8   parch_nan      1047 non-null   int32  
 9   fare_nan       1047 non-null   int32  
 10  sex_male       1047 non-null   float64
 11  cabin_C        1047 non-null   float64
 12  cabin_D        1047 non-null   float64
 13  cabin_E        1047 non-null   float64
 14  cabin_Rare     1047 non-null   float64
 15  cabin_missing  1047 non-null   float64
 16  embarked_Q     1047 non-null   float64
 17  embarked_Rare  1047 non-null   float64
 18  embar

#### 3.2.2 Numerical Variables


##### Imputation
- Fill NaN with median

In [87]:
from sklearn.impute import SimpleImputer

imp_median = SimpleImputer(strategy='median')
imp_median.fit(X_train[num_vars])

SimpleImputer(strategy='median')

In [88]:
imp_median.statistics_

array([ 3.    , 28.    ,  0.    ,  0.    , 14.4542])

In [89]:
X_train[num_vars] = imp_median.transform(X_train[num_vars])
X_test[num_vars]  = imp_median.transform(X_test[num_vars])

In [90]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 1162 to 1206
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pclass         1047 non-null   float64
 1   age            1047 non-null   float64
 2   sibsp          1047 non-null   float64
 3   parch          1047 non-null   float64
 4   fare           1047 non-null   float64
 5   pclass_nan     1047 non-null   int32  
 6   age_nan        1047 non-null   int32  
 7   sibsp_nan      1047 non-null   int32  
 8   parch_nan      1047 non-null   int32  
 9   fare_nan       1047 non-null   int32  
 10  sex_male       1047 non-null   float64
 11  cabin_C        1047 non-null   float64
 12  cabin_D        1047 non-null   float64
 13  cabin_E        1047 non-null   float64
 14  cabin_Rare     1047 non-null   float64
 15  cabin_missing  1047 non-null   float64
 16  embarked_Q     1047 non-null   float64
 17  embarked_Rare  1047 non-null   float64
 18  embar

#####**Normalization with MinMaxScaler**

In [91]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

## **4. Training a model**

### Training

In [92]:
from sklearn.linear_model import LogisticRegression

SEED_MODEL = 404

model = LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)
model.fit(X_train_scaled, y_train)

LogisticRegression(C=0.0005, class_weight='balanced', random_state=404)

**Remark:** After training, the model should be stored in the <ins>Model Candidate Data Store</ins>

### Evaluation

In [93]:
# This part simulates the call of the Model Evaluation Service to the 
# Data Segregation API to obtain the test set and evaluate the model

from sklearn.metrics import accuracy_score, roc_auc_score

for s,t in zip(['train','test'],[(X_train_scaled, y_train),(X_test_scaled,y_test)]):
    x,y = t[0], t[1]
    class_pred = model.predict(x)
    proba_pred = model.predict_proba(x)[:,1]
    print('{} roc-auc : {}'.format(s, roc_auc_score(y, proba_pred)))
    print('{} accuracy: {}'.format(s, accuracy_score(y, class_pred)))
    print()

train roc-auc : 0.8503119550474714
train accuracy: 0.7879656160458453

test roc-auc : 0.8134273676497528
test accuracy: 0.7709923664122137



**Remark:** These metrics (joint with the initial configuration, learnt parameters, metadata of the training set, training times, etc) should be stored in a repository

### Predictions

Now, we simulate the <ins>Scoring Service</ins> which generates the predictions and store them in the <ins>Score Data Store</ins> and send them to the customer.

In [94]:
tmp = pd.DataFrame(X_test_scaled, columns=X_train.columns)
tmp['y_true'] = np.array(y_test)
tmp['y_pred'] = model.predict(X_test)
tmp['proba_pred'] = model.predict_proba(X_test)[:,1]

tmp.head(10)



Unnamed: 0,pclass,age,sibsp,parch,fare,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan,...,embarked_Q,embarked_Rare,embarked_S,title_Miss,title_Mr,title_Mrs,title_Other,y_true,y_pred,proba_pred
0,0.0,0.724426,0.0,0.222222,0.221098,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0.641503
1,0.5,0.386221,0.125,0.111111,0.051237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.496349
2,1.0,0.223382,0.0,0.0,0.015379,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0.507254
3,0.5,0.423799,0.125,0.0,0.040989,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.486633
4,0.5,0.48643,0.0,0.0,0.050749,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.492885
5,1.0,0.298538,0.0,0.0,0.01394,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.460236
6,0.5,0.160751,0.0,0.111111,0.038061,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,1,0.532912
7,0.0,0.611691,0.125,0.0,0.111118,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0.558536
8,0.0,0.398747,0.0,0.0,0.148911,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.63345
9,0.0,0.26096,0.25,0.222222,0.512122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.848197
