<a href="https://colab.research.google.com/github/haziranz/classification-DM-/blob/main/classification_based_on_simple_rule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Standard Routines <a class="anchor" id="standard"></a>

For the sake of readability, we summarize all imports below.

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from fancyimpute import KNN
from fancyimpute import IterativeImputer
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings('ignore')



We start with loading the train and test sets and building the joint train+test set.

In [None]:
# Load data: read both splits with the first CSV row as the header
train, test = (pd.read_csv(csv_path, header=0)
               for csv_path in ('/content/train.csv', '/content/test.csv'))

# Merge train and test sets.
# An all-NaN 'Survived' column is inserted into test at column position 1,
# then the two frames are stacked. Each split keeps its own row labels in the
# joint frame (no re-indexing is requested from concat).
test.insert(loc=1, column='Survived', value=np.nan)
all = pd.concat([train, test])

In [None]:
# Perform corrections
# Hand-corrected (SibSp, Parch) values, keyed by PassengerId.
corr_dict = {248: pd.Series([0,1], index=['SibSp', 'Parch'],),
             313: pd.Series([1,0], index=['SibSp', 'Parch'],),
             418: pd.Series([0,0], index=['SibSp', 'Parch'],),
             756: pd.Series([0,1], index=['SibSp', 'Parch'],),
             1041: pd.Series([1,0], index=['SibSp', 'Parch'],),
             1130: pd.Series([0,0], index=['SibSp', 'Parch'],),
             1170: pd.Series([2,0], index=['SibSp', 'Parch'],),
             1254: pd.Series([1,0], index=['SibSp', 'Parch'],),
             1274: pd.Series([1,0], index=['SibSp', 'Parch'],),
             539: pd.Series([1,0], index=['SibSp', 'Parch'],)
             }

# Overwrite SibSp/Parch only for the listed passengers. The update is driven
# by corr_dict itself, so the set of corrected ids cannot drift away from the
# dictionary keys, and no row-wise apply over the whole frame is needed.
for passenger_id, corrected in corr_dict.items():
    is_passenger = all['PassengerId'] == passenger_id
    all.loc[is_passenger, 'SibSp'] = corrected['SibSp']
    all.loc[is_passenger, 'Parch'] = corrected['Parch']

# Feature Engineering <a class="anchor" id="feature"></a>

For the models based on simple rules, only a few features are necessary. However, for the derivation of the models, a larger set of features is considered.

## Title, Family Size, Group Size, is Alone, has Cabin <a class="anchor" id="feature-1"></a>

First, we create the 'Title' feature by extracting the title from each passenger's name.

In [None]:
# Add Title: the run of letters that follows a space and is terminated by a
# period in the name. A raw string is used so that `\.` reaches the regex
# engine without being treated as an (invalid) Python string escape.
all['Title'] = all['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Replace rare titles: collapse each group of rare titles into a common one
title_groups = {
    'Miss': ['Ms','Mlle'],
    'Mrs': ['Mme','Lady','Dona','Countess'],
    'Mr': ['Col','Major','Sir','Rev','Capt','Don','Jonkheer'],
}
for common_title, rare_titles in title_groups.items():
    all.loc[all['Title'].isin(rare_titles), 'Title'] = common_title

# 'Dr' is resolved by sex: male -> 'Mr', female -> 'Mrs'
is_dr = all['Title'] == 'Dr'
all.loc[is_dr & (all['Sex'] == 'male'), 'Title'] = 'Mr'
all.loc[is_dr & (all['Sex'] == 'female'), 'Title'] = 'Mrs'

We add the feature representing the size of the family. The 'isAlone' feature identifies whether the person is traveling alone.

In [None]:
# Add Family Size and is-Alone
# FamSize counts the passenger plus siblings/spouses (SibSp) and
# parents/children (Parch); computed column-wise instead of row by row.
all['FamSize'] = 1 + all['SibSp'] + all['Parch']
# isAlone is 1 exactly when the family consists of the passenger only.
all['isAlone'] = (all['FamSize'] == 1).astype(int)

Next, we add a feature representing the number of people sharing the same ticket, and a flag indicating whether cabin information is available.

In [None]:
# Add Group Size: number of rows in the joint set sharing the same ticket
ticket_counts = all['Ticket'].value_counts()
all['GrSize'] = all['Ticket'].map(ticket_counts)

# Add has-Cabin
# Missing cabins are marked with 'U'. The filled column is assigned back
# explicitly: calling fillna(inplace=True) on `all['Cabin']` is a chained
# in-place operation that does not update `all` under pandas Copy-on-Write.
all['Cabin'] = all['Cabin'].fillna('U')
all['hasCabin'] = (all['Cabin'] != 'U').astype(int)

## with Siblings, with Spouse, with Children, with Parents <a class="anchor" id="feature-2"></a>

In [None]:
# Add Family Name: the part of the name before the first comma
all['Fname'] =  all['Name'].str.extract('^(.+?),', expand=False)

# Search for passengers with siblings
# Candidates have SibSp > 0 and Parch == 0 and are examined per family name.
Pas_wSib = []
all_x_0 = all[(all['SibSp'] > 0) & (all['Parch'] == 0)]
name_counts_SibSp = all_x_0['Fname'].value_counts()
for label, value in name_counts_SibSp.items():
    entries = all_x_0.loc[all_x_0['Fname'] == label]
    titles = entries['Title']
    n_entries = len(entries)
    if n_entries > 1 and not (titles == 'Mrs').any():
        # several passengers with this name and no 'Mrs' among them: take all
        chosen = entries
    elif n_entries == 1 and titles.values[0] == 'Mrs':
        # a single 'Mrs' with this name: take her
        chosen = entries
    else:
        # otherwise only the 'Miss'-es and those with a ticket of their own
        chosen = entries[(titles == 'Miss') | (entries['GrSize'] == 1)]
    Pas_wSib.extend(chosen['PassengerId'].values.tolist())

# Search for Mrs-es with parents
# Only family names that occur exactly once among passengers with Parch > 0
# are considered; the passenger must be a 'Mrs' aged 30 or younger.
Mrs_wPar = []
all_x_y = all[all['Parch'] > 0]
name_counts_Parch = all_x_y['Fname'].value_counts()
for label, value in name_counts_Parch.items():
    entries = all_x_y.loc[all_x_y['Fname'] == label]
    if len(entries) != 1:
        continue
    if entries['Title'].values[0] == 'Mrs' and entries['Age'].values[0] <= 30:
        Mrs_wPar.extend(entries['PassengerId'].values.tolist())

def get_features(row):
    """Return the family-relation flags of one passenger row.

    The result is a Series indexed by 'wSib', 'wSp', 'wCh', 'wPar' (with
    siblings / spouse / children / parents) in which exactly one entry is 1.
    The module-level lists ``Pas_wSib`` and ``Mrs_wPar`` are consulted by
    PassengerId.
    """
    features = pd.Series(0, index = ['wSib','wSp','wCh','wPar'])
    passenger_id = row['PassengerId']

    if passenger_id in Pas_wSib:
        flag = 'wSib'
    elif row['SibSp'] != 0 and row['Parch'] == 0:
        flag = 'wSp'
    else:
        # A 'Mrs' counts as travelling with children unless she is listed
        # in Mrs_wPar.
        mrs_with_children = row['Title'] == 'Mrs' and passenger_id not in Mrs_wPar
        # A 'Mr' counts as travelling with children when he reaches a
        # class-dependent minimum age; PassengerId 680 is explicitly excluded.
        old_enough = (
            (row['Pclass'] == 1 and row['Age'] >= 30) or
            (row['Pclass'] == 2 and row['Age'] >= 25) or
            (row['Pclass'] == 3 and row['Age'] >= 20)
        )
        mr_with_children = row['Title'] == 'Mr' and passenger_id != 680 and old_enough
        flag = 'wCh' if (mrs_with_children or mr_with_children) else 'wPar'

    features[flag] = 1
    return features

# Passengers travelling alone get all four flags set to 0. The fallback is a
# Series with the same labels that get_features returns, so every row yields
# the same kind of object and the shape of the apply result does not depend
# on which kind of row happens to come first.
no_relatives = pd.Series(0, index=['wSib','wSp','wCh','wPar'])
all[['wSib','wSp','wCh','wPar']] = all.apply(
    lambda s: get_features(s) if s['isAlone'] == 0 else no_relatives, axis = 1)

In [None]:
# Remove the raw columns that are not needed after feature engineering
raw_columns = ['Fname','Name','Cabin','Ticket','Fare','SibSp','Parch']
all = all.drop(columns=raw_columns)

# Exploratory Data Analysis <a class="anchor" id="EDA"></a>

## Class 1 <a class="anchor" id="EDA-1"></a>

In [None]:
# Class 1: labelled count, group size and survival rate per title and relation flags
(
    all[all['Pclass'] == 1]
    .groupby(['Title','isAlone','wSib','wSp','wCh','wPar'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,size,mean
Title,isAlone,wSib,wSp,wCh,wPar,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Master,0,0,0,0,1,3,5,1.0
Miss,0,0,0,0,1,16,19,0.9375
Miss,0,1,0,0,0,5,5,1.0
Miss,1,0,0,0,0,27,38,0.962963
Mr,0,0,0,0,1,7,8,0.428571
Mr,0,0,0,1,0,13,21,0.230769
Mr,0,0,1,0,0,24,36,0.458333
Mr,0,1,0,0,0,0,1,
Mr,1,0,0,0,0,75,108,0.333333
Mrs,0,0,0,0,1,0,3,


In [None]:
# Class 1, 'Mr' only: same statistics, split by cabin availability
(
    all[(all['Pclass'] == 1) & (all['Title'] == 'Mr')]
    .groupby(['hasCabin','isAlone','wSib','wSp','wCh','wPar'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,size,mean
hasCabin,isAlone,wSib,wSp,wCh,wPar,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,1,0,1,
0,0,0,0,1,0,1,2,0.0
0,0,0,1,0,0,3,4,0.333333
0,1,0,0,0,0,23,37,0.217391
1,0,0,0,0,1,7,7,0.428571
1,0,0,0,1,0,12,19,0.25
1,0,0,1,0,0,21,32,0.47619
1,0,1,0,0,0,0,1,
1,1,0,0,0,0,52,71,0.384615


## Class 2 <a class="anchor" id="EDA-2"></a>

In [None]:
# Class 2: labelled count, group size and survival rate per title and relation flags
(
    all[all['Pclass'] == 2]
    .groupby(['Title','isAlone','wSib','wSp','wCh','wPar'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,size,mean
Title,isAlone,wSib,wSp,wCh,wPar,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Master,0,0,0,0,1,9,11,1.0
Miss,0,0,0,0,1,12,20,1.0
Miss,0,1,0,0,0,1,3,1.0
Miss,1,0,0,0,0,22,28,0.909091
Mr,0,0,0,0,1,2,2,0.0
Mr,0,0,0,1,0,8,12,0.0
Mr,0,0,1,0,0,11,19,0.090909
Mr,0,1,0,0,0,6,11,0.0
Mr,1,0,0,0,0,72,116,0.097222
Mrs,0,0,0,0,1,2,2,1.0


## Class 3 <a class="anchor" id="EDA-3"></a>

In [None]:
# Class 3: labelled count, group size and survival rate per title and relation flags
(
    all[all['Pclass'] == 3]
    .groupby(['Title','isAlone','wSib','wSp','wCh','wPar'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,size,mean
Title,isAlone,wSib,wSp,wCh,wPar,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Master,0,0,0,0,1,27,43,0.37037
Master,0,1,0,0,0,1,1,1.0
Master,1,0,0,0,0,0,1,
Miss,0,0,0,0,1,35,45,0.342857
Miss,0,1,0,0,0,12,16,0.5
Miss,1,0,0,0,0,55,90,0.6
Mr,0,0,0,0,1,13,15,0.0
Mr,0,0,0,1,0,11,16,0.090909
Mr,0,0,1,0,0,12,19,0.0
Mr,0,1,0,0,0,20,28,0.15


In [None]:
# Class 3 without 'Mr': survival statistics per title and family size
(
    all[(all['Pclass'] == 3) & (all['Title'] != 'Mr')]
    .groupby(['Title','FamSize'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,size,mean
Title,FamSize,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Master,1,0,1,
Master,2,3,5,1.0
Master,3,6,14,1.0
Master,4,1,2,1.0
Master,5,2,3,0.0
Master,6,9,10,0.0
Master,7,3,5,0.333333
Master,8,3,3,0.0
Master,11,1,2,0.0
Miss,1,55,90,0.6


In [None]:
# Make FamSize bins: bin 0 covers family sizes 1-4, bin 1 covers sizes 5-11;
# the raw FamSize column is dropped afterwards.
famsize_edges = [0,4,11]
all['FamSizeBin'] = pd.cut(all['FamSize'], bins = famsize_edges, labels = False)
all = all.drop(columns=['FamSize'])

In [None]:
# Class 3 without 'Mr': survival statistics per title, family-size bin and relation flags
(
    all[(all['Pclass'] == 3) & (all['Title'] != 'Mr')]
    .groupby(['Title','FamSizeBin','isAlone','wSib','wSp','wCh','wPar'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,count,size,mean
Title,FamSizeBin,isAlone,wSib,wSp,wCh,wPar,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Master,0,0,0,0,0,1,9,20,1.0
Master,0,0,1,0,0,0,1,1,1.0
Master,0,1,0,0,0,0,0,1,
Master,1,0,0,0,0,1,18,23,0.055556
Miss,0,0,0,0,0,1,16,23,0.625
Miss,0,0,1,0,0,0,12,16,0.5
Miss,0,1,0,0,0,0,55,90,0.6
Miss,1,0,0,0,0,1,19,22,0.105263
Mrs,0,0,0,0,1,0,16,26,0.5625
Mrs,0,0,0,1,0,0,12,18,0.583333


## Models Based on Simple Rules  <a class="anchor" id="models"></a>

## Model 1 <a class="anchor" id="model-1"></a>

<b>Model 1.</b>
<cite> All adult males are deemed to perish as well as the ones in class 3 with families greater than 4. The rest all survive. </cite>

In [None]:
def get_survived_1(row):
    """Model 1: predict survival (1) or death (0) for one passenger row.

    In classes 1 and 2 only passengers titled 'Mr' perish. In any other class
    a passenger perishes when titled 'Mr' or when 'FamSizeBin' equals 1.
    """
    if row['Pclass'] in [1,2]:
        return 0 if row['Title'] == 'Mr' else 1

    perishes = row['Title'] == 'Mr' or row['FamSizeBin'] == 1
    return 0 if perishes else 1

Let us apply our model to the train and test sets and see the accuracy.

In [None]:
# Form train and test sets
# The first len(train) rows of the joint frame are the train rows, the
# remaining rows are the test rows (train was concatenated first).
n_train = len(train)
X_train = all.iloc[:n_train, :]
X_test = all.iloc[n_train:, :]
y_train = X_train['Survived']

# Make predictions (train)
y_train_hat = X_train.apply(get_survived_1, axis = 1)

# Make predictions (test)
# The predicted values are attached by position, so the submission does not
# depend on the row labels of X_test matching those of `test`.
predictions = pd.DataFrame( {'PassengerId': test['PassengerId'],
                             'Survived': X_test.apply(get_survived_1, axis = 1).to_numpy()} )
predictions.to_csv('submission-1.csv', index=False)

# Train score (accuracy_score expects the true labels first, then the predictions)
score = metrics.accuracy_score(y_train, y_train_hat)
print('Train Accuracy: {}'.format(score))

Train Accuracy: 0.835016835016835


## Model 2 <a class="anchor" id="model-2"></a>

In [None]:
# Class 3 without 'Mr', small families (FamSizeBin 0): survival statistics per title and port
(
    all[(all['Pclass'] == 3) & (all['Title'] != 'Mr') & (all['FamSizeBin'] == 0)]
    .groupby(['Title','Embarked'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,size,mean
Title,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Master,C,4,8,1.0
Master,S,6,14,1.0
Miss,C,14,15,0.642857
Miss,Q,30,53,0.766667
Miss,S,39,61,0.435897
Mrs,C,9,16,0.666667
Mrs,Q,2,2,0.5
Mrs,S,23,37,0.565217


<b>Model 2.</b>
<cite> All adult males are deemed to perish as well as the ones in class 3 with families greater than 4. Also, Misses in class 3 embarked in S perish. The rest all survive. </cite>

In [None]:
def get_survived_2(row):
    """Model 2: predict survival (1) or death (0) for one passenger row.

    In classes 1 and 2 only passengers titled 'Mr' perish. In any other class
    a passenger perishes when titled 'Mr', when 'FamSizeBin' equals 1, or when
    titled 'Miss' and embarked in 'S'.
    """
    if row['Pclass'] in [1,2]:
        return 0 if row['Title'] == 'Mr' else 1

    perishes = (
        row['Title'] == 'Mr'
        or row['FamSizeBin'] == 1
        or (row['Title'] == 'Miss' and row['Embarked'] == 'S')
    )
    return 0 if perishes else 1

In [None]:
# Make predictions (train)
y_train_hat = X_train.apply(get_survived_2, axis = 1)

# Make predictions (test)
# Values are written by position so that the result does not depend on the
# row labels of X_test matching those of `predictions`.
predictions['Survived'] = X_test.apply(get_survived_2, axis = 1).to_numpy()
predictions.to_csv('submission-2.csv', index=False)

# Train score (accuracy_score expects the true labels first, then the predictions)
score = metrics.accuracy_score(y_train, y_train_hat)
print('Train Accuracy: {}'.format(score))

Train Accuracy: 0.8406285072951739


## Model 3 <a class="anchor" id="model-3"></a>

In [None]:
# Class 3 'Miss', small families (FamSizeBin 0): survival statistics per wPar flag and port
(
    all[(all['Pclass'] == 3) & (all['Title'] == 'Miss') & (all['FamSizeBin'] == 0)]
    .groupby(['Title','wPar','Embarked'])['Survived']
    .agg(['count','size','mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,size,mean
Title,wPar,Embarked,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Miss,0,C,6,6,0.5
Miss,0,Q,29,52,0.793103
Miss,0,S,32,48,0.40625
Miss,1,C,8,9,0.75
Miss,1,Q,1,1,0.0
Miss,1,S,7,13,0.571429


<b>Model 3.</b>
<cite> All adult males are deemed to perish as well as the ones in class 3 with families greater than 4. Also, Misses in class 3 who are non-children (not traveling with parents) and embarked in S perish. The rest all survive. </cite>

In [None]:
def get_survived_3(row):
    """Model 3: predict survival (1) or death (0) for one passenger row.

    In classes 1 and 2 only passengers titled 'Mr' perish. In any other class
    a passenger perishes when titled 'Mr', when 'FamSizeBin' equals 1, or when
    titled 'Miss', embarked in 'S' and with 'wPar' equal to 0.
    """
    if row['Pclass'] in [1,2]:
        return 0 if row['Title'] == 'Mr' else 1

    perishes = (
        row['Title'] == 'Mr'
        or row['FamSizeBin'] == 1
        or (row['Title'] == 'Miss' and row['Embarked'] == 'S' and row['wPar'] == 0)
    )
    return 0 if perishes else 1

In [None]:
# Make predictions (train)
y_train_hat = X_train.apply(get_survived_3, axis = 1)

# Make predictions (test)
# Values are written by position so that the result does not depend on the
# row labels of X_test matching those of `predictions`.
predictions['Survived'] = X_test.apply(get_survived_3, axis = 1).to_numpy()
predictions.to_csv('submission-3.csv', index=False)

# Train score (accuracy_score expects the true labels first, then the predictions)
score = metrics.accuracy_score(y_train, y_train_hat)
print('Train Accuracy: {}'.format(score))

Train Accuracy: 0.8417508417508418


## Data Preparation <a class="anchor" id="SVM-1"></a>

In [None]:
# Select and convert categorical features into numerical ones (1)
all['Sex'] = all['Sex'].map( {'male': 0, 'female': 1} ).astype(int)

# Fill missing ports with the most frequent one. The result is assigned back
# explicitly: fillna(inplace=True) on `all['Embarked']` is a chained in-place
# operation that does not update `all` under pandas Copy-on-Write.
most_frequent_port = all['Embarked'].value_counts().index[0]
all['Embarked'] = all['Embarked'].fillna(most_frequent_port)

# One-hot encode the categorical columns (first level of each is dropped),
# then remove the identifier and the target from the feature matrix.
all_dummies =  pd.get_dummies(all, columns = ['Title','Pclass','Embarked'],
                                 prefix=['Title','Pclass','Embarked'], drop_first = True)
all_dummies = all_dummies.drop(['PassengerId','Survived'], axis = 1)

We impute the missing ages with KNN imputation.

In [None]:
# KNN imputation (k = 3) over the whole feature matrix; the completed values
# are cast to int before being wrapped back into a labelled frame.
knn_imputer = KNN(k=3, verbose = False)
imputed_values = knn_imputer.fit_transform(all_dummies).astype(int)
all_dummies_i = pd.DataFrame(data=imputed_values,
                             columns=all_dummies.columns,
                             index=all_dummies.index)

In [None]:
# Convert categorical features into numerical ones (2)
# isAlwSib is 1 when the passenger is alone or travels with siblings;
# computed column-wise instead of with a row-by-row apply.
all_dummies_i['isAlwSib'] = ((all_dummies_i['isAlone'] == 1) | (all_dummies_i['wSib'] == 1)).astype(int)
all_dummies_i = all_dummies_i.drop(['isAlone','wSib','Sex','GrSize'], axis = 1)

First, we re-build the train and test sets.

In [None]:
# Form train and test sets
# Explicit copies are taken because the 'Age' column of both frames is
# overwritten during scaling; writing into plain slices of all_dummies_i
# would be an assignment on a copy of a slice.
X_train = all_dummies_i.iloc[:891,:].copy()
X_test = all_dummies_i.iloc[891:,:].copy()

Next, we fit the scaling on the train set and apply it to both the train and test sets.

In [None]:
# Perform scaling: standardise 'Age' with statistics fitted on the train set only
scaler = StandardScaler()
scaler.fit(X_train[['Age']])
for features in (X_train, X_test):
    features['Age'] = scaler.transform(features[['Age']])

## Training <a class="anchor" id="SVM-2"></a>

Now, we are ready to apply SVM. First, let us define the cross-validation strategy. We use stratified 5-fold cross-validation (i.e. 80/20 percent train/validation splits), repeated 10 times.

In [None]:
# Cross-validation parameters: stratified 5-fold splitting, repeated 10 times
# with a fixed random state
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=10,
    random_state=1,
)

To find the parameter of SVM (regularization factor 'C'), we use an exhaustive grid search.

In [None]:
# Grid search parameters: integer values of C from 10 to 20, gamma fixed to 'auto'
svm_grid = {'C': list(range(10, 21)), 'gamma': ['auto']}
svm_search = GridSearchCV(
    estimator = SVC(),
    param_grid = svm_grid,
    cv = cv,
    refit=True,
    n_jobs=1,
)

In [None]:
# Apply grid search and keep the estimator refitted with the best parameters
svm_search.fit(X_train, train['Survived'])
svm_best = svm_search.best_estimator_

# Report mean and standard deviation of the test score of the best candidate
best_std = svm_search.cv_results_['std_test_score'][svm_search.best_index_]
print("Cross-validation accuracy: {}, standard deviation: {}, with parameters {}"
       .format(svm_search.best_score_, best_std, svm_search.best_params_))

Cross-validation accuracy: 0.8289824869750801, standard deviation: 0.027046317052630685, with parameters {'C': 10, 'gamma': 'auto'}


Finally, we make predictions with the best estimator.

In [None]:
# Train accuracy of the best SVM (accuracy_score expects the true labels
# first, then the predictions)
y_train_hat = svm_best.predict(X_train)
print('Train Accuracy: {}'
        .format(metrics.accuracy_score(y_train, y_train_hat)))

# Predict the test set and write the submission file
predictions['Survived'] = svm_best.predict(X_test)
predictions.to_csv('submission-svm.csv', index=False)

Train Accuracy: 0.8428731762065096


## Relationship to Model 3 <a class="anchor" id="SVM-3"></a>

<b>SVM rule.</b>
<cite> All adult males are deemed to perish as well as the ones in class 3 with families greater than 4. Also, 18 and older Misses in class 3 embarked in S perish. The rest all survive. </cite>

In [None]:
def get_survived_svm_rule(row):
    """SVM rule: predict survival (1) or death (0) for one passenger row.

    In classes 1 and 2 only passengers titled 'Mr' perish. In any other class
    a passenger perishes when titled 'Mr', when 'FamSizeBin' equals 1, or when
    titled 'Miss', embarked in 'S' and aged 18 or older.
    """
    if row['Pclass'] in [1,2]:
        return 0 if row['Title'] == 'Mr' else 1

    perishes = (
        row['Title'] == 'Mr'
        or row['FamSizeBin'] == 1
        or (row['Title'] == 'Miss' and row['Embarked'] == 'S' and row['Age'] >= 18)
    )
    return 0 if perishes else 1