In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("Shape of training sample: ",train_df.shape)
print("Shape of testing sample: ",test_df.shape)

Shape of training sample:  (891, 12)
Shape of testing sample:  (418, 11)


# Exploration

In [None]:
train_df.columns

In [None]:
seaborn.pairplot(train_df)

In [None]:
train_df.head()

In [None]:
# Any nulls?
train_df.isna().sum(), test_df.isna().sum()

In [None]:
# Ticket / Name / PassengerId seems to be ver sparse - how many uniques we have?
print(train_df['Ticket'].nunique(), ' out of ', train_df.shape[0])
print(train_df['PassengerId'].nunique(), ' out of ', train_df.shape[0])
print(train_df['Name'].nunique(), ' out of ', train_df.shape[0])
print(train_df['Cabin'].nunique(), ' out of ', train_df.shape[0])

In [None]:
# Let's have a look at the other categorical features
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']

for c in categorical_columns:
    print("Feature: ", c)
    print("Train: ")
    print(train_df[c].value_counts())
    if c is not 'Survived':
        print("Test: ")
        print(test_df[c].value_counts())

In [None]:
# Let's look at age, the text on Kaggle is a bit ambiguous about what age<1 means
train_df['Age'].plot(kind='hist')

In [None]:
# It looks okay, just a few babies it seems to me
train_df[train_df['Age']<=1]['Age'].plot(kind='hist')

In [None]:
# The text on Kaggle also mentions estimated ages have xx.5, how many are those?
print(train_df[(train_df['Age']-np.floor(train_df['Age']))==0.5].shape[0], ' out of ', train_df.shape[0])

# Naive first approach, RF CV with very simple features

In [None]:
# Dropping columns that are very sparse or have a lot of nulls
X = train_df.copy().drop('Survived',axis=1)
y = np.array(train_df['Survived']).ravel()
X_test = test_df.copy()

cat_columns = ['Embarked','Pclass','Sex']
cont_columns = ['Age','SibSp','Parch','Fare']
drop_columns = [x for x in X.columns if x not in (cat_columns + cont_columns)]
print('Dropping: ',drop_columns)

In [None]:
X.columns

In [None]:
X.drop(drop_columns,axis=1,inplace=True)
X_test.drop(drop_columns,axis=1,inplace=True)

In [None]:
# Imputing missing age with median, missing 'Embarked' with mode
X = X.fillna({'Age' : X['Age'].median(), 'Embarked' : X['Embarked'].mode()[0]})
X_test = X_test.fillna({'Age' : X['Age'].median(), 'Fare' : X['Fare'].median()})

In [None]:
# OHE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
[('bla',OneHotEncoder(categories='auto'),cat_columns)],remainder='passthrough')

X_ohe = ct.fit_transform(X)
X_test_ohe = ct.transform(X_test)

In [None]:
# Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# param grid
param_grid = {
    'n_estimators': [2,4,8,16,32,64,128],
    'max_depth' : [200,100,50,20],
    'criterion' : ['entropy'],
    'min_samples_split' : [0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.5,1.0,num=5+1)
}

rf = RandomForestClassifier()

gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1,verbose=10)

result = gscv.fit(X=X_ohe,y=y)
print(result.best_params_)
print(result.best_score_)

In [None]:
result.best_score_

In [None]:
best_rf_model = result.best_estimator_
best = best_rf_model.fit(X_ohe,y)
predictions = best.predict(X_test_ohe)

In [None]:
# First set of predictions - not very good - 0.76555 on public leaderboard

In [None]:
X_ohe.shape

In [None]:
# as you can see, mostof the featuers are ignored, we have to do some more feature engineering
best_rf_model.feature_importances_

In [None]:
# but it seems some of the other features are relevant but they are not included in the feature importances...
train_df.groupby('Embarked').Survived.mean()

In [None]:
## Other ideas to try
# Xgboost or something similar
# Interactions
# Extract more information from features not used now or with 0 importance
# Different imputations for age, Can we infer age from name?
# Kid with parent?
# Lived in cabin? (cabin not none)
# cabin location?

# More features & smarter imputation

### Missing values

In [3]:
# For missing values we consider both train and test to not bias
df_all = pd.concat([train_df,test_df])

In [4]:
# Missing values - Age
print(df_all.corrwith(df_all['Age']))

# Age is very correlated with pclass, so imputing with class averages
avg_age = df_all.groupby('Pclass')['Age'].median()
print('Average age per Pclass: ')
print(avg_age)

df_all['Age'] = np.where( (df_all['Age'].isnull()) & (df_all['Pclass']==1), avg_age[1],
                         np.where((df_all['Age'].isnull()) & (df_all['Pclass']==2), avg_age[2],
                                 np.where((df_all['Age'].isnull()) & (df_all['Pclass']==3), avg_age[3],df_all['Age'])))

Age            1.000000
Fare           0.178740
Parch         -0.150917
PassengerId    0.028814
Pclass        -0.408106
SibSp         -0.243699
Survived      -0.077221
dtype: float64
Average age per Pclass: 
Pclass
1    39.0
2    29.0
3    24.0
Name: Age, dtype: float64


In [5]:
df_all.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

In [6]:
# Missing values - Embarked
# Taking simple mode as only 2 are missing
df_all = df_all.fillna({'Embarked' : df_all['Embarked'].mode()[0]})

In [7]:
# Missing values - Fare
print(df_all[df_all['Fare'].isnull()])

# Only one, impute with average male fare with pclass 3
df_all = df_all.fillna({'Fare': df_all[ (df_all.Sex=='male') & (df_all.Pclass==3) ]['Fare'].median()})

      Age Cabin Embarked  Fare                Name  Parch  PassengerId  \
152  60.5   NaN        S   NaN  Storey, Mr. Thomas      0         1044   

     Pclass   Sex  SibSp  Survived Ticket  
152       3  male      0       NaN   3701  


In [8]:
# Missing values - Cabin
# Very difficult to figure out - treating missing values as a separate category
df_all['Cabin'] = df_all['Cabin'].apply(lambda x: str(x)[0])

In [9]:
# All good!
df_all.isnull().sum()

Age              0
Cabin            0
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
dtype: int64

### New features

In [10]:
df_all['Family_size'] = 1 + df_all['SibSp'] + df_all['Parch']
# Family sizes
print(df_all['Family_size'].value_counts().sort_index())

# Mapping for new feature
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Medium', 5: 'Medium', 6: 'Large', 7: 'Large', 8: 'Large', 11: 'Large'}
df_all['Family_size'] = df_all['Family_size'].map(family_map)

1     790
2     235
3     159
4      43
5      22
6      25
7      16
8       8
11     11
Name: Family_size, dtype: int64


In [11]:
# Married woman
df_all['Has_husband'] = np.where((df_all['Name'].str.contains('Mrs')) & (df_all['Sex']=='female') & (df_all['SibSp']==1),1,0)

In [12]:
# Titles
df_all['Title'] = df_all['Name'].str.split(',').apply(lambda x: x[1]).str.split(' ').apply(lambda x:x[1][:-1])
#df_all['Title'].value_counts()

# Map weird titles
df_all['Title'] = df_all['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df_all['Title'] = df_all['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev', 'th'], 'Dr/Military/Noble/Clergy')

df_all['Title'].value_counts()

Mr                          757
Miss/Mrs/Ms                 464
Master                       61
Dr/Military/Noble/Clergy     27
Name: Title, dtype: int64

### Encoding

In [21]:
df_all.columns

df_train = df_all[~df_all.Survived.isnull()]
df_test = df_all[df_all.Survived.isnull()]

In [22]:
cat_columns = ['Embarked','Pclass','Sex','Cabin','Family_size','Has_husband','Title']
cont_columns = ['Age','Fare']

In [23]:
from sklearn.preprocessing import OneHotEncoder

encoded_features = []

# Some ugly code to join transformed features back
# in readable format to train and test df
for df in [df_train,df_test]:
    for feature in cat_columns:
        encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
        n = df[feature].nunique()
        cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
        
        encoded_df = pd.DataFrame(encoded_feat, columns=cols)
        encoded_df.index = df.index
        encoded_features.append(encoded_df)

df_train = pd.concat([df_train, *encoded_features[:len(cat_columns)]], axis=1)
df_test = pd.concat([df_test, *encoded_features[len(cat_columns):]], axis=1)

In [24]:
keep_columns = (['Age','Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1',
       'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7',
       'Cabin_8', 'Cabin_9', 'Family_size_1', 'Family_size_2', 'Family_size_3',
       'Family_size_4', 'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3', 'Title_4', 'Survived'])
keep_columns.remove('Cabin_9')
df_train = df_train[keep_columns]
keep_columns.remove('Survived')
df_test = df_test[keep_columns]

In [25]:
df_train.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4', 'Survived'],
      dtype='object')

In [26]:
df_test.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4'],
      dtype='object')

### RF

In [27]:
# Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X = df_train.copy().drop('Survived',axis=1)
y = np.array(df_train['Survived']).ravel()
X_test = df_test.copy()

# param grid
param_grid = {
    'n_estimators': [2,4,8,16,32,64,128],
    'max_depth' : [200,100,50,20,10],
    'criterion' : ['entropy','gini'],
    'min_samples_split' : [2,4,8], #[0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : [1,2,4], #np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.1,1.0,num=10)
}

rf = RandomForestClassifier()

gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1,verbose=10)

result = gscv.fit(X=X,y=y)
print(result.best_params_)
print(result.best_score_)

Fitting 10 folds for each of 6300 candidates, totalling 63000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1899s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1547s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1467s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed:

[Parallel(n_jobs=-1)]: Done 42982 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 43637 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 44292 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 44957 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 45622 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 46297 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 46972 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 47657 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 48342 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 49037 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 49732 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 50437 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 51142 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 51857 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 52572 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 53297 tasks 

{'criterion': 'entropy', 'max_depth': 50, 'max_features': 0.8, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 16}
0.8507295173961841


[Parallel(n_jobs=-1)]: Done 63000 out of 63000 | elapsed:  7.4min finished


In [41]:
best_rf_model = result.best_estimator_
best = best_rf_model.fit(X,y)
predictions = best.predict(X_test).astype(int)

In [None]:
# Best score now 0.77033

In [47]:
best_rf_model.feature_importances_

array([0.18542883, 0.21393965, 0.00616225, 0.007352  , 0.00927362,
       0.02553438, 0.00864756, 0.08194384, 0.09158007, 0.02510936,
       0.0011424 , 0.00127282, 0.00716925, 0.00182407, 0.00428699,
       0.        , 0.        , 0.        , 0.0105591 , 0.02767309,
       0.0040781 , 0.00954684, 0.00129774, 0.00316272, 0.01535054,
       0.00953438, 0.06043417, 0.18769622])

In [48]:
X.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4'],
      dtype='object')

In [None]:
## Fare?
## XGBoost?

# Submit to Kaggle

In [43]:
file_name = "solution_3.csv"
message = "RF CV with advanced features - bug fixed"
header = ['PassengerId','Survived']



pd.DataFrame(
    data=list(zip([x for x in test_df['PassengerId'].tolist()], predictions.tolist()))
).to_csv('{}'.format(file_name), index=False, header=header)

In [44]:
%%bash -s "$file_name" "$message"
kaggle competitions submit -c titanic -f $1 -m "$2"

Successfully submitted to Titanic: Machine Learning from Disaster

  0%|          | 0.00/2.77k [00:00<?, ?B/s]100%|██████████| 2.77k/2.77k [00:07<00:00, 356B/s]
