In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn



def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("Shape of training sample: ",train_df.shape)
print("Shape of testing sample: ",test_df.shape)

Shape of training sample:  (891, 12)
Shape of testing sample:  (418, 11)


# Exploration

In [None]:
train_df.columns

In [None]:
seaborn.pairplot(train_df)

In [None]:
train_df.head()

In [None]:
# Any nulls?
train_df.isna().sum(), test_df.isna().sum()

In [None]:
# Ticket / Name / PassengerId seems to be ver sparse - how many uniques we have?
print(train_df['Ticket'].nunique(), ' out of ', train_df.shape[0])
print(train_df['PassengerId'].nunique(), ' out of ', train_df.shape[0])
print(train_df['Name'].nunique(), ' out of ', train_df.shape[0])
print(train_df['Cabin'].nunique(), ' out of ', train_df.shape[0])

In [None]:
# Let's have a look at the other categorical features
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']

for c in categorical_columns:
    print("Feature: ", c)
    print("Train: ")
    print(train_df[c].value_counts())
    if c is not 'Survived':
        print("Test: ")
        print(test_df[c].value_counts())

In [None]:
# Let's look at age, the text on Kaggle is a bit ambiguous about what age<1 means
train_df['Age'].plot(kind='hist')

In [None]:
# It looks okay, just a few babies it seems to me
train_df[train_df['Age']<=1]['Age'].plot(kind='hist')

In [None]:
# The text on Kaggle also mentions estimated ages have xx.5, how many are those?
print(train_df[(train_df['Age']-np.floor(train_df['Age']))==0.5].shape[0], ' out of ', train_df.shape[0])

# Naive first approach, RF CV with very simple features

In [None]:
# Dropping columns that are very sparse or have a lot of nulls
X = train_df.copy().drop('Survived',axis=1)
y = np.array(train_df['Survived']).ravel()
X_test = test_df.copy()

cat_columns = ['Embarked','Pclass','Sex']
cont_columns = ['Age','SibSp','Parch','Fare']
drop_columns = [x for x in X.columns if x not in (cat_columns + cont_columns)]
print('Dropping: ',drop_columns)

In [None]:
X.columns

In [None]:
X.drop(drop_columns,axis=1,inplace=True)
X_test.drop(drop_columns,axis=1,inplace=True)

In [None]:
# Imputing missing age with median, missing 'Embarked' with mode
X = X.fillna({'Age' : X['Age'].median(), 'Embarked' : X['Embarked'].mode()[0]})
X_test = X_test.fillna({'Age' : X['Age'].median(), 'Fare' : X['Fare'].median()})

In [None]:
# OHE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
[('bla',OneHotEncoder(categories='auto'),cat_columns)],remainder='passthrough')

X_ohe = ct.fit_transform(X)
X_test_ohe = ct.transform(X_test)

In [None]:
# Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# param grid
param_grid = {
    'n_estimators': [2,4,8,16,32,64,128],
    'max_depth' : [200,100,50,20],
    'criterion' : ['entropy'],
    'min_samples_split' : [0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.5,1.0,num=5+1)
}

rf = RandomForestClassifier()

gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=10,n_jobs=-1,verbose=10)

result = gscv.fit(X=X_ohe,y=y)
print(result.best_params_)
print(result.best_score_)

In [None]:
result.best_score_

In [None]:
best_rf_model = result.best_estimator_
best = best_rf_model.fit(X_ohe,y)
predictions = best.predict(X_test_ohe)

In [None]:
# First set of predictions - not very good - 0.76555 on public leaderboard

In [None]:
X_ohe.shape

In [None]:
# as you can see, mostof the featuers are ignored, we have to do some more feature engineering
best_rf_model.feature_importances_

In [None]:
# but it seems some of the other features are relevant but they are not included in the feature importances...
train_df.groupby('Embarked').Survived.mean()

In [None]:
## Other ideas to try
# Xgboost or something similar
# Interactions
# Extract more information from features not used now or with 0 importance
# Different imputations for age, Can we infer age from name?
# Kid with parent?
# Lived in cabin? (cabin not none)
# cabin location?

# More features & smarter imputation

### Missing values

In [64]:
# For missing values we consider both train and test to not bias
df_all = pd.concat([train_df,test_df])

In [65]:
# Missing values - Age
print(df_all.corrwith(df_all['Age']))

# Age is very correlated with pclass, so imputing with class averages
avg_age = df_all.groupby('Pclass')['Age'].median()
print('Average age per Pclass: ')
print(avg_age)

df_all['Age'] = np.where( (df_all['Age'].isnull()) & (df_all['Pclass']==1), avg_age[1],
                         np.where((df_all['Age'].isnull()) & (df_all['Pclass']==2), avg_age[2],
                                 np.where((df_all['Age'].isnull()) & (df_all['Pclass']==3), avg_age[3],df_all['Age'])))

Age            1.000000
Fare           0.178740
Parch         -0.150917
PassengerId    0.028814
Pclass        -0.408106
SibSp         -0.243699
Survived      -0.077221
dtype: float64
Average age per Pclass: 
Pclass
1    39.0
2    29.0
3    24.0
Name: Age, dtype: float64


In [66]:
df_all.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

In [67]:
# Missing values - Embarked
# Taking simple mode as only 2 are missing
df_all = df_all.fillna({'Embarked' : df_all['Embarked'].mode()[0]})

In [68]:
# Missing values - Fare
print(df_all[df_all['Fare'].isnull()])

# Only one, impute with average male fare with pclass 3
df_all = df_all.fillna({'Fare': df_all[ (df_all.Sex=='male') & (df_all.Pclass==3) ]['Fare'].median()})

      Age Cabin Embarked  Fare                Name  Parch  PassengerId  \
152  60.5   NaN        S   NaN  Storey, Mr. Thomas      0         1044   

     Pclass   Sex  SibSp  Survived Ticket  
152       3  male      0       NaN   3701  


In [69]:
# Missing values - Cabin
# Very difficult to figure out - treating missing values as a separate category
df_all['Cabin'] = df_all['Cabin'].apply(lambda x: str(x)[0])

In [70]:
# All good!
df_all.isnull().sum()

Age              0
Cabin            0
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
dtype: int64

### New features

In [71]:
df_all['Family_size'] = 1 + df_all['SibSp'] + df_all['Parch']
# Family sizes
print(df_all['Family_size'].value_counts().sort_index())

# Mapping for new feature
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Medium', 5: 'Medium', 6: 'Large', 7: 'Large', 8: 'Large', 11: 'Large'}
df_all['Family_size'] = df_all['Family_size'].map(family_map)

1     790
2     235
3     159
4      43
5      22
6      25
7      16
8       8
11     11
Name: Family_size, dtype: int64


In [72]:
# Married woman
df_all['Has_husband'] = np.where((df_all['Name'].str.contains('Mrs')) & (df_all['Sex']=='female') & (df_all['SibSp']==1),1,0)

In [73]:
# Titles
df_all['Title'] = df_all['Name'].str.split(',').apply(lambda x: x[1]).str.split(' ').apply(lambda x:x[1][:-1])
#df_all['Title'].value_counts()

# Map weird titles
df_all['Title'] = df_all['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df_all['Title'] = df_all['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev', 'th'], 'Dr/Military/Noble/Clergy')

df_all['Title'].value_counts()

Mr                          757
Miss/Mrs/Ms                 464
Master                       61
Dr/Military/Noble/Clergy     27
Name: Title, dtype: int64

In [74]:
def categorize(x):
    if x<80:
        return int(np.floor(x/10))
    if x>=80:
        return 8
df_all['Fare'] = df_all['Fare'].apply(categorize)

In [75]:
def cleanTicket(ticket):
    ticket = ticket.replace('.', '')
    ticket = ticket.replace('/', '')
    ticket = ticket.split()
    ticket = map(lambda t : t.strip(), ticket)
    ticket = list(filter(lambda t : not t.isdigit(), ticket))
    if len(ticket) > 0:
        return ticket[0]
    else: 
        return 'XXX'

df_all['Ticket'] = df_all['Ticket'].apply(cleanTicket)

### Encoding

In [96]:
df_all.columns

df_train = df_all[~df_all.Survived.isnull()]
df_test = df_all[df_all.Survived.isnull()]

In [97]:
cat_columns = ['Embarked','Pclass','Sex','Cabin','Family_size','Has_husband','Title','Ticket']
cont_columns = ['Age','Fare']

In [98]:
from sklearn.preprocessing import OneHotEncoder

encoded_features = []

# Some ugly code to join transformed features back
# in readable format to train and test df
for df in [df_train,df_test]:
    for feature in cat_columns:
        encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
        n = df[feature].nunique()
        cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
        
        encoded_df = pd.DataFrame(encoded_feat, columns=cols)
        encoded_df.index = df.index
        encoded_features.append(encoded_df)

df_train = pd.concat([df_train, *encoded_features[:len(cat_columns)]], axis=1)
df_test = pd.concat([df_test, *encoded_features[len(cat_columns):]], axis=1)

In [99]:
df_train.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket', 'Family_size',
       'Has_husband', 'Title', 'Embarked_1', 'Embarked_2', 'Embarked_3',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1',
       'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7',
       'Cabin_8', 'Cabin_9', 'Family_size_1', 'Family_size_2', 'Family_size_3',
       'Family_size_4', 'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2',
       'Title_3', 'Title_4', 'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4',
       'Ticket_5', 'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28', 'Ticket_29', 'Ticket_30',
       'Ticket_31'],
      dtype

In [100]:
keep_columns = (['Age','Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1',
       'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7',
       'Cabin_8', 'Cabin_9', 'Family_size_1', 'Family_size_2', 'Family_size_3',
       'Family_size_4', 'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3', 'Title_4', 'Survived','Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4',
       'Ticket_5', 'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28', 'Ticket_29', 'Ticket_30',
       'Ticket_31'])
keep_columns.remove('Cabin_9')
keep_columns = keep_columns[:-3]
df_train = df_train[keep_columns]
keep_columns.remove('Survived')
df_test = df_test[keep_columns]

In [101]:
df_train.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4', 'Survived', 'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4',
       'Ticket_5', 'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28'],
      dtype='object')

In [102]:
df_test.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4', 'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4', 'Ticket_5',
       'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28'],
      dtype='object')

### RF

In [103]:
# Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X = df_train.copy().drop('Survived',axis=1)
y = np.array(df_train['Survived']).ravel()
X_test = df_test.copy()

# param grid
param_grid = {
    'n_estimators': [100,200,500,750],#[2,4,8,16,32,64,128],
    'max_depth' : [2,3,4,5,6],#,8,10],#[20,10,5], #[200,100,50,20,10],
    'criterion' : ['entropy','gini'],
    'min_samples_split' : [2],#,4,8,16], #[0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : [1],#,2,4,8], #np.linspace(0.1,0.5,num=5),
    'max_features' : np.linspace(0.1,1.0,num=10)
}

rf = RandomForestClassifier()

gscv = GridSearchCV(rf,param_grid=param_grid,scoring='accuracy',cv=5,n_jobs=-1,verbose=10)

result = gscv.fit(X=X,y=y)
print(result.best_params_)
print(result.best_score_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1000s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0226s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.7079s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 187 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 203 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 219 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed: 

{'criterion': 'gini', 'max_depth': 5, 'max_features': 0.30000000000000004, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
0.8338945005611672


[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  1.9min finished


In [104]:
best_rf_model = result.best_estimator_
best = best_rf_model.fit(X,y)
predictions = best.predict(X_test).astype(int)

In [106]:
best_rf_model.feature_importances_

array([6.06055379e-02, 5.23666916e-02, 6.57387074e-03, 4.65147739e-03,
       1.13313316e-02, 5.35119319e-02, 1.53104366e-02, 8.68994160e-02,
       1.91817036e-01, 1.27681542e-01, 1.13697707e-03, 3.43862089e-03,
       3.76583452e-03, 3.25295558e-03, 1.11727767e-02, 9.26726877e-04,
       1.17505210e-03, 1.65108101e-04, 8.57882137e-03, 3.01879591e-02,
       4.45967196e-03, 1.67320545e-02, 4.09122893e-03, 8.01329866e-03,
       5.73487182e-03, 1.99453794e-02, 1.18318449e-01, 1.23466577e-01,
       8.38615735e-06, 1.93835730e-03, 0.00000000e+00, 4.93389815e-04,
       1.75396643e-03, 0.00000000e+00, 1.82576033e-04, 1.52613704e-07,
       0.00000000e+00, 7.55518615e-04, 2.88373377e-03, 3.05092047e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.15818977e-06,
       0.00000000e+00, 3.34594955e-04, 5.79349044e-05, 1.04894878e-04,
       6.16507881e-05, 2.03045129e-03, 2.92373809e-06, 2.76970288e-04,
       0.00000000e+00, 6.75097490e-03, 9.44639387e-04, 5.80099931e-03])

In [107]:
X.columns

Index(['Age', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Cabin_1', 'Cabin_2',
       'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7', 'Cabin_8',
       'Family_size_1', 'Family_size_2', 'Family_size_3', 'Family_size_4',
       'Has_husband_1', 'Has_husband_2', 'Title_1', 'Title_2', 'Title_3',
       'Title_4', 'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4', 'Ticket_5',
       'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28'],
      dtype='object')

# XGBoost (just trying, not optimizing)

In [3]:
from xgboost import XGBClassifier

In [25]:
model = XGBClassifier()
model.fit(X, y)
predictions = model.predict(X_test)

# Submit to Kaggle

In [108]:
file_name = "solution_6.csv"
message = "RF - added ticket text munging"
header = ['PassengerId','Survived']



pd.DataFrame(
    data=list(zip([x for x in test_df['PassengerId'].tolist()], [int(x) for x in predictions.tolist()]))
).to_csv('{}'.format(file_name), index=False, header=header)

In [109]:
%%bash -s "$file_name" "$message"
kaggle competitions submit -c titanic -f $1 -m "$2"

Successfully submitted to Titanic: Machine Learning from Disaster

  0%|          | 0.00/2.77k [00:00<?, ?B/s]100%|██████████| 2.77k/2.77k [00:00<00:00, 13.9kB/s]
