In [1]:
import pandas as pd
import numpy as np

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
titanic = pd.read_csv('datasets/titanic/train.csv')

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
## target class: survived!

In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Let's try imputing vals to missing vals

In [6]:
missing_val_count_by_column = (titanic.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Age         177
Cabin       687
Embarked      2
dtype: int64


In [7]:
# Impute Embarked
titanic['Embarked'] = titanic['Embarked'].fillna('S')

In [8]:
# Impute Cabin with NA, later we will create a new feat based on this
titanic['Cabin'] = titanic['Cabin'].fillna('NA')

### Let's prepare train set for training except for Age (ie. feat engineering, etc in all but Age)

In [12]:
titanic['Has_cabin'] = titanic['Cabin'].apply(lambda x: 0 if x == 'NA' else 1)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic['Is_alone'] = titanic['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

#### Scale Fare

In [14]:
from sklearn.preprocessing import RobustScaler

In [15]:
fareScaler = RobustScaler()
titanic['Fare_scaled'] = fareScaler.fit_transform(titanic['Fare'].values.reshape(-1, 1))

In [20]:
unnecessary_cols = ['PassengerId','Name','SibSp','Parch','Ticket','Fare','Cabin']

In [18]:
titanic.drop(unnecessary_cols, inplace=True, axis=1)

In [23]:
titanic = pd.get_dummies(titanic)

In [24]:
titanic.head()

Unnamed: 0,Survived,Pclass,Age,Has_cabin,FamilySize,Is_alone,Fare_scaled,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,0,2,0,-0.312011,0,1,0,0,1
1,1,1,38.0,1,2,0,2.461242,1,0,1,0,0
2,1,3,26.0,0,1,1,-0.282777,1,0,0,0,1
3,1,1,35.0,1,2,0,1.673732,1,0,0,0,1
4,0,3,35.0,0,1,1,-0.277363,0,1,0,0,1


### Train a regressor on Age

In [10]:
mv_cols = (titanic.isnull().sum())
print(mv_cols[mv_cols > 0])

Age    177
dtype: int64


In [39]:
#### X and y must contain rows where Age isnt null!
X = titanic.dropna().drop(['Age','Survived'], inplace=False, axis=1)
y = titanic['Age'].dropna()

In [45]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score

In [44]:
from sklearn.ensemble import RandomForestRegressor

#### RandomForest Regressor

In [46]:
kFold = KFold(n_splits=10, random_state=0, shuffle=True)

In [47]:
maes = []
r2s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelRF = RandomForestRegressor(random_state=0)
    
    modelRF.fit(X_train, y_train)
    preds = modelRF.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 10.072620516421146 R2: 0.2488629189279603

Fold: 1
MAE: 12.251289659598378 R2: -0.1451959627011068

Fold: 2
MAE: 12.137637318139644 R2: -0.19268821465477526

Fold: 3
MAE: 10.275461113393675 R2: -0.025747295965236727

Fold: 4
MAE: 11.74256303595784 R2: 0.04201155481303742

Fold: 5
MAE: 10.163494983724176 R2: 0.09666681979688463

Fold: 6
MAE: 10.173544438148811 R2: 0.011709429210824629

Fold: 7
MAE: 12.537175359377803 R2: -0.007392677370930967

Fold: 8
MAE: 9.773603894591922 R2: 0.2621603680781537

Fold: 9
MAE: 11.920618647851914 R2: 0.027059326154363683

Avg MAE: 11.10480089672053
Avg R2: 0.03174462662891746


##### --- r2 is very low capturing the fact that the features are not responsible for the target variable (Age) ---

##### Let's do some grid search

In [49]:
from sklearn.model_selection import GridSearchCV

In [48]:
params = {
    'n_estimators': list(range(150,250,10)),
    'max_depth': list(range(5, 20, 5)),
    'max_features': [2, 3, 4, 5],
    'max_leaf_nodes': list(range(5, 20, 5))
}

In [50]:
gridRF = GridSearchCV(modelRF, param_grid=params, cv=10)
gridRF.fit(X, y)
print(gridRF.best_params_)

{'max_depth': 5, 'n_estimators': 220, 'max_features': 4, 'max_leaf_nodes': 15}


In [51]:
maes = []
r2s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelRF = RandomForestRegressor(random_state=0,
                                    max_depth=5,
                                    n_estimators=220,
                                    max_features=4,
                                    max_leaf_nodes=15
                                   )
    
    modelRF.fit(X_train, y_train)
    preds = modelRF.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 10.383230891589985 R2: 0.2449069230320191

Fold: 1
MAE: 10.271615464977181 R2: 0.15066254723369243

Fold: 2
MAE: 9.885935606324022 R2: 0.21769477680544325

Fold: 3
MAE: 9.400404675281889 R2: 0.19794002725035909

Fold: 4
MAE: 10.77375332100911 R2: 0.1654973751649368

Fold: 5
MAE: 9.548333305307121 R2: 0.23914995952101126

Fold: 6
MAE: 9.628250387047627 R2: 0.21376867295564494

Fold: 7
MAE: 11.596591999047153 R2: 0.15538225521212767

Fold: 8
MAE: 9.074409171574796 R2: 0.3721670368456821

Fold: 9
MAE: 10.835000623726168 R2: 0.17863669918573233

Avg MAE: 10.139752544588507
Avg R2: 0.2135806273206649


#### XGBoost Regressor

In [52]:
from xgboost import XGBRegressor

In [53]:
maes = []
r2s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelXGB = XGBRegressor(random_state=0)
    
    modelXGB.fit(X_train, y_train)
    preds = modelXGB.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 11.678667347696091 R2: 0.060401911021580434

Fold: 1
MAE: 12.010185613632201 R2: -0.22776893082839655

Fold: 2
MAE: 11.902800056669447 R2: -0.15911992431567512

Fold: 3
MAE: 11.311499496301016 R2: -0.24455072448962056

Fold: 4
MAE: 11.278886575698852 R2: 0.023823445028524737

Fold: 5
MAE: 10.600565056330723 R2: -0.02745982259959967

Fold: 6
MAE: 10.622829209985866 R2: -0.00040003604583493946

Fold: 7
MAE: 13.160571890817561 R2: -0.041588841503691576

Fold: 8
MAE: 10.544452902296898 R2: 0.11983608302112203

Fold: 9
MAE: 12.94750226047677 R2: -0.18128799161553122

Avg MAE: 11.605796040990542
Avg R2: -0.06781148323271224


##### Grid Search on XGBRegressor

In [54]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'n_estimators': [100, 200, 250],
        'max_depth': [4, 6, 10, 15]
        }

In [55]:
gridXGB = GridSearchCV(modelXGB, param_grid=params, cv=10)
gridXGB.fit(X, y)
print(gridXGB.best_params_)

{'max_depth': 4, 'gamma': 5, 'colsample_bytree': 0.6, 'n_estimators': 100, 'min_child_weight': 10, 'subsample': 1.0}


In [56]:
maes = []
r2s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelXGB = XGBRegressor(random_state=0,
                            max_depth=4,
                            gamma=5,
                            colsample_bytree=0.6,
                            n_estimators=100,
                            min_child_weight=10,
                            subsample=1.0
                           )
    
    modelXGB.fit(X_train, y_train)
    preds = modelXGB.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 10.20902492046356 R2: 0.23510534101374791

Fold: 1
MAE: 11.05487967967987 R2: -0.03586527633880565

Fold: 2
MAE: 11.146930042240355 R2: -0.038208342870536205

Fold: 3
MAE: 10.373601290914747 R2: -0.01214913623345204

Fold: 4
MAE: 10.734323715424873 R2: 0.15324050917399046

Fold: 5
MAE: 9.971795094449757 R2: 0.14751773177362915

Fold: 6
MAE: 9.743791769323213 R2: 0.1525727487998254

Fold: 7
MAE: 12.081312260157626 R2: 0.05342366160685463

Fold: 8
MAE: 8.965310311653244 R2: 0.3794218195484308

Fold: 9
MAE: 11.855451053297015 R2: 0.06528819153996956

Avg MAE: 10.613642013760426
Avg R2: 0.11003472480136538


### Let's impute Age with our RF regressor

In [57]:
X.head()

Unnamed: 0,Pclass,Has_cabin,FamilySize,Is_alone,Fare_scaled,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0,2,0,-0.312011,0,1,0,0,1
1,1,1,2,0,2.461242,1,0,1,0,0
2,3,0,1,1,-0.282777,1,0,0,0,1
3,1,1,2,0,1.673732,1,0,0,0,1
4,3,0,1,1,-0.277363,0,1,0,0,1


In [88]:
titanic_AgeNull = titanic[titanic['Age'].isnull()] ## 177 rows
titanic_AgeNotNull = titanic[titanic['Age'].notnull()] ## 714 rows

In [90]:
preds = modelRF.predict(titanic_AgeNull.drop(['Survived','Age'], axis=1, inplace=False))

In [91]:
preds.shape

(177,)

In [92]:
titanic_AgeNull.head()

Unnamed: 0,Survived,Pclass,Age,Has_cabin,FamilySize,Is_alone,Fare_scaled,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
5,0,3,,0,1,1,-0.25968,0,1,0,1,0
17,1,2,,0,1,1,-0.062981,0,1,0,0,1
19,1,3,,0,1,1,-0.313093,1,0,1,0,0
26,0,3,,0,1,1,-0.313093,0,1,1,0,0
28,1,3,,0,1,1,-0.28476,1,0,0,1,0


In [96]:
titanic_AgeImputed = pd.DataFrame({'Survived':titanic_AgeNull['Survived'],
                                   'Pclass': titanic_AgeNull['Pclass'],
                                   'Age': titanic_AgeNull['Age'],
                                   'Has_cabin': titanic_AgeNull['Has_cabin'],
                                   'FamilySize': titanic_AgeNull['FamilySize'],
                                   'Is_alone': titanic_AgeNull['Is_alone'],
                                   'Fare_scaled': titanic_AgeNull['Fare_scaled'],
                                   'Sex_female': titanic_AgeNull['Sex_female'],
                                   'Sex_male': titanic_AgeNull['Sex_male'],
                                   'Embarked_C': titanic_AgeNull['Embarked_C'],
                                   'Embarked_Q': titanic_AgeNull['Embarked_Q'],
                                   'Embarked_S': titanic_AgeNull['Embarked_S'],
                                   'Age_imputed': preds
                                  })

In [97]:
titanic_AgeImputed.head()

Unnamed: 0,Age,Age_imputed,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Fare_scaled,Has_cabin,Is_alone,Pclass,Sex_female,Sex_male,Survived
5,,29.215264,0,1,0,1,-0.25968,0,1,3,0,1,0
17,,33.27115,0,0,1,1,-0.062981,0,1,2,0,1,1
19,,24.274748,1,0,0,1,-0.313093,0,1,3,1,0,1
26,,28.44625,1,0,0,1,-0.313093,0,1,3,0,1,0
28,,25.184539,0,1,0,1,-0.28476,0,1,3,1,0,1


In [98]:
titanic_AgeNotNull = pd.DataFrame({'Survived':titanic_AgeNotNull['Survived'],
                                   'Pclass': titanic_AgeNotNull['Pclass'],
                                   'Age': titanic_AgeNotNull['Age'],
                                   'Has_cabin': titanic_AgeNotNull['Has_cabin'],
                                   'FamilySize': titanic_AgeNotNull['FamilySize'],
                                   'Is_alone': titanic_AgeNotNull['Is_alone'],
                                   'Fare_scaled': titanic_AgeNotNull['Fare_scaled'],
                                   'Sex_female': titanic_AgeNotNull['Sex_female'],
                                   'Sex_male': titanic_AgeNotNull['Sex_male'],
                                   'Embarked_C': titanic_AgeNotNull['Embarked_C'],
                                   'Embarked_Q': titanic_AgeNotNull['Embarked_Q'],
                                   'Embarked_S': titanic_AgeNotNull['Embarked_S'],
                                   'Age_imputed': titanic_AgeNotNull['Age']
                                  })

In [99]:
titanic_AgeNotNull.head()

Unnamed: 0,Age,Age_imputed,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Fare_scaled,Has_cabin,Is_alone,Pclass,Sex_female,Sex_male,Survived
0,22.0,22.0,0,0,1,2,-0.312011,0,0,3,0,1,0
1,38.0,38.0,1,0,0,2,2.461242,1,0,1,1,0,1
2,26.0,26.0,0,0,1,1,-0.282777,0,1,3,1,0,1
3,35.0,35.0,0,0,1,2,1.673732,1,0,1,1,0,1
4,35.0,35.0,0,0,1,1,-0.277363,0,1,3,0,1,0


In [102]:
titanic_pre = titanic_AgeNotNull.append(titanic_AgeImputed)

In [103]:
titanic_pre.drop(['Age'], axis=1, inplace=True)

In [108]:
titanic_pre['Age_imputed'] = titanic_pre['Age_imputed'].apply(np.ceil)

In [109]:
titanic_pre.head()

Unnamed: 0,Age_imputed,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Fare_scaled,Has_cabin,Is_alone,Pclass,Sex_female,Sex_male,Survived
0,22.0,0,0,1,2,-0.312011,0,0,3,0,1,0
1,38.0,1,0,0,2,2.461242,1,0,1,1,0,1
2,26.0,0,0,1,1,-0.282777,0,1,3,1,0,1
3,35.0,0,0,1,2,1.673732,1,0,1,1,0,1
4,35.0,0,0,1,1,-0.277363,0,1,3,0,1,0


In [110]:
titanic_pre.to_csv('datasets/titanic/train_ageImputedWRF.csv')

In [111]:
mv_cols = (titanic_pre.isnull().sum())
print(mv_cols[mv_cols > 0])

Series([], dtype: int64)


### More viz

In [None]:
"""
    SURVIVED
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='Survived', data=titanic)
plt.show()

In [None]:
"""
    PCLASS
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='Pclass', data=titanic)
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='Pclass', hue='Survived', data=titanic)
plt.show()

In [None]:
"""
    SEX
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='Sex', data=titanic)
plt.show()
### more men than women!

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='Sex', hue='Survived', data=titanic)
plt.show()
## more women survived tho!

In [None]:
"""
    EMBARKED
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='Embarked', data=titanic)
plt.show()
### most people boarded on southampton

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='Embarked', hue='Survived', data=titanic)
plt.show()

In [None]:
"""
    PARCH
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='Parch', data=titanic)
plt.show()
### most people traveled without parents or children

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='Parch', hue='Survived', data=titanic)
plt.show()

In [None]:
"""
    SIBSP
"""
plt.figure(figsize=(10, 5))
sns.countplot(x='SibSp', data=titanic)
plt.show()
## most people traveled without siblings or spouse

In [None]:
plt.figure(figsize=(10, 5))
sns.countplot(x='SibSp', hue='Survived', data=titanic)
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.jointplot(x="Parch", y="SibSp", data=titanic);
plt.show()

In [None]:
"""
    AGE
"""
plt.figure(figsize=(10, 10))
sns.boxplot(x='Survived', y='Age', data=titanic)
plt.show()
## almost safe to say that if you were a child you would've survived

In [None]:
"""
    FARE
"""
plt.figure(figsize=(10, 10))
sns.boxplot(x='Survived', y='Fare', data=titanic)
plt.show()
### mean are different. Do a t-test for significance? Cant be done! Data isnt gaussian-like
### people who paid larger fares were more likely to survive

### Here we have finished visualizing and we're ready to drop cols

In [None]:
missing_val_count_by_column = (titanic.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
#### cabin isn't too informative
## let's drop it too
unnecessary_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Cabin_encoded']

In [None]:
titanic.head()

#### Let's drop "unnecessary" cols

In [None]:
#### cabin isn't too informative
## let's drop it too
unnecessary_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Cabin_encoded']
titanic.drop(unnecessary_cols, inplace=True, axis=1)

In [None]:
titanic = pd.get_dummies(titanic)
### why pclass isnt created dummies? cos it's not categorical :P

In [None]:
titanic.head()

### Scaling num vars

In [None]:
### how is age distributed?
plt.figure(figsize=(16, 7))
sns.distplot(titanic['Age'])
plt.show()

In [None]:
### what about fare?
plt.figure(figsize=(16, 7))
sns.distplot(titanic['Fare'])
plt.show()
#### right skewed

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
ageSC = StandardScaler()

In [None]:
ageSC.fit(titanic['Age'].values.reshape(-1, 1))

In [None]:
titanic['Age'] = ageSC.transform(titanic['Age'].values.reshape(-1, 1))

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(titanic['Age'])
plt.show()

### Age has lots of outliers. Perhaps another scaling strategy would be better?

In [None]:
"""
    AGE
"""
plt.figure(figsize=(10, 10))
sns.boxplot(x='Survived', y='Age', data=titanic)
plt.show()

#### Fare is right-skewed. Let's use something to make it more Gaussian-like!

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
titanic[titanic['Fare'] == 0]
### some people traveled for free?? then, no Box-Cox transform can be applied :-(

In [None]:
ptFare = PowerTransformer()

In [None]:
ptFare.fit(titanic['Fare'].values.reshape(-1, 1))

In [None]:
titanic['Fare'] = ptFare.transform(titanic['Fare'].values.reshape(-1, 1))

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(titanic['Fare'])
plt.show()
"""
is it more gaussian-like?? how does it compare vs target var?
"""

In [None]:
"""
    FARE
"""
plt.figure(figsize=(10, 10))
sns.boxplot(x='Survived', y='Fare', data=titanic)
plt.show()
"""
t-test now?
now it's more evident that both distros differ
there're less outliers
it's clearer that people that paid more had more
chances of survival
"""

In [None]:
titanic.head()

## Are we ready to model?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
X = titanic.drop(['Survived'], axis=1, inplace=False)
y = titanic['Survived']

#### A small detour to do cross validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

In [None]:
kFold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
accs = []
f1s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print("Fold:", i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    modelLR = LogisticRegression(random_state=0)
    modelLR.fit(X_train, y_train)
    preds = modelLR.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print("Acc:", acc, "F1-score:", f1)
    accs.append(acc)
    f1s.append(f1)
    print()
print("Mean acc:", np.mean(accs))
print("Mean f1 score:", np.mean(f1s))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Stuff to try:
### ROC curves
### Grid search
### XGBoost, NNs
### feature importance

### First impressions with classification. We will have a look to feature importance too

In [None]:
modelLR = LogisticRegression(random_state=0)
modelLR.fit(X_train, y_train)
preds = modelLR.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
feature_importance = pd.DataFrame({'Feature': X.columns.tolist(), 'Coefficients': modelLR.coef_[0]})

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(16, 7))
sns.barplot(x='Feature', y='Coefficients', data=feature_importance)
plt.xticks(rotation=45)
plt.show()

### Let's have a look at ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
probs = modelLR.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprLR, tprLR, thresholdLR = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprLR, tprLR))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR, marker='.')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprLR))
roc = pd.DataFrame({'tf': pd.Series(tprLR-(1-fprLR), index=i), 'threshold': pd.Series(thresholdLR, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]                 
print('Best thresh:', list(roc_t['threshold']))

In [None]:
#def preds_by_thresh(prob_vec, thresh):
#thresh = 0.329988328719691
thresh = 0.329988328719691
preds = []
for proba in probs:
    if proba >= thresh:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
accuracy_score(y_test, preds)
# 0.8022388059701493 -- thresh obtained from ROC analysis
# 0.8059701492537313 -- thresh = 0.5

In [None]:
f1_score(y_test, preds)
# 0.7511737089201878 -- comments as above!
# 0.7373737373737373 

In [None]:
confusion_matrix(y_test, preds)

In [None]:
"""
    The thresh above was estimated using X_test and y_test (i.e. VALIDATION SET)
    How do we know that the same thresh will result in the
    same acc and f1-score when applied in REAL test set???
    there is only one way: TRY IT!
"""

#### Let's try a simple decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
modelDT = DecisionTreeClassifier(random_state=0, max_depth=5)
modelDT.fit(X_train, y_train)
preds = modelDT.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
feature_importance = pd.DataFrame({'Feature': X.columns.tolist(), 'Score': modelDT.feature_importances_})

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(16, 7))
sns.barplot(x='Feature', y='Score', data=feature_importance)
plt.xticks(rotation=45)
plt.show()

In [None]:
probs = modelDT.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprDT, tprDT, thresholdDT = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprDT, tprDT))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprDT, tprDT)
plt.legend(['LR','DT'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprDT))
roc = pd.DataFrame({'tf': pd.Series(tprDT-(1-fprDT), index=i), 'threshold': pd.Series(thresholdDT, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
thresh = 0.5
preds = []
for proba in probs:
    if proba >= thresh:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

#### Let's try SVMs now

In [None]:
modelSVC = SVC(random_state=0, kernel='linear', probability=True)
modelSVC.fit(X_train, y_train)
preds = modelSVC.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
feature_importance = pd.DataFrame({'Feature': X.columns.tolist(), 'Coefficients': modelSVC.coef_[0]})

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(16, 7))
sns.barplot(x='Feature', y='Coefficients', data=feature_importance)
plt.xticks(rotation=45)
plt.show()

In [None]:
probs = modelSVC.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprSVC, tprSVC, thresholdSVC = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprSVC, tprSVC))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprDT, tprDT)
plt.plot(fprSVC, tprSVC)
plt.legend(['LR','DT', 'SVC'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprSVC))
roc = pd.DataFrame({'tf': pd.Series(tprSVC-(1-fprSVC), index=i), 'threshold': pd.Series(thresholdSVC, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
thresh = 0.19744104517882796
preds = []
for proba in probs:
    if proba >= thresh:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

### Not very good performance w estimators out of the box, possibly we'll need to try a different feature selection/scaling strategy

### In the meantime, let's continue playing around w different models

#### Random Forest

In [None]:
modelRF = RandomForestClassifier(random_state=0, max_depth=5, n_estimators=200)                                                                      
modelRF.fit(X_train, y_train)
preds = modelRF.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
feature_importance = pd.DataFrame({'Feature': X.columns.tolist(), 'Score': modelRF.feature_importances_})

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(16, 7))
sns.barplot(x='Feature', y='Score', data=feature_importance)
plt.xticks(rotation=45)
plt.show()

In [None]:
probs = modelRF.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprRF, tprRF, thresholdRF = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprRF, tprRF))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprDT, tprDT)
plt.plot(fprSVC, tprSVC)
plt.plot(fprRF, tprRF)
plt.legend(['LR','DT','SVC','RF'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprRF))
roc = pd.DataFrame({'tf': pd.Series(tprRF-(1-fprRF), index=i), 'threshold': pd.Series(thresholdRF, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
thresh = 0.3254039087203783
preds = []
for proba in probs:
    if proba >= thresh:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
modelXGB = XGBClassifier(random_state=0, n_estimators=200)
modelXGB.fit(X_train, y_train)
preds = modelXGB.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
feature_importance = pd.DataFrame({'Feature': X.columns.tolist(), 'Score': modelXGB.feature_importances_})

In [None]:
feature_importance

In [None]:
plt.figure(figsize=(16, 7))
sns.barplot(x='Feature', y='Score', data=feature_importance)
plt.xticks(rotation=45)
plt.show()

In [None]:
probs = modelXGB.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprXGB, tprXGB, thresholdXGB = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprXGB, tprXGB))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprDT, tprDT)
plt.plot(fprSVC, tprSVC)
plt.plot(fprRF, tprRF)
plt.plot(fprXGB, tprXGB)
plt.legend(['LR','DT','SVC','RF','XGB'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprXGB))
roc = pd.DataFrame({'tf': pd.Series(tprXGB-(1-fprXGB), index=i), 'threshold': pd.Series(thresholdXGB, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
thresh = 0.2954808473587036
preds = []
for proba in probs:
    if proba >= thresh:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

#### Let's try a NN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
"""
+++++++++++ 1 +++++++++++
modelNN = keras.Sequential([
        layers.Dense(100, activation=tf.nn.relu, input_shape=[input_size,]),
        layers.Dense(1, activation=tf.nn.sigmoid)
    ])
"""
def createNN(input_size):
    modelNN = keras.Sequential([
        layers.Dense(100, activation=tf.nn.relu, input_shape=[input_size,]),
#         layers.Dropout(0.2),
#         layers.Dense(256, activation=tf.nn.relu),
#         layers.Dropout(0.2),
        layers.Dense(1, activation=tf.nn.sigmoid)
    ])
    modelNN.compile(loss='binary_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])
    return modelNN

In [None]:
X.columns

In [None]:
modelNN = createNN(len(X.keys()))

In [None]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10)

In [None]:
modelNN.summary()

In [None]:
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
  
    plt.figure(figsize=(10, 7))
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.plot(hist['epoch'], 
           hist['accuracy'], 
           label='Train Acc')
    plt.plot(hist['epoch'], 
           hist['val_accuracy'],
          label='Val Acc')
    plt.legend()

In [None]:
epochs = 100
history = modelNN.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stop]
)

In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
plot_history(history)

In [None]:
loss, acc = modelNN.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test set:', acc)

#### let's try some CV with all data on the NN

In [None]:
kFold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
accs = []
early_stopCV = keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print("Fold:", i)
    X_trainCV, X_testCV = X.iloc[train_idx], X.iloc[test_idx]
    y_trainCV, y_testCV = y.iloc[train_idx], y.iloc[test_idx]

    modelNN = createNN(len(X_train.keys()))

    modelNN.fit(
        X_trainCV,
        y_trainCV,
        epochs=100,
        verbose=0,
        callbacks=[early_stopCV]
    )

    _, acc = modelNN.evaluate(X_testCV, y_testCV, verbose=0)
    print('Accuracy:', acc)
    accs.append(acc)        
    print()
print("Mean acc:", np.mean(accs))

##### Not bad and promising!
### Let's continue our analysis

In [None]:
probs = modelNN.predict(X_test)

In [None]:
preds = (probs > 0.5)

In [None]:
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

##### How to get feature importance from a Keras Model?

In [None]:
fprNN, tprNN, thresholdNN = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprNN, tprNN))

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprDT, tprDT)
plt.plot(fprSVC, tprSVC)
plt.plot(fprRF, tprRF)
plt.plot(fprXGB, tprXGB)
plt.plot(fprNN, tprNN)
plt.legend(['LR','DT','SVC','RF','XGB','NN'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

In [None]:
i = np.arange(len(tprNN))
roc = pd.DataFrame({'tf': pd.Series(tprNN-(1-fprNN), index=i), 'threshold': pd.Series(thresholdNN, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.2982158958911896)

In [None]:
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

## Let's play around tuning estimators' params

In [None]:
from sklearn.model_selection import GridSearchCV

### Grid search for SVM

In [None]:
### for SVM
params = {
    'C': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4]
}

In [None]:
gridSVM = GridSearchCV(modelSVC, param_grid=params, scoring='accuracy', cv=10)

In [None]:
gridSVM.fit(X, y)

In [None]:
gridSVM.best_params_

In [None]:
modelSVC = SVC(random_state=0, kernel='rbf', probability=True, C=1.1, gamma=0.1)
modelSVC.fit(X_train, y_train)
preds = modelSVC.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
probs = modelSVC.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprSVM, tprSVM, thresholdSVM = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprSVM, tprSVM))

In [None]:
i = np.arange(len(tprSVM))
roc = pd.DataFrame({'tf': pd.Series(tprSVM-(1-fprSVM), index=i), 'threshold': pd.Series(thresholdSVM, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.19456617888293143)
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))
"""
    what is more important to you? to correctly classify those who will die or those who will survive?
"""

### Grid search for LR

In [None]:
### for LR
params = {
    'C': np.logspace(-4, 4, 20),
}

In [None]:
gridLR = GridSearchCV(modelLR, param_grid=params, scoring='accuracy', cv=10)

In [None]:
gridLR.fit(X, y)

In [None]:
gridLR.best_params_

In [None]:
modelLR = LogisticRegression(random_state=0, C=0.08858667904100823)
modelLR.fit(X_train, y_train)
preds = modelLR.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
probs = modelLR.predict_proba(X_test)
probs = probs[:, 1]

In [None]:
fprLR, tprLR, thresholdLR = roc_curve(y_test, probs, pos_label=1)

In [None]:
print("AUC:", auc(fprLR, tprLR))

In [None]:
i = np.arange(len(tprLR))
roc = pd.DataFrame({'tf': pd.Series(tprLR-(1-fprLR), index=i), 'threshold': pd.Series(thresholdLR, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.3578465355584938)
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

### Grid search for RF

In [None]:
### for RF
params = {
    'n_estimators': list(range(150,250,10)),
    #'max_features': list(range(5,10,1)),
    'max_depth': list(range(5, 20, 5))
}

In [None]:
gridRF = GridSearchCV(modelRF, param_grid=params, scoring='accuracy', cv=10)
gridRF.fit(X, y)
print(gridRF.best_params_)

In [None]:
modelRF = RandomForestClassifier(random_state=0, max_depth=10, n_estimators=200)
modelRF.fit(X_train, y_train)
preds = modelRF.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
probs = modelRF.predict_proba(X_test)
probs = probs[:, 1]
fprRF, tprRF, thresholdRF = roc_curve(y_test, probs, pos_label=1)
print("AUC:", auc(fprRF, tprRF))
i = np.arange(len(tprRF))
roc = pd.DataFrame({'tf': pd.Series(tprRF-(1-fprRF), index=i), 'threshold': pd.Series(thresholdRF, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.35645055786394053)
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
"""
    ON YOUR SUBMISSION TO KAGGLE SUBMIT 2 VERSIONS: ONE THRESHOLDED AND ONE AS IT COMES OUT OF THE BOX
"""

### Grid search for XGB

In [None]:
modelXGB.get_params()

In [None]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'n_estimators': [100, 200, 250],
        'max_depth': [4, 6, 10, 15]
        }

In [None]:
gridXGB = GridSearchCV(modelXGB, param_grid=params, scoring='accuracy', cv=10)
gridXGB.fit(X, y)
print(gridXGB.best_params_)

In [None]:
modelXGB = XGBClassifier(
    random_state=0,
    colsample_bytree=1.0,
    max_depth=6, 
    n_estimators=250,
    gamma=2,
    min_child_weight=5,
    subsample=0.8
)
modelXGB.fit(X_train, y_train)
preds = modelXGB.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
probs = modelXGB.predict_proba(X_test)
probs = probs[:, 1]
fprXGB, tprXGB, thresholdXGB = roc_curve(y_test, probs, pos_label=1)
print("AUC:", auc(fprXGB, tprXGB))
i = np.arange(len(tprXGB))
roc = pd.DataFrame({'tf': pd.Series(tprXGB-(1-fprXGB), index=i), 'threshold': pd.Series(thresholdXGB, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.35376712679862976)
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

### Let's improve the NN

In [None]:
"""
+++++++++++ 1 +++++++++++
modelNN = keras.Sequential([
        layers.Dense(100, activation=tf.nn.relu, input_shape=[input_size,]),
        layers.Dense(1, activation=tf.nn.sigmoid)
    ])
+++++++++++ 2 +++++++++++
modelNN = keras.Sequential([
    layers.Dense(100, activation=tf.nn.relu, input_shape=[input_size,]),
    layers.Dropout(0.2),
    layers.Dense(50, activation=tf.nn.relu),
    layers.Dropout(0.2),
    layers.Dense(1, activation=tf.nn.sigmoid)
])
+++++++++++ 3 +++++++++++
modelNN = keras.Sequential([
    layers.Dense(200, activation=tf.nn.relu, input_shape=[input_size,]),
    layers.Dropout(0.2),
    layers.Dense(100, activation=tf.nn.relu),
    layers.Dropout(0.2),
    layers.Dense(50, activation=tf.nn.relu),
    layers.Dropout(0.2),        
    layers.Dense(1, activation=tf.nn.sigmoid)
])

"""
def createNN(input_size):
    modelNN = keras.Sequential([
        layers.Dense(200, activation=tf.nn.relu, input_shape=[input_size,]),
        layers.Dropout(0.2),
        layers.Dense(100, activation=tf.nn.relu),
        layers.Dropout(0.2),
        layers.Dense(50, activation=tf.nn.relu),
        layers.Dropout(0.2),        
        layers.Dense(1, activation=tf.nn.sigmoid)
    ])
    modelNN.compile(loss='binary_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])
    return modelNN

In [None]:
kFold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
accs = []
early_stopCV = keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print("Fold:", i)
    X_trainCV, X_testCV = X.iloc[train_idx], X.iloc[test_idx]
    y_trainCV, y_testCV = y.iloc[train_idx], y.iloc[test_idx]

    modelNN = createNN(len(X_train.keys()))

    modelNN.fit(
        X_trainCV,
        y_trainCV,
        epochs=100,
        verbose=0,
        callbacks=[early_stopCV]
    )

    _, acc = modelNN.evaluate(X_testCV, y_testCV, verbose=0)
    print('Accuracy:', acc)
    accs.append(acc)        
    print()
print("Mean acc:", np.mean(accs))

In [None]:
epochs = 100
history = modelNN.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stop]
)

In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
plot_history(history)

In [None]:
loss, acc = modelNN.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test set:', acc)

In [None]:
probs = modelNN.predict(X_test)
preds = (probs > 0.5)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
fprNN, tprNN, thresholdNN = roc_curve(y_test, probs, pos_label=1)
print("AUC:", auc(fprNN, tprNN))
i = np.arange(len(tprNN))
roc = pd.DataFrame({'tf': pd.Series(tprNN-(1-fprNN), index=i), 'threshold': pd.Series(thresholdNN, index=i)})
roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
print('Best thresh:', list(roc_t['threshold']))

In [None]:
preds = (probs >= 0.39237307012081146) ## <=== mid way between best thresh and 0.5, yields good results
print(accuracy_score(y_test, preds))
print(f1_score(y_test, preds))
print(confusion_matrix(y_test, preds))

In [None]:
((0.5 - 0.2847461402416229) / 2) + 0.2847461402416229

In [None]:
plt.figure(figsize=(10, 7))
plt.plot(fprLR, tprLR)
plt.plot(fprSVC, tprSVC)
plt.plot(fprRF, tprRF)
plt.plot(fprXGB, tprXGB)
plt.plot(fprNN, tprNN)
plt.legend(['LR','SVC','RF','XGB','NN'])
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1])
# show the plot
plt.show()

# Let's work on the test set now

In [None]:
titanic_test = pd.read_csv('datasets/titanic/test.csv')

In [None]:
titanic_test.head()

In [None]:
titanic_test.describe()

In [None]:
titanic_test.info()

In [None]:
missing_val_count_by_column = (titanic_test.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
### do almost same transf on test set's misssing vals
titanic_test['Age'] = imputer.transform(titanic_test['Age'].values.reshape(-1, 1))

In [None]:
### Q: CAN WE ASSUME THAT THE DIST OF AGES IN TRAIN SET IS THE SAME AS IN TEST SET? 
### I.E. SAME TRASNFORMATIONS CAN BE APPLIED?
titanic_test[titanic_test['Fare'].isnull()]

In [None]:
titanic_test.at[152, 'Fare'] = 14.454200 
### we are imputing w the median, 
### the mean differs too much from this val this is because the dist is skewed
### if we imputed w the mean, maybe a diff prediction would result and thus another acc

In [None]:
### Drop unnecessary cols
passenger_ids = titanic_test['PassengerId']
unnecessary_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanic_test.drop(unnecessary_cols, inplace=True, axis=1)
titanic_test = pd.get_dummies(titanic_test)

In [None]:
titanic_test.head()

In [None]:
X.head()

In [None]:
### scale num vars in test set
titanic_test['Age'] = ageSC.transform(titanic_test['Age'].values.reshape(-1, 1))
titanic_test['Fare'] = ptFare.transform(titanic_test['Fare'].values.reshape(-1, 1))

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(titanic_test['Age'])
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.distplot(titanic_test['Fare'])
plt.show()

In [None]:
titanic_test.head()

#### Up to here, the test set is ready for prediction

In [None]:
preds = modelRF.predict(titanic_test)

In [None]:
outputRF = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})

In [None]:
outputRF.head()

In [None]:
outputRF.shape

In [None]:
outputRF.to_csv('results/titanic_sub_rf.csv', index=False)

In [None]:
preds = modelXGB.predict(titanic_test)
outputXGB = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})
outputXGB.to_csv('results/titanic_sub_xgb.csv', index=False)

In [None]:
probs = modelNN.predict(titanic_test)
preds = []
for proba in probs:
    if proba >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

In [None]:
outputNN = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})
outputNN.to_csv('results/titanic_sub_nn.csv', index=False)

### Submit those 3 files to Kaggle. They were generated without thresholding the model's output