In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1) Import Datasets

In [None]:
# Import the Titanic training and test data.
titanic_train = pd.read_csv('../input/titanic/train.csv')
titanic_test = pd.read_csv('../input/titanic/test.csv')
# Stack both sets into one frame so imputation and feature engineering are
# applied uniformly. DataFrame.append was removed in pandas 2.0, so pd.concat
# is used; ignore_index=True renumbers the rows 0..n-1 as before.
titanic = pd.concat([titanic_train, titanic_test], ignore_index=True)
# View the data.
titanic.head(10)

In [None]:
# Summarise column dtypes and non-null counts of the combined frame.
titanic.info()

# 2) Exploratory Data Analysis (EDA)

In [None]:
def count_survival(discrete_var, x_tick_rotation=0):
    """Count survivors and deaths per level of a discrete column and plot them.

    Reads the notebook-level ``titanic`` DataFrame. Rows whose ``Survived`` is
    missing match neither 0 nor 1 and therefore contribute to neither count.

    Parameters
    ----------
    discrete_var : str
        Name of the column in ``titanic`` whose unique non-null values are
        tallied.
    x_tick_rotation : int or float, default 0
        Rotation (degrees) applied to the x tick labels.

    Returns
    -------
    pandas.DataFrame
        One row per unique value with columns ``value``, ``died``,
        ``survived`` and ``survived_perc`` (survived / (died + survived),
        i.e. a fraction), sorted ascending by ``survived_perc``.
    """
    unique_vals = titanic[discrete_var].dropna().unique()

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for val in unique_vals:
        outcomes = titanic.loc[titanic[discrete_var] == val, 'Survived']
        records.append({'value': val,
                        'died': int((outcomes == 0).sum()),
                        'survived': int((outcomes == 1).sum())})
    survival_df = pd.DataFrame(records, columns=['value', 'died', 'survived'])

    survival_df['survived_perc'] = survival_df['survived'] / (survival_df['died'] + survival_df['survived'])
    survival_df = survival_df.sort_values('survived_perc')

    # Grouped bar chart: one survived/died pair of bars per value, in sorted order.
    bar_width = 0.3
    surv_x_pos = np.arange(len(survival_df))
    died_x_pos = surv_x_pos + bar_width

    plt.bar(surv_x_pos, survival_df['survived'], width=bar_width, color='blue', edgecolor='black', label='survived')
    plt.bar(died_x_pos, survival_df['died'], width=bar_width, color='cyan', edgecolor='black', label='died')

    # Centre each tick label between its pair of bars.
    plt.xticks(surv_x_pos + bar_width / 2, survival_df['value'], rotation=x_tick_rotation)
    plt.ylabel('Number of passengers')
    plt.legend()
    plt.title(f'Survival rate of titanic passengers by {discrete_var}')
    # plt.show() returns None, so it is not wrapped in print().
    plt.show()

    return survival_df

In [None]:
 # Plot and print the survival breakdown for each discrete feature in turn.
 discrete_features = ['Pclass','Sex','SibSp','Parch','Ticket','Cabin','Embarked']
 for feature in discrete_features:
     print(count_survival(feature))

In [None]:
# Fit a one-variable logistic regression of Survived against each numerical
# column; each curve models how the probability of survival changes with it.
numeric_features = ('Pclass', 'Age', 'SibSp', 'Parch', 'Fare')
for feature in numeric_features:
    plt.figure()
    sns.regplot(data=titanic, x=feature, y="Survived", logistic=True)
    print(f"{feature}--------------------------------------")
    plt.show()

# Observation: the most dramatic increases are with Fare.

# 3) Imputation of missing data

In [None]:
## Have a look at missing values:
# 263 passengers with no age
# 1014 passengers with no cabin
# 2 with no embarkation location

# Also 418 with no survival data (these are the test-set rows).
titanic.isnull().sum()

In [None]:
# Fill missing Embarked values with the most frequent port (Southampton).
most_common_port = titanic['Embarked'].value_counts().idxmax()
titanic['Embarked'] = titanic['Embarked'].fillna(most_common_port)

In [None]:
# Missing cabins get the placeholder 'U' (unknown).
titanic['Cabin'] = titanic['Cabin'].fillna(value='U')

In [None]:
# Age has to be predicted where it is missing. The title embedded in Name can
# carry some age information, so extract it first: take the text between the
# first comma and the following full stop.
# NOTE(review): assumes every Name looks like "Surname, Title. Given names".
titanic['title'] = titanic.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())

# sns.factorplot was renamed to catplot and later removed from seaborn.
# catplot is figure-level (it creates its own figure), so the size is passed
# via height/aspect instead of a preceding plt.figure(figsize=(15, 8)), which
# only produced an extra empty figure.
ax = sns.catplot(x='title', y='Age', hue='Pclass', kind='point', data=titanic,
                 height=8, aspect=15 / 8)  # `ax` is a seaborn FacetGrid
ax.set_xticklabels(rotation=90)


In [None]:
# Impute missing ages with the median age of passengers sharing the same title.
#
# The previous version filled a filtered slice with fillna(inplace=True)
# (chained assignment, which silently does nothing under pandas copy-on-write)
# and rebuilt the frame with DataFrame.append (removed in pandas 2.0), which
# also regrouped the rows by title. This version fills the column in one
# vectorised step and keeps the original row order and index.
titanic2 = titanic.copy()  # snapshot of the frame before imputation
title_median_age = titanic2.groupby('title')['Age'].median()

# Report the median used for every title that has at least one missing age.
for t in titanic2.title.unique():
    if titanic2.loc[titanic2.title == t, 'Age'].isna().any():
        print(f"{t} has a median age of {title_median_age[t]}")

titanic = titanic2.assign(
    Age=titanic2['Age'].fillna(titanic2['title'].map(title_median_age))
)
    

In [None]:
# One third-class male passenger who embarked at Southampton has no Fare.
titanic.loc[titanic['Fare'].isna()]
# Educated guess: the median fare paid by passengers with the same profile.
same_profile = (
    (titanic['Pclass'] == 3)
    & (titanic['Sex'] == 'male')
    & (titanic['Embarked'] == 'S')
)
expected_fare = titanic.loc[same_profile, 'Fare'].median()
titanic['Fare'] = titanic['Fare'].fillna(expected_fare)


In [None]:
# Check that the NAs have been filled (count of nulls per column).
titanic.isnull().sum()

# 4) Feature Engineering

In [None]:
# Cabin holds information on where in the ship a passenger may have been
# located, but only its first letter is really of interest.
titanic['Cabin'] = titanic['Cabin'].str[0]  # keep just the first character

count_survival('Cabin')

In [None]:
# Ticket has MANY unique values; the most frequent one covers only 0.7% of rows.
titanic['Ticket'].value_counts(normalize=True)
# So the Ticket column is removed.
titanic = titanic.drop('Ticket', axis=1)

In [None]:
# Peek at the frame; only the last expression (the row count) is displayed.
titanic.head()

len(titanic)

In [None]:
# List the distinct titles present in the data.
titanic.title.unique()

In [None]:
# The title feature was engineered earlier for age imputation; now look at
# how survival varies across titles.
count_survival('title', x_tick_rotation=90)

In [None]:

# Titles listed under the broader group they belong to.
_titles_by_group = {
    "Military": ("Capt", "Col", "Major"),
    "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
    "Dr": ("Dr",),
    "Rev": ("Rev",),
    "common": ("Mme", "Mlle", "Ms", "Mr", "Mrs", "Miss", "Master"),
}

# Invert into a title -> group lookup suitable for Series.map.
title_groups = {
    title: group
    for group, members in _titles_by_group.items()
    for title in members
}

titanic['title_group'] = titanic['title'].map(title_groups)
count_survival('title_group', x_tick_rotation=90)

In [None]:
# Bucket Age into categorical groups; pd.cut intervals are right-closed, so
# the buckets are (0, 3], (3, 17], (17, 60] and (60, 99].
age_bin_edges = [0, 3, 17, 60, 99]
age_bin_labels = ['baby', 'child', 'adult', 'elderly']
titanic['age_group'] = pd.cut(titanic['Age'], bins=age_bin_edges, labels=age_bin_labels)

count_survival('age_group')

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
titanic['family_size'] = titanic['SibSp'] + titanic['Parch'] + 1

In [None]:
# Strangely, family size doesn't have much effect overall, so check how the
# relationship looks within each age group separately.
for group_label in titanic['age_group'].dropna().unique():
    group_rows = titanic.loc[titanic['age_group'] == group_label]
    plt.figure()
    sns.regplot(data=group_rows, x='family_size', y="Survived", logistic=True)
    print(f"Effect of family size on survival of {group_label}--------------------------------------")
    plt.show()

In [None]:
# Combined feature "<Pclass>class_<age_group>".
# NOTE(review): despite the column name, this combines passenger class with
# age group, not gender; the name is kept because later cells reference it.
pclass_text = titanic["Pclass"].astype(str)
titanic['gender_agegroup'] = pclass_text.str.cat(titanic['age_group'], sep="class_")
count_survival('gender_agegroup', x_tick_rotation=90)

In [None]:
# Combined feature "<Pclass>class_<Sex>", e.g. "3class_male".
pclass_text = titanic["Pclass"].astype(str)
titanic['gender_class'] = pclass_text.str.cat(titanic['Sex'], sep="class_")
count_survival('gender_class', x_tick_rotation=45)

# 5) ML preprocessing

In [None]:
# Name is no longer needed as a model input (its title was already extracted).
ml_input = titanic.drop('Name', axis=1)
ml_input.head(10)

In [None]:
# Split the features into categorical (one-hot encoded below) and numerical.
cat_cols = [
    'Cabin', 'Embarked', 'title_group', 'age_group',
    'gender_agegroup', 'gender_class', 'title',
]
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'family_size']

# Encode gender as a binary integer.
sex_codes = {"male": 0, "female": 1}
ml_input['Sex'] = ml_input['Sex'].map(sex_codes)

# One-hot encode every categorical column.
ml_input = pd.get_dummies(ml_input, columns=cat_cols)
ml_input.head()

In [None]:
# Standardise the numerical features (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform learns the per-column statistics and applies them in one call.
ml_input[num_cols] = scaler.fit_transform(ml_input[num_cols])

ml_input.head()



In [None]:
# Rows with a known Survived label form the training set; rows without one
# are the passengers whose outcome has to be predicted.
# The mask is taken from ml_input itself rather than from `titanic`, so the
# selection cannot drift if the two frames ever stop sharing an index.
train_data = ml_input.dropna(subset=['Survived'])
validate_data = ml_input[ml_input['Survived'].isna()]

In [None]:
# No separate train/test split is made because there are few data points;
# cross-validation is used instead.
non_feature_cols = ['Survived', 'PassengerId']
X = train_data.drop(non_feature_cols, axis=1)
y = train_data['Survived']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

# Hyper-parameter grid explored previously (kept for reference, not run):
#   max_depth         = range(9, 14)
#   min_samples_split = range(4, 11)
#   min_samples_leaf  = range(2, 5)
#   n_estimators      = [10]
#
# Only non-default settings are spelled out. max_features='auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers it meant
# 'sqrt', which is used explicitly here. random_state is fixed so the CV
# scores and the fitted forest are reproducible between runs.
rf = RandomForestClassifier(
    n_estimators=750,
    max_depth=13,
    min_samples_split=8,
    min_samples_leaf=4,
    min_impurity_decrease=1e-07,
    max_features='sqrt',
    n_jobs=1,
    random_state=1,
)
cv_scores = cross_val_score(rf, X, y, cv=10)
print(cv_scores.mean())
print(cv_scores)
# cross_val_score fits clones, so fit rf itself on all the training data.
rf.fit(X, y)


In [None]:
# Horizontal bar chart of the (up to) 40 most important random-forest features.
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)      # ascending importance
top_vars = sorted_idx[::-1][:40]          # indices of the largest 40, descending
plt.figure(figsize=(8, 10))
plt.barh(X.columns[top_vars], importances[top_vars])
plt.xlabel("Random Forest Feature Importance")

In [None]:
from xgboost import XGBClassifier

# Cross-validate the XGBoost classifier. The original passed `rf` here, so it
# re-scored the random forest and never evaluated `xgc`.
xgc = XGBClassifier()
cv_scores = cross_val_score(xgc, X, y, cv=10)
print(cv_scores.mean())
print(cv_scores)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier

# Base learners for a hard-voting (majority vote) ensemble.
clf1 = LogisticRegression(random_state=1)
# Same forest settings as the stand-alone model. max_features='auto' was
# removed in scikit-learn 1.3 (it meant 'sqrt' for classifiers), and
# random_state is fixed for reproducibility.
clf2 = RandomForestClassifier(
    n_estimators=750,
    max_depth=13,
    min_samples_split=8,
    min_samples_leaf=4,
    min_impurity_decrease=1e-07,
    max_features='sqrt',
    n_jobs=1,
    random_state=1,
)
clf3 = SVC(kernel='rbf', probability=True)
clf4 = XGBClassifier()
clf5 = CatBoostClassifier(verbose=False)

eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('svc', clf3),
        ('xgc', clf4),
        ('cat', clf5),
    ],
    voting='hard',
)

# Only the SVC kernel is tuned; each candidate refits the whole ensemble
# on every one of the 8 CV folds.
params = {'svc__kernel': ['poly', 'linear', 'rbf', 'sigmoid']}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=8)

grid = grid.fit(X, y)

print("Best score: {}".format(grid.best_score_))



In [None]:
# Use the tuned ensemble classifier to predict whether the unlabelled
# passengers survive; columns are selected to match the training features.
predictions = grid.predict(validate_data[X.columns])


In [None]:
# Assemble the submission table (passenger id + integer prediction) and save it.
submission_columns = {
    'PassengerId': validate_data['PassengerId'],
    'Survived': predictions.astype(int),
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)