## Section 1 - Data Exploration

The first step is to import needed libraries.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import string # library used to deal with text data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library
pd.set_option('display.max_columns', 100) # Setting pandas to display a N number of columns
pd.set_option('display.max_rows', 10) # Setting pandas to display a N number rows
pd.set_option('display.width', 100) # Setting pandas dataframe display width to N

from scipy import stats # statistical library
from statsmodels.stats.weightstats import ztest # statistical library for hypothesis testing
import plotly.graph_objs as go # interactive plotting library
import plotly.express as px # interactive plotting library
from itertools import cycle # used for cycling colors at plotly graphs
import matplotlib.pyplot as plt # plotting library
import pandas_profiling # library for automatic EDA

# installing and importing autoviz, another library for automatic data visualization
from autoviz.AutoViz_Class import AutoViz_Class
from IPython.display import display # display from IPython.display
from itertools import cycle # function used for cycling over values

# installing ppscore, library used to check non-linear relationships between our variables
import ppscore as pps # importing ppscore

%matplotlib inline

ImportError: cannot import name 'DataError' from 'pandas.core.base' (C:\Users\sky\.conda\envs\python\lib\site-packages\pandas\core\base.py)

In [None]:
# Input data files are available in the read-only "data/input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
print("")
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('data/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Check : os.walk
for files in os.walk('data'):
    print(files)

In [None]:
# Importing the data and displaying some rows
df = pd.read_csv("data/input/train.csv")
display(df.head()) # df.head()

##### pandas_profiling

In [None]:
# The pandas profiling library is really useful on helping us 
# understand the data we're working on.
# It saves us some precious time on the EDA process.
report = pandas_profiling.ProfileReport(df)

In [None]:
# Let's now visualize the report generated by pandas_profiling.
# display(report)
# Also, there is an option to generate an .HTML file containing all the information generated by the report.
report.to_file(output_file='data/Profiling_Report.html')

##### AutoViz

In [None]:
# Another great library for automatic EDA is AutoViz.
# With this library, several plots are generated with only 1 line of code.
# When combined with pandas_profiling, we obtain lots of information in a
# matter of seconds, using less than 5 lines of code.
AV = AutoViz_Class()

# Let's now visualize the plots generated by AutoViz.
report_2 = AV.AutoViz(filename="data/input/train.csv", verbose=2, save_plot_dir="data/")

### More Exploration
First, let's take a look at the differences between the ages of both groups, using a Violin plot.

In [None]:
# Splitting the passengers into two frames according to the 'Survived' flag
# (1 -> df_survivors, 0 -> df_nonsurvivors).
survived_flag = df['Survived']
df_survivors = df[survived_flag == 1]
df_nonsurvivors = df[survived_flag == 0]

# Showing the first rows of each group, survivors first.
for outcome_group in (df_survivors, df_nonsurvivors):
    display(outcome_group.head())

##### plotly.graph_objs -> go

In [None]:
# Age distribution of each outcome group, drawn with plotly.graph_objs (go).
def _age_violin(group_df, label, color):
    """Build one Age violin (with its inner box plot visible) for a passenger group."""
    return go.Violin(
        x=group_df['Survived'],
        y=group_df['Age'],
        name=label,
        marker_color=color,
        box_visible=True,
    )


violin_survivors = _age_violin(df_survivors, 'Survivors', 'forestgreen')
violin_nonsurvivors = _age_violin(df_nonsurvivors, 'Non-Survivors', 'darkred')

# Trace order: non-survivors first, then survivors.
data = [violin_nonsurvivors, violin_survivors]

# Layout: fully transparent backgrounds (alpha 0), plus title and axis captions.
layout = go.Layout(
    title='"Age" of survivors vs Ages of non-survivors',
    xaxis={'title': 'Survived or not'},
    yaxis={'title': 'Age'},
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# First distribution for the hypothesis test: Ages of survivors
dist_a = df_survivors['Age'].dropna()

# Second distribution for the hypothesis test: Ages of non-survivors
dist_b = df_nonsurvivors['Age'].dropna()

In [None]:
# Are the mean ages of survivors (dist_a) and non-survivors (dist_b)
# statistically different? Two tests are run on the same pair of samples.

# Z-test (statsmodels): yields the test statistic and its p-value.
t_stat, p_value = ztest(dist_a, dist_b)
print("----- Z Test Results -----")
print("Test stat. =", t_stat)
print("P value =", p_value)  # author's note: the p-value is below 0.05

print("")

# T-test (scipy, independent samples): same comparison.
t_stat_2, p_value_2 = stats.ttest_ind(dist_a, dist_b)
print("----- T Test Results -----")
print("Test stat. =", t_stat_2)
print("P value =", p_value_2)  # author's note: the p-value is below 0.05

#### "Gender" percentage from Survivors vs non-Survivors

In [None]:
# Check 
df_survivors['Gender'].value_counts()

In [None]:
# Number of survivors per Gender value, as a two-column table (Gender, count).
df_survivors_Gender = (
    df_survivors['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='count')
)

# Number of non-survivors per Gender value, in the same layout.
df_nonsurvivors_Gender = (
    df_nonsurvivors['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='count')
)

In [None]:
display(df_survivors_Gender)

In [None]:
# Two donut charts side by side: Gender share among survivors (left half of the
# figure) and among non-survivors (right half).
def _gender_donut(counts_df, label, x_span, slice_colors):
    """Build one donut trace from a (Gender, count) table.

    slice_colors are applied to the rows of counts_df in their current order.
    """
    return go.Pie(
        labels=counts_df['Gender'],
        values=counts_df['count'],
        domain=dict(x=x_span),
        name=label,
        hole=0.5,
        marker=dict(colors=slice_colors, line=dict(color='#000000', width=2)),
    )


pie_survivors_Gender = _gender_donut(
    df_survivors_Gender, 'Survivors', [0, 0.5], ['violet', 'cornflowerblue'])
pie_nonsurvivors_Gender = _gender_donut(
    df_nonsurvivors_Gender, 'non-Survivors', [0.5, 1.0], ['cornflowerblue', 'violet'])

data = [pie_survivors_Gender, pie_nonsurvivors_Gender]

# Layout: transparent backgrounds, title, and one caption in the hole of each donut.
layout = go.Layout(
    title='"Gender" percentage from Survivors vs non-Survivors',
    annotations=[
        dict(text=caption, x=x_pos, y=0.5, font_size=15, showarrow=False)
        for caption, x_pos in (('Survivors', 0.18), ('Non-Survivors', 0.85))
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

#### "Pclass" percentage from Survivors vs non-Survivors

In [None]:
# Number of survivors per Pclass value, as a two-column table (Pclass, count).
df_survivors_pclass = (
    df_survivors['Pclass']
    .value_counts()
    .rename_axis('Pclass')
    .reset_index(name='count')
)

# Number of non-survivors per Pclass value, in the same layout.
df_nonsurvivors_pclass = (
    df_nonsurvivors['Pclass']
    .value_counts()
    .rename_axis('Pclass')
    .reset_index(name='count')
)

In [None]:
# Two donut charts side by side: Pclass share among survivors (left half of the
# figure) and among non-survivors (right half).
def _pclass_donut(counts_df, label, x_span, slice_colors):
    """Build one donut trace from a (Pclass, count) table.

    slice_colors are applied to the rows of counts_df in their current order.
    """
    return go.Pie(
        labels=counts_df['Pclass'],
        values=counts_df['count'],
        domain=dict(x=x_span),
        name=label,
        hole=0.5,
        marker=dict(colors=slice_colors, line=dict(color='#000000', width=2)),
    )


pie_survivors_pclass = _pclass_donut(
    df_survivors_pclass, 'Survivors', [0, 0.5],
    ['#636EFA', '#EF553B', '#00CC96'])
pie_nonsurvivors_pclass = _pclass_donut(
    df_nonsurvivors_pclass, 'non-Survivors', [0.5, 1.0],
    ['#EF553B', '#00CC96', '#636EFA'])

data = [pie_survivors_pclass, pie_nonsurvivors_pclass]

# Layout: transparent backgrounds, title, and one caption in the hole of each donut.
layout = go.Layout(
    title='"Pclass" percentage from Survivors vs non-Survivors',
    annotations=[
        dict(text=caption, x=x_pos, y=0.5, font_size=15, showarrow=False)
        for caption, x_pos in (('Survivors', 0.18), ('Non-Survivors', 0.85))
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

#### "Fare" value of survivors vs "Fare" value of non-survivors

In [None]:
# Comparing the Fare distribution of survivors and non-survivors with one box
# per group; the Fare values are placed on the x axis.
def _fare_box(group_df, label, color):
    """Build one Fare box trace for a passenger group."""
    return go.Box(x=group_df['Fare'], name=label, marker=dict(color=color))


fare_survivors_box = _fare_box(df_survivors, 'Survivors', 'navy')
fare_nonsurvivors_box = _fare_box(df_nonsurvivors, 'Non-Survivors', 'steelblue')

# Trace order: non-survivors first, then survivors.
data = [fare_nonsurvivors_box, fare_survivors_box]

# Layout: transparent backgrounds, title and x-axis caption.
layout = go.Layout(
    title='"Fare" value of survivors vs "Fare" value of non-survivors',
    xaxis={'title': 'Fare distribution'},
    barmode='stack',
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Samples for the hypothesis tests: Fare of survivors (dist_c) and Fare of
# non-survivors (dist_d), with missing values removed.
dist_c = df_survivors['Fare'].dropna()
dist_d = df_nonsurvivors['Fare'].dropna()

# Z-test (statsmodels): are the mean fares of the two groups statistically different?
t_stat_3, p_value_3 = ztest(dist_c, dist_d)
print("----- Z Test Results -----")
print("T stat. =", t_stat_3)
print("P value =", p_value_3)  # author's note: the p-value is below 0.05

print("")

# T-test (scipy, independent samples): same comparison.
t_stat_4, p_value_4 = stats.ttest_ind(dist_c, dist_d)
print("----- T Test Results -----")
print("T stat. =", t_stat_4)
print("P value =", p_value_4)  # author's note: the p-value is below 0.05

#### PPS (Predictive Power Score)

In [None]:
# Long-format PPS table: one row per (x, y) column pair with its score.
pps_long = pps.matrix(df)[['x', 'y', 'ppscore']]

# Reshaping to a y-by-x grid and rounding the scores to two decimals.
matrix_df = pps_long.pivot(index='y', columns='x', values='ppscore').round(2)

# Annotated heatmap with the colour scale fixed to the [0, 1] range.
sns.heatmap(matrix_df, annot=True, cmap="Blues", vmin=0, vmax=1, linewidths=0.75)

##

## Section 2 - Supervised Learning: Classification

Let's dive into the modeling part.  
Import the libraries we're going to use.

In [None]:
import re
import collections
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from category_encoders import TargetEncoder, LeaveOneOutEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, precision_recall_curve, average_precision_score, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from xgboost import XGBClassifier, plot_importance as plot_importance_xgb
from lightgbm import LGBMClassifier, plot_importance as plot_importance_lgbm

#### Feature Engineering  
Create new features based on the original features of our dataset.  
https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial   

In [None]:
# NOTE: every label below is written with `df.loc[mask, column] = value`.
# The `df[column].loc[mask] = value` form is chained assignment: pandas warns
# about it and, with Copy-on-Write enabled, it no longer writes back to `df`.
# Rows that match none of the masks of a feature (e.g. missing Age) keep ''.

# Creating a categorical variable for Ages
df['AgeCat'] = ''
df.loc[df['Age'] < 18, 'AgeCat'] = 'young'
df.loc[(df['Age'] >= 18) & (df['Age'] < 56), 'AgeCat'] = 'mature'
df.loc[df['Age'] >= 56, 'AgeCat'] = 'senior'

# Creating a categorical variable for Family Sizes (derived from SibSp only)
df['FamilySize'] = ''
df.loc[df['SibSp'] <= 2, 'FamilySize'] = 'small'
df.loc[(df['SibSp'] > 2) & (df['SibSp'] <= 5), 'FamilySize'] = 'medium'
df.loc[df['SibSp'] > 5, 'FamilySize'] = 'large'

# Creating a categorical variable to tell if the passenger is alone
# (no siblings/spouses and no parents/children recorded)
relatives_aboard = df['SibSp'] + df['Parch']
df['IsAlone'] = ''
df.loc[relatives_aboard > 0, 'IsAlone'] = 'no'
df.loc[relatives_aboard == 0, 'IsAlone'] = 'yes'

# Creating a categorical variable to tell if the passenger is a Young/Mature/Senior male
# or a Young/Mature/Senior female.
# Fix: the 'mature' mask used to be `(Age > 21) & (Age) < 50`; the misplaced
# parenthesis compared the combined boolean mask (not Age) with 50, so the age
# window was not enforced and 'mature' overwrote the 'young' labels.
# The upper bound is inclusive so that Age == 50 is not left without a label.
age_bands = {
    'young': df['Age'] <= 21,
    'mature': (df['Age'] > 21) & (df['Age'] <= 50),
    'senior': df['Age'] > 50,
}
df['GenderCat'] = ''
for gender in ('male', 'female'):
    for band, in_band in age_bands.items():
        # Produces 'youngmale', 'maturemale', 'seniormale', 'youngfemale', ...
        df.loc[(df['Gender'] == gender) & in_band, 'GenderCat'] = band + gender

Next, we create a categorical variable for the passenger's title.  
The title is extracted from the "Name" feature: it is the text between the comma and the first period (e.g. "Mr", "Mrs", "Miss").  
Title is used as a feature after all female titles are grouped into a single category (the doctor, military, noble and clergy titles are grouped into another).  
We also create a column that tells whether the passenger is married.  
"Is_Married" is a binary feature based on the "Mrs" title, which has the highest survival rate among the female titles. 

In [None]:
# Check : Name Feature
df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [None]:
# Check : Ticket Frequency
df1 = df.groupby('Ticket')['Ticket'].transform('count')
#df1 = df.groupby('Ticket')['Ticket']
df1

In [None]:
# Title: the text between ", " and the first "." of the Name column.
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# Is_Married: 1 for the 'Mrs' title, 0 otherwise. It has to be computed here,
# before the titles are grouped below and 'Mrs' stops being a distinct value.
# Built in a single vectorised step instead of `df['Is_Married'].loc[...] = 1`,
# which is chained assignment (pandas warns about it, and it does not write
# back to `df` when Copy-on-Write is enabled).
df['Is_Married'] = (df['Title'] == 'Mrs').astype('int64')

# Grouping the female titles, and the doctor/military/noble/clergy titles.
df['Title'] = df['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df['Title'] = df['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')

# Creating "Ticket Frequency" Feature
# There are too many unique Ticket values to analyze, so each row gets the
# number of rows in this frame that share its Ticket value instead.
df['Ticket_Frequency'] = df.groupby('Ticket')['Ticket'].transform('count')

df.head(5)

After creating new features, we can drop useless columns that we won't use in the training process.

In [None]:
def get_feature_names(df):
    """Separate the target from the features and list the feature columns by kind.

    The frame is modified in place: 'PassengerId', 'Survived', 'Ticket', 'Name'
    and 'Cabin' are removed from ``df``.

    Args:
        df: DataFrame holding the 'Survived' target and the columns listed above.

    Returns:
        Tuple ``(target, categorical_columns, numeric_columns)``: the 'Survived'
        column taken before the drop, the names of the remaining object-dtype
        columns, and the names of all other remaining columns.
    """
    # Take the label column first; it is removed from df right below.
    target = df['Survived']

    # Columns that are not used as model features.
    columns_to_discard = ['PassengerId', 'Survived', 'Ticket', 'Name', 'Cabin']
    df.drop(columns=columns_to_discard, inplace=True)

    # object-dtype columns are treated as categorical, everything else as numeric.
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    numeric_columns = df.select_dtypes(exclude=['object']).columns.tolist()

    print("Categorical columns:\n", categorical_columns)
    print("\nNumeric columns:\n", numeric_columns)

    return target, categorical_columns, numeric_columns

target, categorical_columns, numeric_columns = get_feature_names(df)

#### Model training & Evaluation functions

After all the preprocessing, we are now ready to build and evaluate different Machine Learning models.  
First, let's create a function responsible for evaluating our classifiers on a test set we will create later.

In [None]:
# Function responsible for checking our model's performance on the test data
def testSetResultsClassifier(best_model_pipeline, x_test, y_test):
    """Score the best estimator of a fitted search object on held-out data.

    Args:
        best_model_pipeline: Fitted search object exposing ``best_estimator_``,
            which must support both ``predict`` and ``predict_proba``.
        x_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        List ``[accuracy, precision, recall, f1, average_precision, roc_auc]``.
        The same values, rounded to 3 decimals, are printed.
    """
    hard_labels = best_model_pipeline.best_estimator_.predict(x_test)

    # Score-based metrics use the predicted probability of class 1.
    positive_scores = best_model_pipeline.best_estimator_.predict_proba(x_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_scores)
    avg_precision = average_precision_score(y_test, positive_scores)

    # Label-based metrics use the hard predictions.
    accuracy = accuracy_score(y_test, hard_labels)
    precision = precision_score(y_test, hard_labels)
    recall = recall_score(y_test, hard_labels)
    f1 = f1_score(y_test, hard_labels)

    results = [accuracy, precision, recall, f1, avg_precision, roc_auc]

    print("\n\n#------- Test set results (Best Classifier) -------#\n")
    captions = (
        "Accuracy:",
        "Precision:",
        "Recall:",
        "F1-Score:",
        "Average Precision (Precision/Recall AUC):",
        "ROC_AUC:",
    )
    for caption, value in zip(captions, results):
        print(caption, round(value, 3))

    return results

#### Pipeline Construction

In [None]:
import random

# Now, we are going to create our Pipeline, fitting several different data preprocessing and modeling
# techniques inside a RandomizedSearchCV, to check which group of techniques performs best.

# Building a Pipeline inside RandomizedSearchCV, responsible for finding the best model and its parameters
def defineBestModelPipeline(df, target, numeric_columns, categorical_columns):
    """Search for the best preprocessing + classifier pipeline and return it with the data split.

    The data is split into train/test sets (test_size=0.10, random_state=42)
    before any transformation is fitted. A RandomizedSearchCV (n_iter=50) then
    samples candidates from three parameter grids (RandomForest, LightGBM and
    XGBoost classifiers), each combined with one of four ColumnTransformers
    (two numeric pipelines x two categorical pipelines). Candidates are scored
    on 5 stratified shuffle splits of the training data, and the candidate with
    the best mean accuracy is refitted on the whole training set.

    Args:
        df: Feature DataFrame; it must contain the columns named in
            ``numeric_columns`` and ``categorical_columns``.
        target: Labels, passed to ``train_test_split`` together with ``df``.
        numeric_columns: Names of the columns sent to the numeric pipelines.
        categorical_columns: Names of the columns sent to the categorical pipelines.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, best_model_pipeline)`` where
        ``best_model_pipeline`` is the fitted RandomizedSearchCV object.

    Raises:
        Any exception raised while fitting a candidate is propagated, because
        the search is created with ``error_score='raise'``.
    """
    
    # Splitting original data into Train and Test BEFORE applying transformations
    # Later in RandomizedSearchCV, x_train will be split into train/val sets
    # The transformations are going to be fitted specifically on the train set,
    # and then applied to both train/test sets. This way, information leakage is avoided!
    x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.10, random_state=42)
    
    
    # 1st -> #### ----------- Numeric Transformers ----------- ####
    
    # Here, we are creating two different data transformation pipelines 
    # to be applied in our numeric features
    numeric_transformer_1 = Pipeline(steps=[('imp', IterativeImputer(max_iter=30, random_state=42)),
                                            ('scaler', MinMaxScaler())])
    
    numeric_transformer_2 = Pipeline(steps=[('imp', SimpleImputer(strategy='mean')), # or strategy='median'
                                            ('scaler', StandardScaler())])

    
    # 2nd -> #### ----------- Categorical Transformers ----------- ####
    
    # We are going to encode categorical features using LeaveOneOutEncoder and TargetEncoder.
    # Note: TargetEncoder uses the mean target values (probabilities for classification and 
    # continuous values for regression) of each category inside a column to encode them.
    
    # LeaveOneOutEncoder is an alternative to TargetEncoder. It implements an improvement on TargetEncoder's
    # "overfitting behaviour", by leaving 1 of the observations out of the mean calculation for a specific category.
    # Read about it here: https://towardsdatascience.com/stop-one-hot-encoding-your-categorical-variables-bbb0fba89809
    # Documentation: https://contrib.scikit-learn.org/category_encoders/leaveoneout.html
    
    categorical_transformer_1 = Pipeline(steps=[('frequent', SimpleImputer(strategy='most_frequent')),
                                                ('leaveoneout', LeaveOneOutEncoder(sigma=0.1))])
    
    categorical_transformer_2 = Pipeline(steps=[('frequent', SimpleImputer(strategy='most_frequent')),
                                                ('targetencoder', TargetEncoder(min_samples_leaf=3, smoothing=2))])
    
    
    # 3rd -> #### ----------- Combining both numerical and categorical data pipelines ----------- ####
    
    # Here, we are creating four ColumnTransformers, one per numeric/categorical pipeline combination.
    # In each of them the numeric columns come first and the categorical columns second.
    data_transformations_1 = ColumnTransformer(transformers=[('num', numeric_transformer_1, numeric_columns),
                                                             ('cat', categorical_transformer_1, categorical_columns)])
    
    data_transformations_2 = ColumnTransformer(transformers=[('num', numeric_transformer_1, numeric_columns),
                                                             ('cat', categorical_transformer_2, categorical_columns)])
    
    data_transformations_3 = ColumnTransformer(transformers=[('num', numeric_transformer_2, numeric_columns),
                                                             ('cat', categorical_transformer_1, categorical_columns)])
    
    data_transformations_4 = ColumnTransformer(transformers=[('num', numeric_transformer_2, numeric_columns),
                                                             ('cat', categorical_transformer_2, categorical_columns)])
    
    
    # 4th -> #### ----------- Testing different data transformation steps and models inside RandomizedSearchCV ----------- ####
    
    # Finally, we are going to apply these different data transformations to RandomizedSearchCV,
    # trying to find the best imputing strategy, the best feature transformation strategy and the best model with its respective parameters.
    # Below, we just need to initialize a Pipeline object with any transformations we want, on each of the steps.
    # Both steps are placeholders: every grid below supplies its own 'data_transformations' and 'clf' values.

    pipe = Pipeline(steps=[('data_transformations', data_transformations_1), # Initializing data transformation step by choosing any of the above
                           ('clf', SVC())]) # Initializing modeling step with any model object
                           #memory='cache_folder') -> Used to optimize memory when needed
    
    # Now, we define the hyperparameter grids that will be used by RandomizedSearchCV. It will randomly choose options 
    # for each key inside the dictionaries ('data_transformations', 'clf' and clf's parameters). 
    # Then, for each chosen option, it will apply the transformations, train the chosen model and evaluate 
    # it on the validation part of each split of the cross validator we define. At the end of its iterations, RandomizedSearchCV will return some metrics, 
    # such as the best pipeline, model results for all iterations and more.
    
    params_grid = [   
                    {'data_transformations': [data_transformations_1, data_transformations_2,
                                              data_transformations_3, data_transformations_4],
                     'clf': [RandomForestClassifier()],
                     'clf__n_estimators': [int(x) for x in np.linspace(5, 30, num=15)],
                     'clf__max_features': [None, "sqrt", "log2"],
                     'clf__max_depth': [int(x) for x in np.linspace(3, 10, num=5)],
                     'clf__random_state': [int(x) for x in np.linspace(1, 49, num=30)]},
                        
                    {'data_transformations': [data_transformations_1, data_transformations_2,
                                              data_transformations_3, data_transformations_4],
                     'clf': [LGBMClassifier()],
                     'clf__n_estimators': [int(x) for x in np.linspace(3, 20, num=10)],
                     'clf__max_depth': [int(x) for x in np.linspace(2, 8, num=6)],
                     'clf__learning_rate': np.linspace(0.1, 0.7)},
        
                    {'data_transformations': [data_transformations_1, data_transformations_2,
                                              data_transformations_3, data_transformations_4],
                     'clf': [XGBClassifier()],
                     'clf__n_estimators': [int(x) for x in np.linspace(3, 15, num=10)],
                     'clf__eta': np.linspace(0.1, 0.9),
                     'clf__max_depth': [int(x) for x in np.linspace(2, 7, num=5)],
                     'clf__gamma': np.linspace(0.1, 1),
                     'clf__lambda': np.linspace(0.1, 1)},
                ]
    
    # Now, we fit a RandomizedSearchCV to search over the grids of parameters defined above.
    # All of these metrics are recorded for every candidate; 'accuracy' is the one used to pick the winner (see refit below).
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']
    
    # Creating our cross validator object with StratifiedShuffleSplit (5 random 80%/20% splits).
    # Stratification assures that we split the data such that the proportions
    # between classes are the same in each split as they are in the whole dataset
    cross_validator = StratifiedShuffleSplit(n_splits=5, train_size=0.8, test_size=0.2, random_state=7)
    
    # Creating the randomized search cv object and fitting it
    # error_score='raise' makes a failing candidate abort the search instead of being scored as NaN.
    best_model_pipeline = RandomizedSearchCV(estimator=pipe, param_distributions=params_grid, 
                                             n_iter=50, scoring=metrics, refit='accuracy', 
                                             n_jobs=-1, cv=cross_validator, random_state=21,
                                             error_score='raise', return_train_score=False)

    best_model_pipeline.fit(x_train, y_train)
    
    # At last, we check the final results
    # best_estimator_[0] is the 'data_transformations' step and best_estimator_[1] the 'clf' step.
    print("\n\n#--------- Best Data Pipeline found in RandomSearchCV ---------#\n\n", best_model_pipeline.best_estimator_[0])
    print("\n\n#--------- Best Classifier found in RandomSearchCV ---------#\n\n", best_model_pipeline.best_estimator_[1])
    print("\n\n#--------- Best Estimator's average Accuracy Score on CV (validation set) ---------#\n\n", best_model_pipeline.best_score_)
    
    return x_train, x_test, y_train, y_test, best_model_pipeline

In [None]:
# Calling the function above, returing train/test data and best model's pipeline
x_train, x_test, y_train, y_test, best_model_pipeline = defineBestModelPipeline(df, target, numeric_columns, categorical_columns)

# Checking best model's performance on test data
test_set_results = testSetResultsClassifier(best_model_pipeline, x_test, y_test)

Visual representation of the best pipeline found by RandomSearchCV

In [None]:
from sklearn import set_config
from sklearn.utils import estimator_html_repr

# Set config to 'diagram' so we can visualize pipelines/composite estimators
set_config(display='diagram')

# Visualization of the best estimator found by RandomSearchCV
best_model_pipeline.best_estimator_

In [None]:
best_model_pipeline.best_estimator_[0]

In [None]:
best_model_pipeline.best_estimator_[1]

#### Precision-Recall and ROC Curves  

Let's take a look at the Precision/Recall and ROC Curves of the best model in our separate test dataset.

In [None]:
# Transforming the test data with the fitted preprocessing step of the best pipeline.
# The result is stored under a new name so that `x_test` keeps holding the raw
# features: overwriting it meant this cell could not be re-run, and that any
# later use of `x_test` with the full pipeline received already-transformed data.
x_test_transformed = best_model_pipeline.best_estimator_[0].transform(x_test)

# Predicted probability of class 1, taken from the classifier step alone
test_probas_class1 = best_model_pipeline.best_estimator_[1].predict_proba(x_test_transformed)[:, 1]

# Calculating precision/recall threshold values for Probabilistic models
precision, recall, thresholds_prc = precision_recall_curve(y_test, test_probas_class1)
closest_to_025_prc = np.argmin(np.abs(thresholds_prc - 0.25))    # Getting information about the points in the graph that 
closest_to_default_prc = np.argmin(np.abs(thresholds_prc - 0.5)) # are closer to the default threshold for predict_proba (0.5),
closest_to_075_prc = np.argmin(np.abs(thresholds_prc - 0.75))    # threshold 0.25 and threshold 0.75.

# Plotting the curve (precision on the x axis, recall on the y axis)
plt.plot(precision, recall, label="Results, Precision Recall Curve")
plt.plot(precision[closest_to_025_prc], recall[closest_to_025_prc], 'v', c='k', # Plotting the marker for threshold 0.25
         markersize=10, label="Threshold 0.25", fillstyle="none", mew=2)       
plt.plot(precision[closest_to_default_prc], recall[closest_to_default_prc], 's', c='k', # Plotting the marker for threshold 0.5 (default)
         markersize=10, label="Default threshold (0.5)", fillstyle="none", mew=2)        
plt.plot(precision[closest_to_075_prc], recall[closest_to_075_prc], '^', c='k', # Plotting the marker for threshold 0.75 
         markersize=10, label="Threshold 0.75", fillstyle="none", mew=2)
plt.title("Precision-Recall Curve for the best model found")
plt.xlabel("Precision: TP / (TP + FP)")
plt.ylabel("Recall: TP / (TP + FN)")
plt.legend(loc="best")

In [None]:
# Visualizing all results and metrics, from all models, obtained by the RandomSearchCV steps
df_results = pd.DataFrame(best_model_pipeline.cv_results_)

#pd.set_option('display.max_colwidth', None)
display(df_results.head(3))

In [None]:
# Visualizing all results and metrics obtained only by the best classifier, considering Accuracy score
display(df_results[df_results['rank_test_accuracy'] == 1])

In [None]:
# Visualizing all results and metrics obtained only by the best classifier, considering ROC_AUC score
display(df_results[df_results['rank_test_roc_auc'] == 1])

#### Plotting Feature Importances

In [None]:
feature_names_in_order = numeric_columns + categorical_columns
print(feature_names_in_order)

In [None]:
##### Plotting Feature Importances for Random Forests & XGBoost #####
feat_importances = pd.Series(best_model_pipeline.best_estimator_.named_steps['clf'].feature_importances_, 
                             index=feature_names_in_order)
feat_importances.nlargest(10).plot(kind='barh')

#### Prediction

In [None]:
# Importing the test data
# NOTE(review): the cell right after this one repeats the same preparation
# starting from a fresh read of test.csv; one of the two cells can be removed.
df_test = pd.read_csv("data/input/test.csv")

# NOTE: every label below is written with `df_test.loc[mask, column] = value`.
# The `df_test[column].loc[mask] = value` form is chained assignment: pandas
# warns about it and, with Copy-on-Write enabled, it no longer writes back to
# `df_test`. Rows that match none of the masks of a feature keep ''.

# Creating a categorical variable for Ages
df_test['AgeCat'] = ''
df_test.loc[df_test['Age'] < 18, 'AgeCat'] = 'young'
df_test.loc[(df_test['Age'] >= 18) & (df_test['Age'] < 56), 'AgeCat'] = 'mature'
df_test.loc[df_test['Age'] >= 56, 'AgeCat'] = 'senior'

# Creating a categorical variable for Family Sizes (derived from SibSp only)
df_test['FamilySize'] = ''
df_test.loc[df_test['SibSp'] <= 2, 'FamilySize'] = 'small'
df_test.loc[(df_test['SibSp'] > 2) & (df_test['SibSp'] <= 5), 'FamilySize'] = 'medium'
df_test.loc[df_test['SibSp'] > 5, 'FamilySize'] = 'large'

# Creating a categorical variable to tell if the passenger is alone
test_relatives_aboard = df_test['SibSp'] + df_test['Parch']
df_test['IsAlone'] = ''
df_test.loc[test_relatives_aboard > 0, 'IsAlone'] = 'no'
df_test.loc[test_relatives_aboard == 0, 'IsAlone'] = 'yes'

# Creating a categorical variable to tell if the passenger is a Young/Mature/Senior male or a Young/Mature/Senior female.
# Fix: the 'mature' mask used to be `(Age > 21) & (Age) < 50`; the misplaced
# parenthesis compared the combined boolean mask (not Age) with 50, so the age
# window was not enforced and 'mature' overwrote the 'young' labels.
# The upper bound is inclusive so that Age == 50 is not left without a label.
test_age_bands = {
    'young': df_test['Age'] <= 21,
    'mature': (df_test['Age'] > 21) & (df_test['Age'] <= 50),
    'senior': df_test['Age'] > 50,
}
df_test['GenderCat'] = ''
for gender in ('male', 'female'):
    for band, in_band in test_age_bands.items():
        # Produces 'youngmale', 'maturemale', 'seniormale', 'youngfemale', ...
        df_test.loc[(df_test['Gender'] == gender) & in_band, 'GenderCat'] = band + gender

In [None]:
# Importing the test data
df_test = pd.read_csv("data/input/test.csv")

# NOTE: every label below is written with `df_test.loc[mask, column] = value`.
# The `df_test[column].loc[mask] = value` form is chained assignment: pandas
# warns about it and, with Copy-on-Write enabled, it no longer writes back to
# `df_test`. Rows that match none of the masks of a feature keep ''.

# Creating a categorical variable for Ages
df_test['AgeCat'] = ''
df_test.loc[df_test['Age'] < 18, 'AgeCat'] = 'young'
df_test.loc[(df_test['Age'] >= 18) & (df_test['Age'] < 56), 'AgeCat'] = 'mature'
df_test.loc[df_test['Age'] >= 56, 'AgeCat'] = 'senior'

# Creating a categorical variable for Family Sizes (derived from SibSp only)
df_test['FamilySize'] = ''
df_test.loc[df_test['SibSp'] <= 2, 'FamilySize'] = 'small'
df_test.loc[(df_test['SibSp'] > 2) & (df_test['SibSp'] <= 5), 'FamilySize'] = 'medium'
df_test.loc[df_test['SibSp'] > 5, 'FamilySize'] = 'large'

# Creating a categorical variable to tell if the passenger is alone
test_relatives_aboard = df_test['SibSp'] + df_test['Parch']
df_test['IsAlone'] = ''
df_test.loc[test_relatives_aboard > 0, 'IsAlone'] = 'no'
df_test.loc[test_relatives_aboard == 0, 'IsAlone'] = 'yes'

# Creating a categorical variable to tell if the passenger is a Young/Mature/Senior male or a Young/Mature/Senior female.
# Fix: the 'mature' mask used to be `(Age > 21) & (Age) < 50`; the misplaced
# parenthesis compared the combined boolean mask (not Age) with 50, so the age
# window was not enforced and 'mature' overwrote the 'young' labels.
# The upper bound is inclusive so that Age == 50 is not left without a label.
test_age_bands = {
    'young': df_test['Age'] <= 21,
    'mature': (df_test['Age'] > 21) & (df_test['Age'] <= 50),
    'senior': df_test['Age'] > 50,
}
df_test['GenderCat'] = ''
for gender in ('male', 'female'):
    for band, in_band in test_age_bands.items():
        # Produces 'youngmale', 'maturemale', 'seniormale', 'youngfemale', ...
        df_test.loc[(df_test['Gender'] == gender) & in_band, 'GenderCat'] = band + gender

In [None]:
# Creating a categorical variable for the passenger's title.
# Title: the text between ", " and the first "." of the Name column.
df_test['Title'] = df_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# Is_Married: 1 for the 'Mrs' title, 0 otherwise. It has to be computed here,
# before the titles are grouped below and 'Mrs' stops being a distinct value.
# Built in a single vectorised step instead of `df_test['Is_Married'].loc[...] = 1`,
# which is chained assignment (pandas warns about it, and it does not write
# back to `df_test` when Copy-on-Write is enabled).
df_test['Is_Married'] = (df_test['Title'] == 'Mrs').astype('int64')

# Grouping the female titles, and the doctor/military/noble/clergy titles.
df_test['Title'] = df_test['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df_test['Title'] = df_test['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')

# Creating "Ticket Frequency" Feature
# There are too many unique Ticket values to analyze, so each row gets the
# number of rows in this frame that share its Ticket value instead.
df_test['Ticket_Frequency'] = df_test.groupby('Ticket')['Ticket'].transform('count')

# Dropping unnecessary columns
df_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [None]:
# Applying best_model_pipeline
# Step 1 -> Transforming data the same way we did in the training set;
# Step 2 -> making predictions using the best model obtained by RandomSearchCV.
test_predictions = best_model_pipeline.best_estimator_.predict(df_test)

print(test_predictions)

In [None]:
# Building the table that is going to be submitted to the competition,
# starting from a fresh read of the test file.
df_submission = pd.read_csv("data/input/test.csv")

# Attaching the predicted values as a new column
df_submission['Survived'] = test_predictions

# Keeping only the two columns needed for the submission, in their current order
submission_columns = ['PassengerId', 'Survived']
df_submission = df_submission.loc[:, df_submission.columns.isin(submission_columns)]

df_submission.head(10)

In [None]:
# Checking if the number of rows is OK (the file is expected to have 418 rows)
df_submission.count()

In [None]:
# Writing submitions to CSV file
df_submission.to_csv('submission.csv', index=False)