# Library Imports + Basic Setup

In [1]:
# std python utility
from pathlib import Path
from collections import Counter
import os
import re
# ds utility
import pandas
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import sklearn
# ds modelling
# import sklearn

import warnings
# Silences every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings emitted by pandas/seaborn calls further down.
warnings.filterwarnings(action="ignore")
# Raise the pandas display limits so long sequences and frames are not truncated.
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
# None removes the cap on the number of columns shown.
pd.set_option('display.max_columns', None)

In [2]:
# directory setup
# All paths are resolved relative to the directory the notebook is launched from.
ROOT = Path.cwd()
DATA_DIR = ROOT / "input"
PLOT_DIR = ROOT / "plots"
# Later cells save figures into PLOT_DIR; create it up front so that savefig
# does not fail on a fresh checkout where the folder does not exist yet.
PLOT_DIR.mkdir(parents=True, exist_ok=True)

display(ROOT, DATA_DIR)
os.listdir(DATA_DIR)

PosixPath('/home/hanz/github/kaggle-titanic')

PosixPath('/home/hanz/github/kaggle-titanic/input')

['train.csv', 'test.csv', 'gender_submission.csv']

# Data Import and Overview

In [3]:
# Load datasets
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
# fix common lgbm error with not supporting JSON chars in feature name
# reference: https://stackoverflow.com/questions/60582050/lightgbmerror-do-not-support-special-json-characters-in-feature-name-the-same
# Every character outside [A-Za-z0-9_] is stripped from the column names.
train = train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

train_shape = train.shape
test_shape = test.shape
feat_cols = train.columns.to_list()
# surface level inspection of the training data loaded
display(train.head())
# display general info about the dataset
# (info() prints its report itself and returns None, so it must not be wrapped in display())
train.info()
# observe general stats of feat-cols
display(train.describe())
display("Train shape:", train_shape)
display("Test shape:", test_shape)
display(feat_cols)
print("Number of features:", len(feat_cols))

NameError: name 're' is not defined

In [None]:
# Share of missing values per column, expressed on the range [0, 1]:
# count the NaN cells of each column and divide by the number of rows.
train.isna().sum() / len(train)

In [None]:
# Age of every passenger against the survival label
sns.scatterplot(data=train, x="Survived", y="Age")

## Exploration of Covariance of 2 Discrete Random Variables

### Context

### Calculations

In [None]:
# Survival label against the (string-valued) Sex column
sns.scatterplot(data=train, x="Sex", y="Survived")

In [None]:
# Numeric encoding of Sex stored as floats: male -> 1, female -> 0
# (any other value is left as NaN).
train["sex_encode"] = train["Sex"].map({"male": 1.0, "female": 0.0})
train.sex_encode[:2]

In [None]:
# 2x2 covariance matrix of the encoded sex and the survival label
sex_survived_cov = np.cov(m=train["sex_encode"], y=train["Survived"])
sex_survived_cov

In [None]:
# 2x2 Pearson correlation matrix of the encoded sex and the survival label
sex_survived_corr = np.corrcoef(x=train["sex_encode"], y=train["Survived"])
sex_survived_corr

$Corr(X,Y) = \frac{Cov(X,Y)}{\sigma_X \cdot \sigma_Y}$

In [None]:
# Reproduce Corr(X, Y) = Cov(X, Y) / (sigma_X * sigma_Y) by hand.
# The numerator has to be the covariance matrix (not the correlation matrix).
# The standard deviations are read off the diagonal of that same matrix so that
# numerator and denominator share np.cov's normalisation.
sigmas = np.sqrt(np.diag(sex_survived_cov))
# Dividing by the outer product scales entry (i, j) by sigma_i * sigma_j,
# which yields the full correlation matrix (ones on the diagonal).
sex_survived_corr_calc = sex_survived_cov / np.outer(sigmas, sigmas)
sex_survived_corr_calc

In [None]:
# Rescale every column of the matrix by that column's maximum entry
sex_survived_corr_calc_norm = np.divide(
    sex_survived_corr_calc,
    sex_survived_corr_calc.max(axis=0),
)
sex_survived_corr_calc_norm

### Analysis

## Variable Distribution

In [None]:
# Stacked histogram of Age, split by survival outcome
sns.displot(data=train, x="Age", hue="Survived", multiple="stack", fill=True)

In [None]:
# Stacked kernel density estimate of Age, split by survival outcome
sns.displot(data=train, x="Age", hue="Survived", kind="kde", multiple="stack", fill=True)

In [None]:
# Bivariate histogram of Age against the survival label
sns.histplot(data=train, x="Age", y="Survived", stat="frequency")

In [None]:
# PassengerId only identifies the training instances and carries no signal for
# the classification problem: keep a copy of the ids, then drop the column.
train_ids = train["PassengerId"]
train = train.drop(columns="PassengerId")
# refresh the feature list now that the id column is gone
feat_cols = list(train.columns)
display(feat_cols)
print("Number of features:", len(feat_cols))

In [None]:
# The sample submission shows the expected output format:
# a classification problem of predicting the target feat-col: "survived"
sample_submission = pd.read_csv(DATA_DIR.joinpath("gender_submission.csv"))
sample_submission.head()

# Data Description + Plan  of Action
| Variable | Definition | Key |
|----------|-----------|------|
|survival	|Survival|	0 = No, 1 = Yes|
|pclass	|Ticket class|	1 = 1st, 2 = 2nd, 3 = 3rd|
|sex	|Sex	||
|age|	Age in years	||
|sibsp|	# of siblings / spouses aboard the Titanic	||
|parch	|# of parents / children aboard the Titanic	||
|ticket	|Ticket number	||
|fare|	Passenger fare	||
|cabin|	Cabin number	||
|embarked|	Port of Embarkation	|C = Cherbourg, Q = Queenstown, S = Southampton|

## Variable Notes
pclass: A proxy for socio-economic status (SES)<br>
1st = Upper <br>
2nd = Middle<br>
3rd = Lower<br>

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5<br>

sibsp: The dataset defines family relations in this way...<br>
Sibling = brother, sister, stepbrother, stepsister<br>
Spouse = husband, wife (mistresses and fiancés were ignored)<br>

parch: The dataset defines family relations in this way...<br>
Parent = mother, father<br>
Child = daughter, son, stepdaughter, stepson<br>
Some children travelled only with a nanny, therefore parch=0 for them.<br>


## Plan of Action
Objective: Predict the target variable "survived" using the classification problem's data set "train.csv" provided.

I. Self-Attempt  
  1. Inspect in further detail of the given features via plots  
  2. Evaluate methods of plotting and select appropriate method for the data type  
  3. Look out for anomalies such as: non-Gaussian distros, outliers  
  4. Observe variables' dependencies/relations using covar matrix -- a "heat map"  
  5. Understand data types to be worked with  
  
II. Review Initial Attempt and Identify Notebooks for Reference  

III. Assisted-Attempt using Reference Notebooks    
  1. Apply ideas in notebook
  2. Review  
    * Justifications for choices made  
    * Tools used  

## Feature vs Target Feature Plots

In [None]:
# attempt to plot target variable
f, ax = plt.subplots(figsize=(8, 7))
# distplot is deprecated in seaborn; histplot with kde=True and stat="density"
# is its documented replacement (density-normalised histogram + KDE curve).
sns.histplot(train['Survived'], kde=True, stat="density", color="b", ax=ax)
ax.set(ylabel="Frequency", xlabel="Survived", title="Survival distribution")
# sns.despine(trim=True, left=True)
# keep only the horizontal grid lines
plt.grid(True)
ax.xaxis.grid(False)

plt.show()
print("Skewness: %f" % train['Survived'].skew())
print("Kurtosis: %f" % train['Survived'].kurt())

The chosen distribution plot method does not appear to display the data in a manner that is meaningful -- perhaps there isn't much to be drawn from the individual target variable itself outside of the fact that more people died than those that survived. Alternative plot methods could be either a scatter plot or a box plot. The double distribution curves are expected given the discrete nature of the variable ( 0 or 1 ie dead or alive)

In [None]:
# Collect every column whose dtype is one of the numeric dtypes listed below.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = [col for col in train.columns if train[col].dtype in numeric_dtypes]

# visualising some more outliers in the data values
# One scatter panel per numeric feature on a 3-column grid. The grid is created
# once and each panel is drawn on its own axes (previously a 1x2 grid was
# created and then overlaid with a different plt.subplot layout).
n_cols = 3
n_rows = max(1, -(-len(numeric) // n_cols))  # ceiling division without importing math
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols,
                        figsize=(6 * n_cols, 5 * n_rows), squeeze=False)
panels = list(axs.flat)
for ax, feature in zip(panels, numeric):
    sns.scatterplot(x=feature, y='Survived', hue='Survived', palette='Blues',
                    data=train, ax=ax)
    ax.set_xlabel('{}'.format(feature), size=15, labelpad=0)
    ax.set_ylabel('Survived', size=15, labelpad=14)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.legend(loc='best', prop={'size': 10})
# hide the grid slots that have no feature assigned
for ax in panels[len(numeric):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()


Not sure what conclusions I can draw given these plots. Perhaps a boxplot

In [None]:
# data = pd.concat([train['Survived'], train['Age']], axis=1)
# f, ax = plt.subplots(figsize=(8,8))
# fig = sns.boxplot(x=train['Age'], y="Survived", data=data)
# plt.xticks(rotation=90)
# # fig.axis(ymin=train['Survived'].min(), ymax=train['Survived'].max());
# plt.show()


Still uncertain about what to make of the data. The rationale for plotting these particular features were based upon a linear regression problem in which numeric datatypes were plotted. In this particular case, it's clear that plotting numerics against a label does not show all too much. Perhaps a distribution plot applied to all features would be more useful?

In [None]:

# # visualising some more outliers in the data values
# fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(12,12))
# plt.subplots_adjust(right=2)
# plt.subplots_adjust(top=2)
# sns.color_palette("husl", 8)
# for i, feature in enumerate(list(train[feat_cols]), 1):
#     plt.subplot(len(list(numeric)), 3, i)
#     g = sns.distplot(train[feature], color="m", label="Skewness : %.2f"%(train["Fare"].skew()))
#     g = g.legend(loc="best")
# plt.show()


# Self Attempt
## Feature vs Target Feature Analysis
* pclass vs survived
    * there appears to be a relation between passenger socioeconomic class and their rate of survival
    * upper-class passengers are more inclined to survive in contrast to lower-class passengers
* sex vs survived
    * women were far more likely to survive in contrast to males
* age vs survived
    * there appears to be a bit of a relation; however, requires further investigation
* siblings & spouses VS survived
    * passengers who had a larger party (spouses or siblings) were less inclined to survive
    * the cut off point appears to be at 2 siblings & spouses at which there appears to be a large drop off in the survival rate
* parch (# parents/children) vs survived:
    * no parent or child case appears to potentially be an outlier -- not sure about the reasonings as to why
    * the case of 1/2/3 parch sizes appear to make sense in terms of the survival rate of the passenger
    * not sure what meaning to draw from the protruding line drawn on each bar
* ticket number & survived:
    * there may be some information here in regards to how passengers were seated upon the ship; however, given the uniqueness of ticket numbers, no direct conclusions can be made about the relation between a passenger's ticket number and their survival rate
* fare vs survived
    * there is likely a relation here based upon the observations of pclass vs survival rate
    * a closer inspection is needed
* cabin vs survived
    * it is possible that there may be some correlation between survival rate given a passengers cabin number
    * this correlation is motivated by an understanding of how ships hold multiple levels of cabins; thus, the passengers in the lowest-level cabins would likely not have been able to reach the rafts as soon as those on the upper levels
    * cabin and pclass may share a relation
    * cabin and fare may share a relation
* embarked vs survived
    * naturally inclined to think that the port of embarkation is likely to have little to no relation with survival rate
    * a case for a relation to exist would be that the location may represent a culture of alertness to dangers suggesting a better prepared passenger for moments of danger
    * what is the significance of the location of embarkation?
    * what is the reason for passengers who embarked at Cherbourg having a higher survival chance?
* survived vs survived
    * NA
* name vs survived
    * NA

## Feature Correlation Matrix

In [None]:
# Pearson correlation is only defined for numeric columns; select them
# explicitly, because recent pandas versions raise when corr() meets string
# columns such as Name or Sex instead of silently dropping them.
corr = train.select_dtypes(include="number").corr()
plt.subplots(figsize=(15,12))
sns.heatmap(corr, vmax=0.9, cmap="coolwarm", square=True, annot=True)


* We can see that there's a positive correlation between the training set's "Fare" and "Survived" rand variables
* "Parch" and "Fare" are very slightly positively correlated
* The rest of the features hold a slight negative correlation with "Survived"

## Feature Imputation
Does there exist any missing values? How many? How do we appropriately fill in for these missing values?

In [None]:
# Missing Features in Training Dataset
# Boolean flag per column: does it contain at least one NaN?
all_cols = train.isna().any()
na_cols = all_cols[all_cols].index.tolist()
display("All columns:", all_cols, "Missing cols:", na_cols)
print()
# one summary line per incomplete column: missing count, row count, percentage
for c in na_cols:
    total = len(train[c])
    na_count = train[c].isna().sum()
    na_percent = na_count / total * 100
    print("{:<10}: {:<5} / {:<5} instances missing --> {:1.5f}".format(c, na_count, total, na_percent))

How do we go about feature imputation?
* currently only observing the training instances; need to impute for the set of ALL instances, i.e. the train + test datasets
* based upon the data description, how would we create new features? consider 1 or more variables from which we can define a new feature
    - for example: denoting whether the passenger is travelling alone or not
* apply KNN algorithm?


## Prospective Ideas
* Would we ever use PCA here? 
    - Considering that the dataset is extremely small in terms of the number of dimensions, there would be very little reason for applying a technique to reduce dimensionality
    - PCA may actually hurt the performance of the model as a result because there is already very few features in the dataset
    - Worth exploring to provide concreteness/assessing these claims
* Would it be effective to use a neural network for this problem? Why or why not? What would that look like?
    - Reference: https://www.kaggle.com/liyenhsu/titanic-neural-network#Ticket
* What does an approach using R look like? What are the tools involved? How do these tools differ from python? Are there python analogs of these tools?
    - Reference: https://www.kaggle.com/pliptor/divide-and-conquer-0-82296

# Self Attempt Review

## References for New Ideas/Perspectives
* <b>Python DS Framework</b>: https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
    * <b>Notable sections:</b>
        - Problem Definition
        - Data Preparation
        - Meet & Greet the Data (EDA)
    * <b>My Takeaways & Reflection from Reading</b>
        - Nearly all of my intuitions & observations up to this point of the jupyter notebook were on-point with the notes defined in the "Meet & Greet the Data" section
        - Good practice? Make a copy of the data rather than altering it directly?
        - <b>4Cs of Data Cleaning</b>
            1. Correcting: adjust outlier values
            2. Completing: feature imputation ie filling in the non-existing values
            3. Creating: feature engineering ie utilize existing features to create new ones
            4. Converting: convert to dummy variables ie hot encode (Ex. conversion using a kv-mapping of strings to integer vals)?
        - Performing feature imputation: usage of median & mode
            - why median over average for "Age" & "Fare"? 
            - why mode for "Embarked"?
            - could we utilize the set of all instances that do not have missing columns from which we train a model to predict for the instances that contain missing values?
        - Performing feature engineering
            - Why do we use pd.qcut? What does it do? What was the rationale for qcutting for the features denoted in the notebook?
* <b>Python Ensemble Methods</b>: https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling
    * <b>Notable Sections:</b>
        - Outlier Detection (Tukey Method)
    * <b>My Takeaways & Reflection from Reading</b>
        - Revealed my own weakness -- a need to deepen my stats understanding in terms of the significance of outliers
        - A need to understand methods of cleaning out data ie removal of outliers (what other forms exist outside of the Tukey method provided?)
            - Other outlier detection methods: https://towardsdatascience.com/practical-guide-to-outlier-detection-methods-6b9f947a161e
        - Usage of a <b>FacetGrid</b> from Seaborn in displaying age distribution and survival outcome
        - Usage of a <b>kdeplot</b> for overlaying the two plots pertaining to 2 survival outcomes on one chart
        - Approaching each datatype with the appropriate plotting method is so important
* <b>R Tutorial</b>: https://www.kaggle.com/mrisdal/exploring-survival-on-the-titanic
    * Organized very well
    * Provides alot of useful commentary for gaining insight into the steps taken

# Assisted Attempt
## Revisiting EDA: Feature vs Target Feature Analysis

In [None]:
# Print the column dtypes and non-null counts of the current training frame.
train.info()

In [None]:
# create a dictionary dtype: column_names ie aggregate columns by their dtype
# Built straight from train.dtypes: grouping the frame along axis=1 is
# deprecated in recent pandas, and the resulting mapping is the same.
type_dict = {}
for column_name, column_dtype in train.dtypes.items():
    type_dict.setdefault(str(column_dtype), []).append(column_name)
type_dict

### Discrete Features (int64) vs Target Feature


In [None]:
# One bar chart of the survival rate per integer-typed feature, each saved to PLOT_DIR.
# savefig does not create missing folders, so make sure the target exists.
PLOT_DIR.mkdir(parents=True, exist_ok=True)
for c in [name for name in type_dict["int64"] if name != "Survived"]:
    # catplot/height are the current names of the deprecated factorplot/size
    g = sns.catplot(x=c, y="Survived", data=train, kind="bar", height=6,
                    palette="muted")
    g.despine(left=True)
    g = g.set_ylabels("survival probability")
    plt.grid(axis="y")
    plt.title("Survived VS {}".format(c))
    fig = plt.gcf()
    plt.tight_layout()
    fig.savefig(PLOT_DIR/f"{c}-vs-Survived-factorplot.png", bbox_inches="tight")
plt.show()
# print(plt.grid.__doc__)
# need to learn how to create subplots when using seaborn

* Pclass VS Survived
    * there appears to be a relation between passenger socioeconomic class and their rate of survival
    * upper-class passengers are more inclined to survive in contrast to lower-class passengers
* SibSp VS Survived
    * passengers who had a larger party (spouses or siblings) were less inclined to survive
    * the cut off point appears to be at 2 siblings & spouses at which there appears to be a large drop off in the survival rate
* Parch VS Survived:
    * no parent or child case appears to potentially be an outlier -- not sure about the reasonings as to why
    * the case of 1/2/3 parch sizes appear to make sense in terms of the survival rate of the passenger
    * not sure what meaning to draw from the protruding line drawn on each bar


### Continuous Features (float64) vs Target Feature


In [None]:
# Two figures per float-typed feature, both saved to PLOT_DIR:
#   1. side-by-side histograms (one panel per survival outcome)
#   2. overlaid density curves of the two outcomes on a single axes
# savefig does not create missing folders, so make sure the target exists.
PLOT_DIR.mkdir(parents=True, exist_ok=True)
for c in [name for name in type_dict["float64"] if name != "Survived"]:
    g = sns.FacetGrid(train, col='Survived')
    # histplot(kde=True, stat="density") replaces the deprecated distplot
    g = g.map(sns.histplot, c, kde=True, stat="density")
    fig = plt.gcf()
    plt.tight_layout()
    fig.savefig(PLOT_DIR/f"{c}-vs-Survived-distplot.png", bbox_inches="tight")
    plt.show()

    # Explicit figure/axes so the title and both curves land on the same plot
    fig, ax = plt.subplots()
    ax.set_title("Survived VS {}".format(c))
    observed = train[c].notnull()
    # fill= is the current name of the deprecated shade= argument
    sns.kdeplot(train.loc[(train["Survived"] == 0) & observed, c], ax=ax, color="Red", fill=True)
    sns.kdeplot(train.loc[(train["Survived"] == 1) & observed, c], ax=ax, color="Blue", fill=True)
    ax.set_xlabel(c)
    ax.set_ylabel("Frequency")
    ax.legend(["Not Survived", "Survived"])
    plt.tight_layout()
    fig.savefig(PLOT_DIR/f"{c}-vs-Survived-kdeplot.png", bbox_inches="tight")

    plt.show()

* age vs survived
    * there appears to be a bit of a relation; however, requires further investigation
    * revamping of the plot method used demonstrates that there is a strong correlation between age and survival 
    * there is large concentration of young passengers surviving
    * there also appears to be more older passengers that did not survive
* fare vs survived
    * there is likely a relation here based upon the observations of pclass vs survival rate
    * a closer inspection is needed


In [None]:
# Explore Fare distribution 
# NOTE(review): `numeric_dypes` looks like a typo of `numeric_dtypes`; the
# misspelled name is not read anywhere in the visible part of the notebook.
# Renaming it would overwrite the dtype-name list defined earlier -- confirm intent.
numeric_dypes = type_dict["float64"]
# type_dict["int64"] + 
# numeric_dtypes
# Show the names of the float64 columns as the cell output.
type_dict["float64"]

In [None]:
# Distribution of every float-typed column, with its skewness shown in the legend.
for c in type_dict["float64"]:
# + type_dict["int64"]:
    # histplot(kde=True, stat="density") replaces the deprecated distplot
    g = sns.histplot(train[c], kde=True, stat="density", color="m",
                     label="Skewness : %.2f" % (train[c].skew()))
    g = g.legend(loc="best")
    plt.title("{} Distribution".format(c))
    plt.show()

Should one immediately apply a log transformation to create a gaussian? Why or why not? 
* The referenced notebook: https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling applies a log transform on the feature "Fare". The justification is to reduce "...very high values in the model, even if it is scaled." What are the other perspectives to observe as a justification for the log transform?
* Age is not transformed in the referenced notebook in the prev bullet. Why not? It makes sense to not transform it because it'd lead to a misinterpretation of the data. It makes sense that there's a slightly higher proportion of young passengers surviving in contrast to older passengers who were more likely to not survive. An immediate decision to normalize the distro would obfuscate potential observations such as this.

In [None]:
# Show the names of the columns whose dtype is "object".
type_dict["object"]

### String (object) vs Target Feature

In [None]:
# Mean survival per category for the two low-cardinality string columns
col_strtype = ["Sex", "Embarked"]
for c in col_strtype:
    plt.title(f"Survived VS {c}")
    g = sns.barplot(data=train, x=c, y="Survived")
    g = g.set_ylabel("Survival Probability")
    plt.show()
    

* sex vs survived
    * women were far more likely to survive in contrast to males per the quote from the movie "Women and children first!"
* embarked vs survived
    * naturally inclined to think that the port of embarkation is likely to have little to no relation with survival rate
    * a case for a relation to exist would be that the location may represent a culture of alertness to dangers suggesting a better prepared passenger for moments of danger
    * what is the significance of the location of embarkation?
    * what is the reason for passengers who embarked at Cherbourg having a higher survival chance?


### Cross Analysis of Dtypes

In [None]:
# Explore Pclass vs Survived by Sex
# catplot/height are the current names of the deprecated factorplot/size
g = sns.catplot(x="Pclass", y="Survived", hue="Sex", data=train,
                height=6, kind="bar", palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")


Gender played a large role in determining survivability of a passenger. Women were more inclined to survive in contrast to men. Even when controlling for SES, the trend of women being more inclined to survive was conserved. In addition, passengers of higher class were more likely to survive in contrast to lower SES passengers.

In [None]:
# Explore Embarked vs Survived 
# catplot/height are the current names of the deprecated factorplot/size
g = sns.catplot(x="Embarked", y="Survived", data=train,
                height=6, kind="bar", palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")


Why is it that those embarking from Cherbourg were more likely to survive?
* Were they primarily women?
* Were they primarily of higher SES?

#### Pclass vs Embarked 

In [None]:
# Passenger-class counts, one panel per port of embarkation.
# catplot/height are the current names of the deprecated factorplot/size;
# x is passed by keyword because positional data arguments are deprecated.
g = sns.catplot(x="Pclass", col="Embarked", data=train,
                height=6, kind="count", palette="muted")
g.despine(left=True)
g = g.set_ylabels("Count")

####  Sex vs Embarked 

In [None]:
# Gender counts, one panel per port of embarkation.
# catplot/height are the current names of the deprecated factorplot/size;
# x is passed by keyword because positional data arguments are deprecated.
g = sns.catplot(x="Sex", col="Embarked", data=train,
                height=6, kind="count", palette="muted")
g.despine(left=True)
g = g.set_ylabels("Count")

1. Q: Were they primarily women? 
- A: No, those from Southampton (S) were primarily men.  Cherbourg & Queenstown had a fairly even distribution of genders at the respective locations.
2. Q: Were they primarily of higher SES?
- A: Not necessarily conclusive across the board. There is a slightly higher distribution of 1st class passengers for  Cherbourg in contrast to the other locations. It would make sense that those embarking from  Cherbourg had some form of influence.

Main Q: Why is it that those embarking from Cherbourg were more likely to survive?
A: Passengers embarking from Cherbourg showed a higher proportion of 1st class passengers out of all 3 classes. It is likely, although not confirmed, that passengers of 1st class were aristocrats or individuals who held influence, warranting their prioritization during evacuation.

## Feature Imputation: Revisited
Recalling,
```sh
Age: 177 / 891 instances missing --> 19.86532%
Cabin: 687 / 891 instances missing --> 77.10438%
Embarked: 2 / 891 instances missing --> 0.22447%
```
Considering the second round of EDA and now the need for imputing features, one should consider the entirety of the dataset. The test set was initially excluded from EDA since it does not contain the random target variable to be predicted. Analyzing the supporting feature variables of the test set would not have been useful during EDA, since that analysis revolves around understanding the support features in conjunction with the rand target var.

For the case of feature imputation it's now important to consider the entirety of dataset -- the set of all instances (train + test) because one must identify all features columns containing missing values. Identification of these instances' features for imputation is important because one is able to draw from a larger picture when deriving values for filling in missing features. In addition, we also fill in the blanks all in one sweep. 

In [None]:
# Keep the test ids aside and put the PassengerId column saved in train_ids
# back onto train as its last column.
test_ids = test["PassengerId"]
train = train.assign(PassengerId=train_ids)
# test = test.drop("PassengerId",axis=1)

In [None]:
# First rows of both frames, test first
display(test.head(), train.head())

In [None]:
# Stack train on top of test and renumber the rows from 0
full_data = pd.concat([train, test], axis=0, ignore_index=True)
display(train.shape, test.shape, full_data.shape)
# len(full_data)
# full_data.head()

Note that it's okay for the test data to have one fewer column than the training data because the test set does not contain the target variable "Survived". Do be careful when applying transformations. The current state of the notebook requires review regarding information leakage.
* What is an example of information leakage?
    - Filling in missing values in the training set using an average created from the entire dataset?
    - Is it considered best practice to exclude instances from the test set and to use only the population of training instances for filling in the missing values in the training dataset?
* How should one go about handling the datasets to avoid information leakage?
    - Definition of a function to be applied to the sample sets (test & training) individually 
* Is the entire dataset the sum of all instances defined in both train and test?
* References:
    - https://www.kaggle.com/questions-and-answers/93428
    - https://www.kaggle.com/questions-and-answers/35689
    - https://www.kaggle.com/getting-started/97742
    - https://www.kaggle.com/questions-and-answers/40640
    - https://www.kaggle.com/questions-and-answers/48869
    - https://www.kaggle.com/getting-started/99070
    - https://www.kaggle.com/questions-and-answers/182431
    - https://www.kaggle.com/general/67299
    - https://www.kaggle.com/getting-started/142521

### 4C's Overview
1. Complete: impute values for missing cells
2. Correct: appropriate adjustment of distributions, removal of outliers
3. Create: generation of new features given existing features
4. Convert: hot encode ie conversion of object types to values accepted by the machine such as integer/float types 

### Correct

In [None]:
# Outlier detection 
def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are Tukey outliers in more than ``n``
    of the given feature columns.

    A value is an outlier for a column when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of that column. Quartiles are computed with NaNs
    ignored; a missing value is never counted as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the observations.
    n : int
        Threshold; a row is returned only if it is an outlier in strictly more
        than ``n`` columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.

    Reference:  https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling
    """
    outlier_counts = Counter()
    # iterate over features(columns)
    for col in features:
        values = df[col]
        # np.percentile returns NaN as soon as the column contains a NaN, which
        # silently disables detection for that column; nanpercentile skips NaNs.
        Q1 = np.nanpercentile(values, 25)
        Q3 = np.nanpercentile(values, 75)
        # outlier step derived from the interquartile range (IQR)
        outlier_step = 1.5 * (Q3 - Q1)
        # comparisons against NaN are False, so missing values are not flagged
        is_outlier = (values < Q1 - outlier_step) | (values > Q3 + outlier_step)
        # count one hit per (row, column) pair that is an outlier
        outlier_counts.update(df.index[is_outlier.to_numpy()])
    # select observations that are outliers in more than n columns
    multiple_outliers = [k for k, v in outlier_counts.items() if v > n]
    return multiple_outliers

# detect outliers from Age, SibSp , Parch and Fare
# outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])

The function detect_outliers is potentially faulty because it does NOT drop NaNs when calculating the percentiles.
![image.png](attachment:image.png)

In [None]:
# print("Training set outliers:", len(train.loc[outliers_to_drop]))
# train.loc[outliers_to_drop]


In [None]:
# # Drop outliers
# train = train.drop(outliers_to_drop, axis = 0).reset_index(drop=True)

* How is not dropping NaN values prior to calculating percentiles faulty? 
* What mistakes does one allow into the dataset by NOT dropping NaN values pre-percentiles calculations?
* Why is it important to drop NaN values?

### Complete
Similar to first approach we find all missing values for any column from which we are able to identify the columns in need of imputation.


In [None]:
## Join train and test sets in order to obtain the same number of features during categorical conversion
# remember where train ends inside the combined frame
train_len = len(train)
full_data = pd.concat([train, test], axis=0, ignore_index=True)
# Fill empty and NaNs values with NaN
full_data = full_data.fillna(np.nan)
# Check for Null values
full_data.isna().sum()


In [None]:
def show_missing_report(df:pandas.DataFrame):
    """Report which feature columns of ``df`` still contain missing values.

    Displays the per-column "has missing values" flags and the list of
    columns to impute, then prints one line per such column with the
    missing count, the total row count and the missing percentage.

    The target column "Survived" is never listed: its missing values belong
    to the unlabeled test rows and are not imputed. Nothing is shown when no
    column contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    has_na = df.isna().any()
    if not has_na.any():
        return
    # Filter instead of list.remove(): "Survived" may be absent or complete,
    # in which case remove() would raise ValueError.
    na_cols = [c for c in has_na.index if has_na[c] and c != "Survived"]
    display("All columns:", has_na, "Missing cols:", na_cols)
    print()
    total = len(df)
    for c in na_cols:
        na_count = df[c].isna().sum()
        na_percent = na_count / total * 100
        print("{}: {} / {} instances missing --> {:1.5f}%".format(c, na_count, total, na_percent))
        

In [None]:
show_missing_report(full_data)        


#### Age
There exist 256 missing values in the whole dataset post-outlier removal. Given the prior observations from EDA, we know the Age feature holds significance for passenger survival, which is a reason to keep the feature and suggests a need to fill in the blanks. One approach to imputing this feature is to look at the features that are most correlated with Age. 

In [None]:
# Columns left out of the Age comparison plots below
# (removed from plotting due to computational cost)
insig_cols = ['PassengerId','Name',"Ticket","Fare","Cabin","Survived"]
# Everything else is compared against Age
sig_cols = [col for col in full_data.columns if col not in insig_cols]
sig_cols

In [None]:
# Styling forwarded to the underlying box plot elements
PROPS = {
#     'boxprops':{'facecolor':'none', 'edgecolor':'white'},
    'medianprops':{'color':'magenta'},
    'whiskerprops':{'color':'grey'},
    'capprops':{'color':'white'}
}

# One box plot of Age against each remaining candidate feature.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn.
for c in [_ for _ in sig_cols if _ != "Age"]:
    print(c)
    g = sns.catplot(y="Age", x=c, data=full_data, kind="box", **PROPS)
    plt.show()



In [None]:
print(sig_cols)
# Correlation is only defined for numeric columns, so string-valued features
# (e.g. "Sex" before it is encoded) are excluded explicitly; recent pandas
# raises instead of silently skipping them.
numeric_sig = full_data[sig_cols].select_dtypes(include=["number", "bool"])
g = sns.heatmap(numeric_sig.corr(), cmap="coolwarm", annot=True)

In [None]:
# Encode Sex numerically so it can appear in the correlation matrix.
# Values that are not keys of the lookup (i.e. already encoded ones) pass
# through unchanged, so re-running this cell does not wipe the column to NaN.
sex_codes = {"male": 0, "female": 1}
full_data["Sex"] = full_data["Sex"].map(lambda s: sex_codes.get(s, s))
# Only numeric columns can be correlated; the remaining string columns are left out
numeric_sig = full_data[sig_cols].select_dtypes(include=["number", "bool"])
g = sns.heatmap(numeric_sig.corr(), cmap="coolwarm", annot=True)


Age is least correlated with sex. For all other features, there is a fair amount of correlation with "Pclass" being the largest. Apply imputation given the similarity between the feature criteria consisting of Pclass, Parch & SibSp.

In [None]:
# Filling missing values of Age
#
# Each missing Age is filled with the median Age of the passengers that share
# the same SibSp, Parch and Pclass. When no passenger in that group has a known
# Age, the overall median Age is used instead.
#
# The median is preferred over the mean because the mean would be pulled by
# the skew of the distribution (and the median also suits ranked categories).

age_missing = full_data["Age"].isna()
# Indices of NaN age rows
index_NaN_age = list(full_data.index[age_missing])

# Both medians come from the observed ages only, before anything is filled in,
# so the result does not depend on the order in which rows are processed.
age_med = full_data["Age"].median()
age_pred = full_data.groupby(["SibSp", "Parch", "Pclass"])["Age"].transform("median")

# Assign the whole column at once; chained `full_data['Age'].iloc[i] = ...`
# may write to a temporary copy and leave full_data unchanged.
full_data["Age"] = full_data["Age"].fillna(age_pred).fillna(age_med)

print("Filled using predicted value:", int((age_missing & age_pred.notna()).sum()), "rows")
print("Filled using median value:", int((age_missing & age_pred.isna()).sum()), "rows")

In [None]:
# Assessing quality of age imputation
# `train` still holds the raw, un-imputed ages, so plot the labeled rows of
# full_data (the ones with a Survived value) instead.
labeled = full_data[full_data["Survived"].notna()]
g = sns.catplot(x="Survived", y="Age", data=labeled, kind="box")
g = sns.catplot(x="Survived", y="Age", data=labeled, kind="violin")

Q: Why do we use a violin-plot here in contrast to a box-plot?<br>
A: Perhaps for better visualization of distributions? This makes sense since the box-plot does not provide much assistance in contrasting survival rate given age. Using the violin-plot, we are able to better see that there is a higher survival rate for those that are younger.<br>
* From the box-plot we can see that there is very little change in terms of the median age in those that survive and those that don't survive. 
* From the violin-plot we can see that there is a visible blip in the distribution denoting young passengers have a higher survival rate. 


In [None]:
show_missing_report(full_data)

Observing the remaining features to impute, we see that there are A LOT of missing instances for the cabin-feature. This makes me wonder if there is any relation/association between ticket numbers and cabin numbers. I doubt there would be any relation but it is worth at the very least taking a peek.

In [None]:
df_tix_cab= full_data[(
        (pd.notna(full_data['Cabin']) ) & 
        (pd.notna(full_data['Ticket'])))]
df_tix_cab.head(25)

#### Large Set of Missing Values: Cabin/Deck
No shot...<br>
Thus more referencing:
* Advance Feature Engineering: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial
    * <b>Notable Sections:</b>
        - EDA (notably the approach for handling the "Cabin" feature)
        - Feature Engineering
    * <b>My Takeaways & Reflections on Reading:</b>
        - Much of the data preparation portion of data science is an application of statistics in conjunction with programming to carry out detective work -- to figure out mysteries (whether it be the problem to be solved, or dealing with values that are missing)
        - Usage of existing knowledge (regardless of acute apparence) ie knowledge that exists that one may/may-not know of, but have a means and are permitted access to use
        - Thinking outside of the box (in this instance -- outside the realms of programming & stats) ie a consideration of reasonable potential solutions that were not considered (google search for particular features -- NOT the target variable ie the direct answers to the main problem)
        <br>
        
        
<img src="Titanic_side_plan.png" style="float: left; margin-right: 10px;" />



In [None]:
full_data['Cabin'].unique()

In [None]:
# Pattern: capture every letter, optionally followed by a single digit
regex = r"([a-zA-Z])\d?"
# Sample cabin value spanning two decks. It must not be named `test`:
# that name holds the test-set DataFrame loaded at the top of the notebook.
sample_cabin = 'F G73'
# print(sample_cabin.split(" "))
# [re.findall(regex,e) for e in sample_cabin.split(" ")]
re.findall(regex, sample_cabin)

In [None]:
# Create deck method 1
# Creating Deck column from the first letter of the Cabin column (M stands for Missing)
full_data['Deck'] = full_data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

# Passenger count per (Deck, Pclass). Counting the Name column alone avoids a
# hard-coded list of columns to drop, which breaks as soon as a listed column
# is removed and leaves stray rows when a new column is added.
df_all_decks = (
    full_data.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
    .transpose()
)
df_all_decks

In [None]:
full_data.head()

In [None]:
full_data = full_data.drop("Deck", axis=1)

In [None]:
# The original (first letter) approach was technically faulty because certain
# passengers have multiple DIFFERENT cabin labels (e.g. "F G73").
# Here the Deck label is the concatenation of the letters found in the Cabin
# value -- "C23 C25 C27" -> "CCC", "F G73" -> "FG" -- and "M" marks a missing Cabin.
deck_letter_re = re.compile(r"([a-zA-Z])\d?")


def _deck_label(cabin):
    """Return the concatenated deck letters of one Cabin value ("M" if missing)."""
    if pd.isnull(cabin):
        return "M"  # M denoting missing
    return "".join(deck_letter_re.findall(cabin))


# One vectorised assignment: no row-by-row writes and no float placeholder
# column that later has strings written into it.
full_data["Deck"] = full_data["Cabin"].map(_deck_label)

# Open idea: create a Deck_{letter} feature per deck instead, tallying how many
# cabins on that deck each passenger occupies. The sum of Deck_M would then be
# the minimum number of missing cabins, i.e. at least 1 cabin per row -- minimum
# because some passengers have more than 1 cabin.

# DEBUG: passengers whose cabins span more than one deck
full_data.loc[full_data["Deck"].map(lambda d: len(set(d)) > 1), ["Cabin", "Deck"]]

In [None]:
full_data[full_data["Deck"] == "BBBB"]

In [None]:
# Passenger count per (Deck, Pclass) for the concatenated deck labels.
# Counting the Name column alone replaces the hard-coded drop list, which
# raises KeyError once any listed column (e.g. Cabin) has been removed.
df_all_decks = (
    full_data.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
    .transpose()
)

df_all_decks

* How does one reduce the feature value for the Deck column from a series of chars (Ex. BBBB) to B WHILE maintaining the count of the number of cabins booked?
* Does the number of cabins booked matter in terms of the problem? Considering that the cabins denote the section -- the "level" that passengers were assigned -- passengers assigned "BBBB" would all exist on the "B" level. 
- Assuming that only a passenger's deck location matters resolves the issue, because the ship location is the same whether a passenger booked a single "B" cabin or multiple "BBBB" cabins. 
* How does one handle the multiple DIFFERENT cabin label situation? 
* Does one simply duplicate the row instance and assign a single "Deck" value for each new instance? 
* What other features should be accounted for? 
* How much does it matter that we account for multiple unique cabins pertaining to a passenger instance?
* Approach 03/04/21:
    - [ ] Run with the current configuration where unique decks are allowed even if they include duplicates OR differing labels
    - [ ] Run with the reference notebook's approach
    - [ ] Run with an approach that consolidates decks containing multiple of the same cabin label to one single label AND generate new row instances for cabins that have multiple different cabin labels (Ex. FE results in 2 instances $x_1(deck_1=F,...,feat_i^j), x_2(deck_2=E,..., feat_i^j)$)
    - [x] Run with an approach that utilizes both my approach in conjunction with the reference notebooks intuitions of consolidating the distribution of cabin labels given the SES distribution. 

In [None]:
deck_keys = list(full_data.Deck.unique())

In [None]:
def get_pclass_dist(df):
    """Compute passenger-class counts and percentages for every deck.

    Reference: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a (Deck, Pclass) MultiIndex and whose first
        row holds the passenger count of each combination.

    Returns
    -------
    deck_counts : dict
        ``{deck: {1: n1, 2: n2, 3: n3}}``; combinations absent from ``df``
        count as 0.
    deck_percentages : dict
        ``{deck: [pct1, pct2, pct3]}``, each deck's counts as a percentage of
        that deck's total (all zeros for a deck without passengers).
    """
    # Take the decks from df itself rather than from a notebook-level variable
    decks = df.columns.levels[0]
    # Creating a dictionary for every passenger class count in every deck
    deck_counts = {deck: {} for deck in decks}
    for deck in decks:
        for pclass in range(1, 4):
            try:
                # first row by position; the row label itself is not assumed
                count = df[deck][pclass].iloc[0]
            except KeyError:
                # this passenger class does not occur on this deck
                count = 0
            deck_counts[deck][pclass] = count
    df_decks = pd.DataFrame(deck_counts)
    deck_percentages = {}
    # Creating a dictionary for every passenger class percentage in every deck
    for col in df_decks.columns:
        total = df_decks[col].sum()
        deck_percentages[col] = [
            (count / total) * 100 if total else 0.0 for count in df_decks[col]
        ]
    return deck_counts, deck_percentages

def display_pclass_dist(percentages):
    """Draw a stacked bar chart of the passenger-class share of every deck.

    Reference: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial

    Parameters
    ----------
    percentages : dict
        ``{deck: [pct_class1, pct_class2, pct_class3]}``.
    """
    # needed to be sorted by index else the chart malfunctions
    df_percentages = pd.DataFrame(percentages).transpose().sort_index()
    # Label the bars from the data being plotted, not from a notebook-level
    # frame, so ticks and bars cannot get out of step.
    deck_names = df_percentages.index.to_list()
    bar_count = np.arange(len(deck_names))
    bar_width = 0.85

    # columns 0, 1, 2 hold the shares of passenger class 1, 2, 3
    pclass1 = df_percentages[0]
    pclass2 = df_percentages[1]
    pclass3 = df_percentages[2]

    plt.figure(figsize=(20, 10))
    plt.bar(bar_count, pclass1,
            color='#b5ffb9', edgecolor='white',
            width=bar_width, label='Passenger Class 1')
    plt.bar(bar_count, pclass2, bottom=pclass1,
            color='#f9bc86', edgecolor='white',
            width=bar_width, label='Passenger Class 2')
    plt.bar(bar_count, pclass3, bottom=pclass1 + pclass2,
            color='#a3acff', edgecolor='white',
            width=bar_width, label='Passenger Class 3')

    plt.xlabel('Deck', size=15, labelpad=20)
    plt.ylabel('Passenger Class Percentage', size=15, labelpad=20)
    plt.xticks(bar_count, deck_names)
    plt.tick_params(axis='x', labelsize=15)
    plt.tick_params(axis='y', labelsize=15)

    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 15})
    plt.title('Passenger Class Distribution in Decks', size=18, y=1.05)
    plt.grid(axis="y")
    plt.show()

all_deck_count, all_deck_per = get_pclass_dist(df_all_decks)
display_pclass_dist(all_deck_per)
# print(all_deck_per["A"]) # DEBUG

In [None]:
# Passenger count per (Deck, Pclass).
# Counting the Name column alone replaces the hard-coded drop list, which
# raises KeyError once any listed column (e.g. Cabin) has been removed.
df_all_decks = (
    full_data.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
    .transpose()
)

df_all_decks

Upon deeper inspection, we can see that cabins A,B & C all contain 1st class passengers. Furthermore, ALL passengers even if they have multiple cabins booked remain the same. This makes sense, in that duplications in the cabin bookings do not necessarily matter much in its relation to class. Thus, instances with any permutation of A,B or C or duplicates in the string can be modified to be "ABC" representing 1st class. 


In [None]:
deck_names = sorted(full_data.Deck.unique())
deck_names

In [None]:
# Deck groups in priority order: a label containing any letter of an earlier
# group is assigned to that group, even if it also contains later letters.
deck_groups = ("ABC", "DE", "FG")

deck_map = {}
for label in deck_names:
    for group in deck_groups:
        if any(letter in label for letter in group):
            deck_map[label] = group
            break
# labels matching no group get no entry here
deck_map[np.nan] = "M"

deck_map

In [None]:
# apply map function for replacing existing labels with new label group
# Labels that already are a final group name are kept as they are, so running
# this cell a second time does not collapse every deck into "M".
final_labels = set(deck_map.values())
current_deck = full_data["Deck"]
grouped_deck = current_deck.map(deck_map)
# anything without a mapping (unknown label or missing value) becomes "M"
full_data["Deck"] = grouped_deck.where(~current_deck.isin(final_labels), current_deck).fillna("M")
full_data.head()

In [None]:
full_data["Deck"].unique()

In [None]:
# Passenger count per (grouped Deck, Pclass).
# Counting the Name column alone replaces the hard-coded drop list, which
# raises KeyError once any listed column (e.g. Cabin) has been removed.
df_all_decks = (
    full_data.groupby(['Deck', 'Pclass'])[['Name']]
    .count()
    .rename(columns={'Name': 'Count'})
    .transpose()
)

df_all_decks

The reference notebook points out that it is not worth it to further impute the missing Deck features (M), citing that those with missing Deck values have the lowest survival rate. 
* Is this claim true? 
* Is this an appropriate rationale to halt further imputation?
* What grounds this reasoning? 
    - Perhaps it's the case in which the provision of a value for the random variable "Deck" creates a defining -- a concrete characteristic in the dataset from which a trained algo is able to deduce from a given input passenger instance an output of the random target variable of survival

In [None]:
# Mean of Survived (i.e. survival probability) per deck group.
# sns.factorplot was renamed to sns.catplot, and its `size` argument to `height`.
g = sns.catplot(x="Deck", y="Survived", data=full_data, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")
plt.grid(axis="y")
plt.title("Survived VS {}".format("Deck"))

In [None]:
full_data['Deck'].value_counts()

In [None]:
show_missing_report(full_data)

Drop "Cabin" feature in favor of using "Deck" as its replacement

In [None]:
full_data = full_data.drop("Cabin", axis=1)

In [None]:
show_missing_report(full_data)

#### Embarked

In [None]:
# select instance in which Embarked feature is empty
full_data[full_data.Embarked.isna()]

Reference: https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
```sh
"Mrs Stone boarded the Titanic in Southampton on 10 April 1912 and was travelling in first class with her maid Amelie Icard. She occupied cabin B-28"
```


In [None]:
# select instance in which Embarked feature is empty to fill
full_data.Embarked = full_data.Embarked.fillna("S")

In [None]:
show_missing_report(full_data)

In [None]:
# Median fare of the (Pclass=3, Parch=0, SibSp=0) group
# NOTE(review): presumably the profile of the passenger whose Fare is missing -- confirm
med_fare = (
    full_data.groupby(['Pclass', 'Parch', 'SibSp'])['Fare']
    .median()
    .loc[(3, 0, 0)]
)
med_fare

In [None]:
full_data['Fare'] = full_data['Fare'].fillna(med_fare)

In [None]:
show_missing_report(full_data)
# feature imputation done!

In [None]:
# map applies a transform given the key to be transformed to the associated value
# full_data["Sex"] = full_data["Sex"].map({"male":0, "female":1}) 
# g = sns.heatmap(full_data[sig_cols].corr(),cmap="coolwarm",annot=True)
# Split the combined frame back into labeled (train) and unlabeled (test) rows:
# a row belongs to the training set exactly when it has a Survived value.
is_labeled = full_data["Survived"].notnull()

train_ids = full_data.loc[is_labeled, "PassengerId"]
df_train = full_data[is_labeled].drop("PassengerId", axis=1)

test_ids = full_data.loc[~is_labeled, "PassengerId"]
# the test rows carry no target, so Survived goes as well
df_test = full_data[~is_labeled].drop(["Survived", "PassengerId"], axis=1)


display(df_train.shape, df_test.shape)


In [None]:
def create_correlation_report(df, min_corr_coeff=0.1):
    """List the feature pairs whose absolute correlation exceeds a threshold.

    Only numeric (and boolean) columns take part; other columns are ignored.
    Every unordered pair of distinct features appears at most once.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are to be correlated.
    min_corr_coeff : float, default 0.1
        Pairs are kept only when their absolute correlation is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature 1", "Feature 2" and "Correlation Coefficient"
        (absolute value), sorted from strongest to weakest.
    """
    # Restrict to numeric data explicitly: recent pandas raises on string columns
    corr = df.select_dtypes(include=["number", "bool"]).corr().abs()
    names = corr.columns
    # Walk the upper triangle only: this skips the diagonal and yields each
    # pair once, without relying on the order of equal values after sorting.
    pairs = [
        (names[i], names[j], corr.iat[i, j])
        for i in range(len(names))
        for j in range(i + 1, len(names))
    ]
    report = pd.DataFrame(
        pairs, columns=["Feature 1", "Feature 2", "Correlation Coefficient"])
    # NaN coefficients (e.g. constant columns) compare False and are dropped
    report = report[report["Correlation Coefficient"] > min_corr_coeff]
    return report.sort_values(
        "Correlation Coefficient", ascending=False, kind="mergesort"
    ).reset_index(drop=True)

In [None]:
def _ranked_abs_corr(df):
    """Return the absolute pairwise correlations of df's numeric columns, strongest first."""
    # Restrict to numeric data explicitly: recent pandas raises on string columns
    numeric = df.select_dtypes(include=["number", "bool"])
    ranked = (
        numeric.corr().abs().unstack()
        .sort_values(kind="quicksort", ascending=False)
        .reset_index()
        .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2",
                         0: 'Correlation Coefficient'})
    )
    # each pair occurs twice, as (a, b) and (b, a): keep every other row
    return ranked.drop(ranked.iloc[1::2].index)


def _without_perfect(df_corr):
    """Drop the rows whose coefficient is exactly 1.0 (a feature against itself)."""
    return df_corr[df_corr['Correlation Coefficient'] != 1.0]


df_train_corr = _ranked_abs_corr(df_train)
df_train_corr_nd = _without_perfect(df_train_corr)

df_test_corr = _ranked_abs_corr(df_test)
df_test_corr_nd = _without_perfect(df_test_corr)

# Training set high correlations
corr = df_train_corr_nd['Correlation Coefficient'] > 0.1
# display(df_train_corr_nd[corr])


In [None]:
# Test set high correlations
corr = df_test_corr_nd['Correlation Coefficient'] > 0.1
# display(df_test_corr_nd[corr])


In [None]:
def plot_continuous_feats(df, features, test_df=None):
    """Plot the distribution of continuous features, one column of plots per feature.

    Top row: the feature's distribution for survivors vs. non-survivors in ``df``.
    Bottom row: the feature's distribution in ``df`` vs. in ``test_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain "Survived" and every name in ``features``.
    features : list of str
        Continuous columns to plot. Any number of features is supported.
    test_df : pandas.DataFrame, optional
        Test data to compare against. Defaults to the notebook-level ``df_test``.
    """
    if test_df is None:
        test_df = df_test  # notebook-level test split
    surv = df['Survived'] == 1

    # squeeze=False keeps axs two-dimensional even for a single feature
    fig, axs = plt.subplots(ncols=len(features), nrows=2, figsize=(20, 20), squeeze=False)
    plt.subplots_adjust(right=1.5)

    for i, feature in enumerate(features):
        top, bottom = axs[0][i], axs[1][i]

        # Distribution of survival in feature
        sns.distplot(df[~surv][feature], label='Not Survived',
                     hist=True, color='#e74c3c', ax=top)
        sns.distplot(df[surv][feature], label='Survived',
                     hist=True, color='#2ecc71', ax=top)

        # Distribution of feature in dataset
        sns.distplot(df[feature], label='Training Set', hist=False, color='#e74c3c', ax=bottom)
        sns.distplot(test_df[feature], label='Test Set', hist=False, color='#2ecc71', ax=bottom)

        # Style both axes of this feature directly instead of indexing
        # axs[i][j], which only covers the grid when there are exactly two features.
        for ax in (top, bottom):
            ax.set_xlabel('')
            ax.tick_params(axis='x', labelsize=20)
            ax.tick_params(axis='y', labelsize=20)
            ax.grid(axis="y")
            ax.legend(loc='upper right', prop={'size': 20})

        top.set_title('Distribution of Survival in {}'.format(feature), size=20, y=1.05)
        # title follows the plotted feature rather than a fixed Age/Fare pair
        bottom.set_title('Distribution of {} Feature'.format(feature), size=20, y=1.05)

    plt.show()

In [None]:
def plot_categorical_feats(df, features):
    """Draw one survival count plot per categorical feature, three per row.

    df: input pandas.DataFrame; must contain "Survived" and every name in features
    features: columns to be displayed
    """
    n_cols = 3
    n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
    # Create the grid once and draw on its axes; mixing plt.subplots(3 x 2)
    # with plt.subplot(2, 3, i) lays a second, different grid over the first.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 20), squeeze=False)
    plt.subplots_adjust(right=1.5, top=1.25)
    flat_axes = axs.ravel()

    for ax, feature in zip(flat_axes, features):
        sns.countplot(x=feature, hue='Survived', data=df, ax=ax)

        ax.set_xlabel('{}'.format(feature), size=20, labelpad=15)
        ax.set_ylabel('Passenger Count', size=20, labelpad=15)
        ax.tick_params(axis='x', labelsize=20)
        ax.tick_params(axis='y', labelsize=20)

        ax.legend(['Not Survived', 'Survived'], loc='upper center', prop={'size': 18})
        ax.set_title('Count of Survival in {} Feature'.format(feature), size=20, y=1.05)
        ax.grid(axis="y")

    # hide the cells of the last row that received no feature
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)

    plt.show()
    

In [None]:
cont_feats = ['Age', 'Fare']
cat_feats = ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Deck']
plot_continuous_feats(df_train, cont_feats)
plot_categorical_feats(df_train, cat_feats)

In [None]:
# print(sig_cols)
# Only numeric columns can be correlated: string-valued features have to be
# encoded numerically before they can appear in the matrix, and recent pandas
# raises on them instead of skipping them.
g = sns.heatmap(df_train.select_dtypes(include=["number", "bool"]).corr(),
                cmap="coolwarm", annot=True)
plt.title("Train Set Correlation Matrix")
plt.show()

In [None]:
# Only numeric columns can be correlated; recent pandas raises on string
# columns instead of skipping them, so select the numeric ones explicitly.
g = sns.heatmap(df_test.select_dtypes(include=["number", "bool"]).corr(),
                cmap="coolwarm", annot=True)
plt.title("Test Set Correlation Matrix")
plt.show()

In [None]:

print("Training Set Feature Correlation")
report_train = create_correlation_report(df_train)
display(len(report_train))
display(report_train)

print("Testing Set Feature Correlation")
report_test = create_correlation_report(df_test)
display(len(report_test))
display(report_test)

## EDA: Post-Completion Review
* Reference: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/output#2.-Feature-Engineering
* Features of the highest correlation: 
    * Pclass VS Fare
        - ~$0.556 = corr_{train}(x_{Pclass}, x_{Fare})$
        - ~$0.577 = corr_{test}(x_{Pclass}, x_{Fare})$
    * Age vs Pclass
        - ~$0.546 = corr_{train}(x_{Sex}, x_{Survived})$
        - ~$0.521 = corr_{test}(x_{Pclass}, x_{Age})$
    * All the other features are highly correlated with a factor greater than 0.1 
        - 15 correlations for the training set
        - 10 correlations for the testing set
* Continuous Features:
    * Age and Fare make for good features in which there exist "good split points and spikes" for a decision tree to learn
        - What makes a decision tree appropriate for this type of problem? 
        - How does a decision tree handle priors depicted in Age and Fare?
        - What are the assumptions when using a decision tree?
        - Why are the spikes easily captured by a decision tree? Is it because a tree is by its nature resistant to non-Gaussian distributions?
* Categorical Features
    * Passengers who embarked from Southampton have the lowest chance of survival
    * Passengers who embarked from Cherbourg have the highest survival rate
    * The trend of passenger survival rate and its relation to port of departure is likely related to Pclass (SES)
    * Women were more likely to survive in contrast to men
    * Passengers who had one or more family members -- particularly less than or equal 2 and greater than 0 had a higher rate of survival when observing SibSp and Parch

* Conclusions
    * The categorical and continuous features are highly correlated with the target feature
    * Categorical features may be used to generate new features 
    

## Feature Engineering (FE): Revisited 
This section utilizes exterior notebooks as form of guidance towards learning about the various methods of creating new features from existing features.

### Binning/Discretization
Binning of continuous variables results in new categorical features. Binning introduces <b>non-linearity</b> which may improve the performance of a model. In addition, it can be used for identifying missing values or outliers. More info on binning: https://towardsdatascience.com/feature-engineering-deep-dive-into-encoding-and-binning-techniques-5618d55a6b38

In [None]:
g = sns.distplot(full_data["Fare"], color="m", label="Skewness : %.2f"%(full_data["Fare"].skew()))
g = g.legend(loc="best")
plt.title("{} Distribution".format("Fare"))
plt.show()

In [None]:
# df_all['Fare'] = pd.qcut(df_all['Fare'], 13)

In [None]:
# fig, axs = plt.subplots(figsize=(22, 9))
# sns.countplot(x='Fare', hue='Survived', data=df_all)

# plt.xlabel('Fare', size=15, labelpad=20)
# plt.ylabel('Passenger Count', size=15, labelpad=20)
# plt.tick_params(axis='x', labelsize=10)
# plt.tick_params(axis='y', labelsize=15)

# plt.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
# plt.title('Count of Survival in {} Feature'.format('Fare'), size=15, y=1.05)

# plt.show()


In [None]:
# df_all['Age'] = pd.qcut(df_all['Age'], 10)

In [None]:
# fig, axs = plt.subplots(figsize=(22, 9))
# sns.countplot(x='Age', hue='Survived', data=df_all)

# plt.xlabel('Age', size=15, labelpad=20)
# plt.ylabel('Passenger Count', size=15, labelpad=20)
# plt.tick_params(axis='x', labelsize=15)
# plt.tick_params(axis='y', labelsize=15)

# plt.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
# plt.title('Survival Counts in {} Feature'.format('Age'), size=15, y=1.05)

# plt.show()

* Fare:
    - The Fare feature holds an extremely large positive skew. 
    - The referenced notebook applies a quantile binning to generate 13 new categorical features from the Fare feature. It considers 13 new bins to be too much -- why is it too much? What is a more appropriate quantity of bins? 
    - How does pandas.qcut operate in its generation of the specified number of bins?
    - How does the following differ from each other?
```sh
pandas.cut(...)
pandas.qcut(...)
```
    
* Age:
    - The feature appears to be distributed according to a Gaussian
    - Why was 10 specified for the number of bins to be created? How did the author arrive at the value?
    - Why do we not normalize this feature? It would make sense to NOT normalize because during feature engineering we'd be able to create new features -- bins for potentially making it explicit that instances pertaining to a particular bin such as a bin pertaining to a large spike in the distribution plot share a relation to the target feature. If one preemptively normalized the feature, one would not be able to gain further insight from the random feature variable.

### Frequency Encoding
* How does frequency encoding differ from binning/discretization? 
* Some references define frequency encoding to be the conversion of categorical features into frequencies. Are these new frequency features continuous? In contrast, there are also places defining frequency encoding to be carried out with consideration of the frequency distribution.
    - https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02
    - https://python-data-science.readthedocs.io/en/latest/preprocess.html
    - DataCamp Encoding Methodologies: https://www.datacamp.com/community/tutorials/encoding-methodologies
    > "It (frequency encoding) is a way to utilize the frequency of the categories as labels. In the cases where the frequency is related somewhat with the target variable, it helps the model to understand and assign the weight in direct and inverse proportion, depending on the nature of the data."


* What are the different types of categorical features?
    - Reference: https://towardsdatascience.com/understanding-feature-engineering-part-2-categorical-data-f54324193e63
    - Nominal
    - Ordinal
    - How does nominal differ from ordinal?

We can create a new feature "Family_Size" by using SibSp and Parch. Recall that SibSp denotes the number of siblings and spouses in tow with a passenger instance. In addition, Parch denotes the number of parents and children pertaining to the passenger. We can represent "Family_Size" as follows:<br>
$size_{family} = SibSp + Parch + 1$<br>
We include 1 to account for the passenger instance's self. 
* The reason for creating this new feature is the correlation between family size and survival rate observed between SibSp VS Survived and Parch VS Survived
* After creating the new continuous feature, we can take it a step further by creating bins
* Care needs to be given here when defining the new categorical features because we want to create a separation between different group sizes 

In [None]:
full_data["FamilySize"] = full_data.SibSp + full_data.Parch + 1
full_data.head()

In [None]:

# fig, axs = plt.subplots(figsize=(20, 20), ncols=2, nrows=2)
# plt.subplots_adjust(right=1.5)

# sns.barplot(x=df_all['Family_Size'].value_counts().index, y=df_all['Family_Size'].value_counts().values, ax=axs[0][0])
# sns.countplot(x='Family_Size', hue='Survived', data=df_all, ax=axs[0][1])

# axs[0][0].set_title('Family Size Feature Value Counts', size=20, y=1.05)
# axs[0][1].set_title('Survival Counts in Family Size ', size=20, y=1.05)

# family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
# df_all['Family_Size_Grouped'] = df_all['Family_Size'].map(family_map)

# sns.barplot(x=df_all['Family_Size_Grouped'].value_counts().index, y=df_all['Family_Size_Grouped'].value_counts().values, ax=axs[1][0])
# sns.countplot(x='Family_Size_Grouped', hue='Survived', data=df_all, ax=axs[1][1])

# axs[1][0].set_title('Family Size Feature Value Counts After Grouping', size=20, y=1.05)
# axs[1][1].set_title('Survival Counts in Family Size After Grouping', size=20, y=1.05)

# for i in range(2):
#     axs[i][1].legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 20})
#     for j in range(2):
#         axs[i][j].tick_params(axis='x', labelsize=20)
#         axs[i][j].tick_params(axis='y', labelsize=20)
#         axs[i][j].set_xlabel('')
#         axs[i][j].set_ylabel('')

# plt.show()


### Grouping by Label Frequency
* There exists a large number of unique ticketIDs
* Grouping of tickets by their frequencies
* <b>How is this feature any different than FamilySize?</b>
    - Reference: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/output#2.-Feature-Engineering
    - Claim/Assumption/Explanation: 
    > "Many passengers travelled along with groups. Those groups consist of friends, nannies, maids and etc. They weren't counted as family, but they used the same ticket."
    - My Thoughts + Concerns:
    > Insert Stuff

* <b>Why not group tickets by their prefixes?</b>
    - Reference: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/output#2.-Feature-Engineering
    - Claim/Assumption/Explanation:
    > "If prefixes in Ticket feature has any meaning, then they are already captured in Pclass or Embarked features because that could be the only logical information which can be derived from the Ticket feature. According to the graph below, groups with 2,3 and 4 members had a higher survival rate. Passengers who travel alone has the lowest survival rate. After 4 group members, survival rate decreases drastically. This pattern is very similar to Family_Size feature but there are minor differences. Ticket_Frequency values are not grouped like Family_Size because that would basically create the same feature with perfect correlation. This kind of feature wouldn't provide any additional information gain."
    - My Thoughts + Concerns:
    > Insert Stuff
* A Deeper Dive into the Ticket Feature: https://www.kaggle.com/pliptor/titanic-ticket-only-study/notebook
    

In [None]:
# Number of distinct ticket IDs vs. total number of rows.
# A bare expression only renders when it is the last line of a cell, so the
# first count was silently dropped; display both values explicitly.
display(len(full_data.Ticket.unique()), len(full_data.Ticket))

In [None]:

# full_data.groupby('Ticket')["Ticket"].transform()

In [None]:
# TicketFrequency: for each row, the number of rows in full_data sharing its Ticket value
tickets_by_id = full_data['Ticket'].groupby(full_data['Ticket'])
full_data['TicketFrequency'] = tickets_by_id.transform('count')
full_data.head()

In [None]:
# Survival counts per TicketFrequency value, drawn through an explicit Axes handle
fig, ax = plt.subplots(figsize=(12, 9))
sns.countplot(data=full_data, x='TicketFrequency', hue='Survived', ax=ax)

ax.set_xlabel('Ticket Frequency', size=15, labelpad=20)
ax.set_ylabel('Passenger Count', size=15, labelpad=20)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=15)

# readable legend entries in place of the raw hue labels
ax.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
ax.set_title('Count of Survival in Ticket Frequency Feature', size=15, y=1.05)

plt.show()

### New "Title" Feature

Observing the Name feature, we see that there exists a Title -- a label/classification of the person's social status. Again, when referring to the notebook https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/output#2.-Feature-Engineering, there are claims to "correct" the labels. <br>
<br>
From the notebook of reference,
> "Title is created by extracting the prefix before Name feature. According to graph below, there are many titles that are occuring very few times. Some of those titles doesn't seem correct and they need to be replaced. Miss, Mrs, Ms, Mlle, Lady, Mme, the Countess, Dona titles are replaced with Miss/Mrs/Ms because all of them are female. Values like Mlle, Mme and Dona are actually the name of the passengers, but they are classified as titles because Name feature is split by comma. Dr, Col, Major, Jonkheer, Capt, Sir, Don and Rev titles are replaced with Dr/Military/Noble/Clergy because those passengers have similar characteristics. Master is a unique title. It is given to male passengers below age 26. They have the highest survival rate among all males.<br><br>
Is_Married is a binary feature based on the Mrs title. Mrs title has the highest survival rate among other female titles. This title needs to be a feature because all female titles are grouped with each other."

Naturally, the questions that come to mind include:
- What defines correctness in this particular context?
- Why are these Titles considered to be incorrect by the referenced notebook?
- What is overlooked by the referenced notebook?
    - The replacement of the gender specific labels (Ex.'Ms') with Miss/Mrs/Ms makes sense
    - Concerns are in regard to the IsMarried portion, which on further inspection appears to be correct because of the order in which the IsMarried feature was created relative to the changes applied to the Title feature, i.e. the referenced notebook was correct in deliberately defining the marriage status for all women with the honorific title 'Mrs'.
    - The case for men being labelled married does not exist. Perhaps this is something that could be further investigated? 
    - Alternative honorifics such as Mme, Lady, and Mlle were all replaced wholesale before accounting for what they imply about marriage status.
        - Countess, Lady?, Dona (Senora)
            - Reference: https://en.wikipedia.org/wiki/English_honorifics
            - Lady: for female peers with the rank of baroness, viscountess, countess, and marchioness, or the wives of men who hold the equivalent titles. By courtesy the title is often also used for wives of Knights and Baronets. 
            - Implication of marriage status
            - Despite the research, these labels appear to be unreliable in the implication of a woman's marriage status. They appear to be more reliable for implying class status. Therefore, it is perhaps correct to operate using just the English differentiation of Mrs vs Ms for interpreting marriage statuses. Although it is incomplete, it serves as another layer of detail for differentiating between the women who survive.
        - Mlle (Mademoiselle) ie unmarried woman
        

In [None]:
# Peek at the first ten raw Name strings before extracting a title from them
full_data['Name'].head(10)

In [None]:
# Title = the text after the first ', ' in Name, cut off at the following '.'
after_surname = full_data['Name'].str.split(', ', expand=True)[1]
full_data['Title'] = after_surname.str.split('.', expand=True)[0]
full_data.head()

In [None]:
# Distinct Title values produced by the extraction
full_data['Title'].unique()

In [None]:
# Collapse rare titles into broader groups and plot the Title counts before and after.
#
# The raw titles are re-derived from Name (same split as the Title extraction) instead of
# being read back from full_data['Title'], so re-running this cell still shows the
# ungrouped distribution in the top panel rather than the already-grouped column twice.
raw_titles = full_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# Grouping taken from the referenced notebook; titles not listed here are left unchanged.
title_groups = {
    'Miss/Mrs/Ms': ['Miss', 'Mrs', 'Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'],
    'Dr/Military/Noble/Clergy': ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'],
}
title_map = {raw: grouped for grouped, raws in title_groups.items() for raw in raws}
full_data['Title'] = raw_titles.replace(title_map)

# Compute each distribution once instead of calling value_counts() twice per panel.
counts_before = raw_titles.value_counts()
counts_after = full_data['Title'].value_counts()

fig, axs = plt.subplots(nrows=2, figsize=(20, 20))
sns.barplot(x=counts_before.index, y=counts_before.values, ax=axs[0])
sns.barplot(x=counts_after.index, y=counts_after.values, ax=axs[1])

axs[0].set_title('Title Feature Value Counts', size=20, y=1.05)
axs[1].set_title('Title Feature Value Counts After Grouping', size=20, y=1.05)

# The top panel has many more categories, hence the smaller x tick labels.
axs[0].tick_params(axis='x', labelsize=10)
axs[1].tick_params(axis='x', labelsize=15)
for ax in axs:
    ax.tick_params(axis='y', labelsize=15)

plt.show()

In [None]:
# def extract_surname(data):    
    
#     families = []
    
#     for i in range(len(data)):        
#         name = data.iloc[i]

#         if '(' in name:
#             name_no_bracket = name.split('(')[0] 
#         else:
#             name_no_bracket = name
            
#         family = name_no_bracket.split(',')[0]
#         title = name_no_bracket.split(',')[1].strip().split(' ')[0]
        
#         for c in string.punctuation:
#             family = family.replace(c, '').strip()
            
#         families.append(family)
            
#     return families

# full_data['Family'] = extract_surname(full_data['Name'])
# df_train = full_data.loc[:890]
# df_test = full_data.loc[891:]
# dfs = [df_train, df_test]

# Analysis of Models

## Data Pre-Imputation and Feature Engineering

In [None]:
from pycaret import classification

### Data Pipeline Setup

In [None]:
# Report the dimensions of the frames used in this section.
# The heading promises shapes, so print .shape and not the DataFrames themselves
# (which dumped every row of both frames into the output).
print("Data Shape")
print("Test:", test.shape, "Train:", train.shape)

### Overview of Models

In [None]:
# Register the raw training frame with pycaret, using "Survived" as the target.
# session_id pins pycaret's random seed so the train/validation split and the
# cross-validation results in the cells below are reproducible on Restart & Run All.
classification_setup = classification.setup(data=train, target="Survived", session_id=42)

### Model Benchmarks

In [None]:
# Benchmark the available classifiers via pycaret's compare_models on the data
# registered by setup(); whatever it returns is kept as `baseline`.
baseline = classification.compare_models()
# print(type(baseline))
# baseline

In [None]:
# XGBoost: build the model first, then hand that estimator to the tuner
classification_xgb = classification.create_model(estimator='xgboost')
tune_xgb = classification.tune_model(estimator=classification_xgb)


In [None]:
# LightGBM: build the model first, then hand that estimator to the tuner
classification_lightgbm = classification.create_model(estimator='lightgbm')
tune_lightgbm = classification.tune_model(estimator=classification_lightgbm)


In [None]:
# ROC-AUC curve for the tuned LightGBM model.
# 'auc' is plot_model's default for classifiers; it is spelled out because this is
# not a residual plot (residuals are a regression diagnostic).
classification.plot_model(tune_lightgbm, plot='auc')


In [None]:
# Class prediction error plot for the tuned LightGBM model
classification.plot_model(estimator=tune_lightgbm, plot='error')


In [None]:
# Feature importance plot for the tuned LightGBM model
classification.plot_model(estimator=tune_lightgbm, plot='feature')


In [None]:
# Evaluation view for the tuned LightGBM model
classification.evaluate_model(estimator=tune_lightgbm)
# NOTE(review): evaluation and plots use tune_lightgbm, but the predictions below
# come from tune_xgb; confirm that mixing the two models is intended.
predictions = classification.predict_model(estimator=tune_xgb, data=test)
# show the prediction frame returned by predict_model
predictions


## Data Post-Imputation and Feature Engineering

### Data Pipeline Setup

In [None]:
# Heading line followed by the recorded test/train shapes
print("Data Shape", f"Test: {test_shape} Train: {train_shape}", sep="\n")

In [None]:
# Split the engineered frame back into its train and test parts.
# assumes full_data stacks the train rows first, followed by the test rows; TODO confirm
# against the cell that builds full_data.
#
# The split is positional (iloc), so it does not depend on full_data carrying a clean
# 0..n-1 index, and .copy() makes each part an independent frame rather than a slice
# of full_data, so later edits to one cannot silently touch the other.
n_train = train_shape[0]
df_train = full_data.iloc[:n_train].copy()
df_test = full_data.iloc[n_train:].copy()

# Fails loudly if full_data has fewer rows than the original training set.
assert len(df_train) == n_train, "full_data has fewer rows than the original training set"


In [None]:
# df_test.info()

### Overview of Models

In [None]:
# Register the training data with pycaret, using "Survived" as the target.
# session_id pins pycaret's random seed so the split and cross-validation results
# in the cells below are reproducible on Restart & Run All.
# NOTE(review): this section builds df_train/df_test from full_data, yet setup still
# receives the raw `train` frame, so the engineered features are not used here. Confirm
# whether data=df_train was intended (the later predict_model call would then need the
# matching engineered test frame as well).
classification_setup = classification.setup(data=train, target="Survived", session_id=42)

### Model Benchmarks

In [None]:
# Benchmark the available classifiers via pycaret's compare_models on the data
# registered by the setup() call above; the return value is shown as the cell output.
classification.compare_models()


In [None]:
# XGBoost for this section: create the model, then pass it to the tuner
classification_xgb = classification.create_model(estimator='xgboost')
tune_xgb = classification.tune_model(estimator=classification_xgb)


In [None]:
# LightGBM for this section: create the model, then pass it to the tuner
classification_lightgbm = classification.create_model(estimator='lightgbm')
tune_lightgbm = classification.tune_model(estimator=classification_lightgbm)


In [None]:
# ROC-AUC curve for the tuned LightGBM model.
# 'auc' is plot_model's default for classifiers; it is spelled out because this is
# not a residual plot (residuals are a regression diagnostic).
classification.plot_model(tune_lightgbm, plot='auc')


In [None]:
# Class prediction error plot for the tuned LightGBM model
classification.plot_model(estimator=tune_lightgbm, plot='error')


In [None]:
# Feature importance plot for the tuned LightGBM model
classification.plot_model(estimator=tune_lightgbm, plot='feature')


In [None]:
# Evaluation view for the tuned LightGBM model
classification.evaluate_model(estimator=tune_lightgbm)
# NOTE(review): evaluation uses tune_lightgbm while the predictions below come from
# tune_xgb, and they are made on the raw `test` frame rather than df_test; confirm
# both choices are intended for this post-feature-engineering section.
predictions = classification.predict_model(estimator=tune_xgb, data=test)
# show the prediction frame returned by predict_model
predictions
