# Explore feature-to-feature relationships in the Titanic dataset

In [4]:
import pandas as pd
import seaborn as sns
from sklearn import datasets
from discover_feature_relationships import discover
import matplotlib.pyplot as plt

In [5]:
# watermark is optional - it records the versions of the installed libraries,
# which is useful to quote when you submit bug reports to projects.
# Install watermark with:
#   pip install watermark
# NOTE(review): the route originally suggested here,
#   %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
# relies on a magic that recent IPython releases no longer appear to ship - confirm before using it.
%load_ext watermark
# Show a watermark for this environment: date (-d), machine info (-m),
# Python/IPython versions (-v), the listed package versions (-p) and the git hash (-g).
%watermark -d -m -v -p numpy,matplotlib,sklearn -g

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
2019-01-10 

CPython 3.6.6
IPython 7.2.0

numpy 1.15.4
matplotlib 3.0.2
sklearn 0.20.2

compiler   : GCC 4.8.2 20140120 (Red Hat 4.8.2-15)
system     : Linux
release    : 4.19.8-041908-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : a2d07909b67f4845f1861dc6e0a83c7e80bb32eb


# Load the Titanic dataset

In [7]:
# Read the training CSV and append SibSpParch, the row-wise sum of SibSp and Parch.
df = pd.read_csv("titanic_train.csv").assign(
    SibSpParch=lambda frame: frame["SibSp"] + frame["Parch"]
)

# Columns handed to the relationship analysis.
cols = [
    'Pclass', 'SibSp', 'Parch', 'SibSpParch',
    'Age', 'Fare',
    'Sex', 'Embarked', 'Survived',
]
# Passed to discover() as its second argument.
# NOTE(review): presumably the columns to model with a classifier - confirm against discover's docs.
classifier_overrides = {'Embarked', 'Survived', 'Sex'}

df.head()

FileNotFoundError: File b'titanic_train.csv' does not exist

# Discover non-linear relationships

_GitHub note_: the colours produced by the pandas `style` output don't show up on GitHub; you'll have to grab a local copy of the notebook to see them.

* Fare predicts Embarked but Embarked poorly predicts Fare
* Fare predicts Pclass but Pclass poorly predicts Fare
* SibSp and Parch both predict SibSpParch (the sum of both), SibSpParch predicts both of its components
* Sex (a text column) predicts Survived

In [None]:
# Pass in only the subset of columns we're interested in (analysing every
# column would work too, but gives a bigger output).
# sample(frac=1) keeps every row but shuffles the row order; random_state pins
# that shuffle so a fresh Restart & Run All hands discover() the same ordering.
df_shuffled = df[cols].sample(frac=1, random_state=0)
df_results = discover.discover(df_shuffled, classifier_overrides)

# Reshape the long-form results into a target-by-feature grid of scores.
# Cells without a score are filled with 1.
# NOTE(review): presumably these are the feature == target diagonal - confirm against discover's output.
score_grid = df_results.pivot(index='target', columns='feature', values='score').fillna(1)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(score_grid, annot=True, center=0, ax=ax, vmin=-0.1, vmax=1, cmap="viridis");

In [None]:
# Alternative to the heatmap: the same pivoted scores rendered as a styled table.
# NOTE(review): left commented out - presumably because the style colours
# do not render on GitHub; confirm before re-enabling.
#df_results.pivot(index='target', columns='feature', values='score').fillna(1) \
#.style.background_gradient(cmap="viridis", low=0.3, high=0.0, axis=1) \
#.set_precision(2)

## Check 2D relationships using a scatter plot

In [None]:
# Fare against SibSpParch; the low alpha lets overlapping points build up visible density.
df.plot.scatter(x="Fare", y="SibSpParch", alpha=0.1);

### Add a regression plot to show the direction of the relationship

In [None]:
# Same pair of columns, drawn with seaborn's regplot (scatter plus fitted regression line).
sns.regplot(x="Fare", y="SibSpParch", data=df);

## Scatter for Pclass (a categorical) is more sensibly represented using a box plot

In [None]:
# Fare against Pclass as a plain scatter, for comparison with the box plot in the next cell.
df.plot.scatter(x="Fare", y="Pclass", alpha=0.1);

In [None]:
# Arguments shared by both layers: horizontal layout with classes listed as 3, 2, 1.
pclass_plot_args = dict(data=df, x="Fare", y="Pclass", orient="h", order=[3, 2, 1])
# Box plot per class, then the individual fares overlaid on the same axes as
# jittered, semi-transparent points.
pclass_ax = sns.boxplot(**pclass_plot_args)
sns.stripplot(**pclass_plot_args, jitter=True, alpha=0.5, ax=pclass_ax);

## Plot Fare vs Embarked

In [None]:
# Arguments shared by both layers: Fare on the horizontal axis, one row per Embarked value.
embarked_plot_args = dict(data=df, x="Fare", y="Embarked", orient="h")
# Box plot per Embarked value, then the individual fares overlaid on the same
# axes as jittered, semi-transparent points.
embarked_ax = sns.boxplot(**embarked_plot_args)
sns.stripplot(**embarked_plot_args, jitter=True, alpha=0.5, ax=embarked_ax);