### Import packages.

In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

from src.eda import eda_utils
from src.features import engineer_feats

sns.set()
%matplotlib inline

warnings.filterwarnings('ignore')

### Read in data.

In [None]:
# Load the raw train and test CSVs (paths are relative to the working directory).
df_trn, df_tst = map(pd.read_csv, ('../data/raw/train.csv', '../data/raw/test.csv'))

# Report how many rows each split contains.
print('Number of trn samples: {}'.format(df_trn.shape[0]))
print('Number of tst samples: {}'.format(df_tst.shape[0]))

## 1. Exploratory Data Analysis
| *Variable*    | *Definition*          | *Key*                   | *Data Type* |
| -----------   | --------------------- | ----------------------- | ----------- |
| PassengerId   | Unique identifier     |                         | `int`       |
| Pclass        | Ticket class          | 1: 1st, 2: 2nd, 3: 3rd  | `int`       |
| Name          | Name                  |                         | `str`       |
| Sex           | Sex                   |                         | `str`       |
| Age           | Age in years          |                         | `float`     |
| SibSp         | # siblings / spouses  |                         | `int`       |
| Parch         | # parents / children  |                         | `int`       |
| Ticket        | Ticket number         |                         | `str`       |
| Fare          | Passenger fare        |                         | `float`     |
| Cabin         | Cabin number          |                         | `str`       |
| Embarked      | Port of Embarkation   | C, Q, S: 3 unique ports | `str`       |
| Survived      | Survival              | 0 = No, 1 = Yes         | `bin`       |

In [None]:
# Column dtypes and non-null counts for the training frame.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0 (where passing it raises TypeError).
df_trn.info(show_counts=True)

In [None]:
# Summary statistics for the training frame; as the cell's last expression
# the resulting table is rendered as the cell output.
df_trn.describe()

In [None]:

# Disabled by default: uncomment the call below to generate the profile.
# If the profile does not render here in the notebook, the HTML report is
# written to ../reports/profile_titanic.html.
# (Kept as real comments rather than a bare triple-quoted string, which the
# notebook would echo back as the cell's output.)
#
# eda_utils.pandas_profile(df_trn,
#                          title="Profiling Titanic Dataset",
#                          output_file="../reports/profile_titanic.html")

In [None]:
# Disabled by default: uncomment the call below to run it against "Survived".
# (Kept as real comments rather than a bare triple-quoted string, which the
# notebook would echo back as the cell's output.)
#
# eda_utils.generate_pps(df_trn,
#                        target="Survived")

In [None]:
# Disabled by default: uncomment the call below to run it against "Survived".
# (Kept as real comments rather than a bare triple-quoted string, which the
# notebook would echo back as the cell's output.)
#
# eda_utils.autovisualize(df_trn, target="Survived")

## 2. Feature Engineering

#### Data type conversion.
Replace string feature values with numeric values.

In [None]:
# Apply the same string -> numeric conversion to the training frame first,
# then to the test frame.
df_trn, df_tst = map(engineer_feats.str_to_numeric, (df_trn, df_tst))

#### Data Imputation: Age (both trn and tst)

In [None]:
# Impute Age in both frames via the project helper; both frames are replaced
# by the returned versions.
# NOTE(review): presumably the fill statistics are derived from df_trn only —
# confirm in engineer_feats.impute_age.
df_trn, df_tst = engineer_feats.impute_age(df_trn, df_tst)

#### Data Imputation: Cabin (both trn and tst)

Cabin is quite highly correlated with `Survived`, so we cannot drop it despite the high frequency of missing values. We will impute the missing values after finding the most significantly correlated feature(s).

In [None]:
# Impute Cabin in both frames via the project helper; both frames are replaced
# by the returned versions.
# NOTE(review): which correlated feature(s) drive the imputation is decided
# inside engineer_feats.impute_cabin and is not visible here — confirm there.
df_trn, df_tst = engineer_feats.impute_cabin(df_trn, df_tst)

#### Data Imputation: Embarked (trn)

Only 2 missing in `df_trn`. None missing in `df_tst`.

In [None]:
# Impute Embarked on the training frame only; df_tst is not touched here.
# NOTE(review): the fill rule lives in engineer_feats.impute_embarked — confirm there.
df_trn = engineer_feats.impute_embarked(df_trn)

#### Data Imputation: Fare (tst)
Only `df_tst` has missing `Fare` value(s).  
`df_trn` is also passed in because the imputation statistics must be computed from the training distribution only, so that no test-set information leaks into the fill value.

In [None]:
# Impute Fare on the test frame; df_trn is passed as an input and only df_tst
# is reassigned from the result.
# NOTE(review): assumes impute_fare does not modify df_trn in place — confirm.
df_tst = engineer_feats.impute_fare(df_trn, df_tst)

## 3. Manifold Visualization

### 3.1 PCA

In [None]:
# PCA view of the training frame via the project helper.
# NOTE(review): df_trn still carries "Name", "Ticket" and "Survived" at this
# point (they are only dropped/split in section 4) — confirm eda_utils.pca
# handles or excludes those columns.
eda_utils.pca(df_trn)

### 3.2 t-SNE

In [None]:
# t-SNE view of the training frame via the project helper.
# NOTE(review): df_trn still carries "Name", "Ticket" and "Survived" at this
# point (they are only dropped/split in section 4) — confirm eda_utils.tsne
# handles or excludes those columns.
eda_utils.tsne(df_trn)

## 4. Learning Baselines 

**TODO:** Break this out into a preprocessing function.

In [None]:
# Drop the "Name" and "Ticket" columns from both frames.
# errors="ignore" keeps this cell re-runnable: df_trn/df_tst are overwritten
# here, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
df_trn = df_trn.drop(columns=["Name", "Ticket"], errors="ignore")
df_tst = df_tst.drop(columns=["Name", "Ticket"], errors="ignore")

# Separate the feature columns from the "Survived" target.
# NOTE(review): PassengerId is not dropped in this cell — confirm it was
# removed upstream or is really intended as a model feature.
trn_X, trn_Y = df_trn.drop(columns=["Survived"]), df_trn["Survived"]

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible across runs.
trn_X, val_X, trn_Y, val_Y = train_test_split(trn_X, trn_Y,
                                              test_size=0.20,
                                              random_state=42)

# Hand plain NumPy arrays (rather than pandas objects) to the baseline step.
trn_X, trn_Y = trn_X.to_numpy(), trn_Y.to_numpy()
val_X, val_Y = val_X.to_numpy(), val_Y.to_numpy()

**TODO:** Break this out into a baseline function.

In [None]:
# Fit the LazyClassifier baselines on the training split and evaluate them
# on the held-out validation split.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(trn_X, val_X, trn_Y, val_Y)

# Leave the results as the cell's last expression so the notebook renders
# them with rich display instead of print()'s plain-text dump.
models