In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the interpreter's version string so the runtime behind these results is on record
import sys
sys.version

# Table of Contents
1. Overview
1. System Setup
1. Exploratory Data Analysis
1. Preprocessing
1. Training Models
1. Evaluate -> Tune -> Ensemble
1. Conclusion

# 3. Exploratory Data Analysis
Credits: [Ken Jee](https://www.kaggle.com/kenjee/titanic-project-example), [Emanuele Panizio](https://www.kaggle.com/emanuelepanizio/pytanicnb/notebook), [Ju Liu](https://www.youtube.com/watch?v=fS70iptz-XU&t=2569s)

References: [Data](https://www.kaggle.com/c/titanic/data)

Functions to try out:
* create subplots 
* seaborn and plt plots
* sns.barplot
* sns.countplot
* sns.catplot
* sns.displot
* pivot_table
* plot fares as line and bins

In [None]:
# Read the train and test CSVs, using the passenger id column as the row index of both
read_options = {'index_col': 'PassengerId'}

train = pd.read_csv('/kaggle/input/titanic/train.csv', **read_options)
test = pd.read_csv('/kaggle/input/titanic/test.csv', **read_options)

In [None]:
# Preview the first 10 rows of the training frame
train.head(10)

In [None]:
# Summarise both splits: entry counts, columns, non-null counts (i.e. missing values) and dtypes

train.info()
print('')  # blank line separating the two reports
test.info()

In [None]:
# Summary statistics for the numeric columns of the training frame

train.describe()

In [None]:
# Names of the columns that describe() summarised above
train.describe().columns

In [None]:
# Split the training frame into the listed numeric columns and everything else
numeric_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

df_num = train[numeric_cols]
df_cat = train.drop(columns=numeric_cols)
df_cat

In [None]:
# Gender breakdown: absolute counts first, then the same counts as proportions of the total
sex = train['Sex']
print(sex.value_counts(), sex.value_counts(normalize=True), sep='\n\n')

In [None]:
# Mean of `Survived` (i.e. survival rate) per category, one panel per discrete feature.
rate_features = ['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch']

fig, ax = plt.subplots(2, 3, figsize=(15,10))
panels = ax.ravel()  # row-major: [0,0], [0,1], [0,2], [1,0], ...
for panel, feature in zip(panels, rate_features):
    sns.barplot(x=feature, y='Survived', data=train, ax=panel)
    panel.set_title(f'Survival rate by {feature}')

# The grid has more slots than features; hide the leftovers instead of
# rendering empty axes.
for panel in panels[len(rate_features):]:
    panel.set_visible(False)

fig.tight_layout()

In [None]:
# Passenger counts by sex (left panel) and by ticket class (right panel)
fig, ax = plt.subplots(1, 2, figsize=(10,4))
sex_panel, pclass_panel = ax
sns.countplot(x='Sex', data=train, ax=sex_panel)
sns.countplot(x='Pclass', data=train, ax=pclass_panel)

# 4. Preprocessing Data
* Test a quick and dirty data set
* Pull out the Mr/Mrs titles for data processing
* Categorical data must be turned into numerical data with [One Hot Encoding or Get Dummies](https://towardsdatascience.com/what-is-one-hot-encoding-and-how-to-use-pandas-get-dummies-function-922eb9bd4970)
    * 'Pclass' - use `integer encoding` or ORDINAL VALUES
    * Variables with 2 categories: get dummies with k-1 columns, for example `pd.get_dummies(df.Sex, drop_first=True)`
    * 3 or more categorical vars, `pd.get_dummies(df.Embarked, prefix='Embarked')`
    * Multiple columns `df = pd.get_dummies(df, columns=['Sex', 'Embarked'])`
    * Join the encoded features to df with concat(): `df = pd.concat([df, embarked_dummies], axis=1)`
* Create function called `wrangle()` to do all the pre-processing to `return X_train, X_test, y_train`
* Scaling vs Normalization - [article1](https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/), [article2](https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler)
* Must scale or normalize: `['Fare', 'Age']`
* Impute Age - fillna()
```
#impute nulls for continuous data 
#all_data.Age = all_data.Age.fillna(training.Age.mean())
all_data.Age = all_data.Age.fillna(training.Age.median())
#all_data.Fare = all_data.Fare.fillna(training.Fare.mean())
all_data.Fare = all_data.Fare.fillna(training.Fare.median())
```

In [None]:
# Standardising transformer intended for the continuous features
from sklearn.preprocessing import StandardScaler
# NOTE(review): instantiated here but not fitted or applied in the cells visible in this
# section; the variable is spelled `scalar` (not `scaler`) — keep that in mind when using it.
scalar = StandardScaler()

In [None]:
# Build the model-ready feature matrices: one-hot encode Sex/Embarked, drop the
# free-text columns, and fill the missing Age/Fare values.

# Imputation values come from the training split only and are reused for the
# test split, so both splits are filled with the same values and no test-set
# statistics feed into preprocessing.
age_fill = train['Age'].median()
fare_fill = train['Fare'].median()
print(age_fill)

train_df_2 = pd.get_dummies(train, columns=['Sex', 'Embarked'])
X_train = train_df_2.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
X_train['Age'] = X_train['Age'].fillna(age_fill)

test_df_2 = pd.get_dummies(test, columns=['Sex', 'Embarked'])
X_test = test_df_2.drop(columns=['Name', 'Ticket', 'Cabin'])
X_test['Age'] = X_test['Age'].fillna(age_fill)
X_test['Fare'] = X_test['Fare'].fillna(fare_fill)
# get_dummies only emits a column for categories present in the frame it is
# given, so force the test matrix onto the training columns (same set, same
# order); a dummy column absent from the test split becomes all zeros.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print(X_train.shape)
print(X_test.shape)
X_train.head()

In [None]:
# Target vector: the Survived column of the training frame
y_train = train['Survived']
y_train.shape

In [None]:
# Re-check dtypes and non-null counts of the prepared test features
X_test.info()

# 5. Train Model
* Decision Tree
* Try graphing this
``` 
import graphviz 
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("iris") 
```

## Decision Tree Classifier

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_val_score

# Fix the seed so re-running the notebook rebuilds the same tree.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

# Accuracy on the data the tree was fitted to; this is an optimistic figure.
print(f'Train data score: {clf.score(X_train, y_train)}')

# The test split has no labels in this notebook, so estimate held-out accuracy
# with 5-fold cross-validation on the training data instead.
cv_scores = cross_val_score(
    tree.DecisionTreeClassifier(random_state=0), X_train, y_train, cv=5
)
print(f'5-fold CV score: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')

In [None]:
# Predict with the fitted tree on the prepared test features
y_hat = clf.predict(X_test)
y_hat
# next step: pair these predictions with the passenger ids and write the submission file

In [None]:
# Prepare the submission: one row per test passenger with its predicted label.
# The id column must be spelled `PassengerId` (the same name the CSVs were
# indexed by on load); a differently-cased header does not match the expected
# submission columns.
submission_tree = pd.DataFrame({'PassengerId': X_test.index, 'Survived': y_hat})
submission_tree.to_csv('my_submission.csv', index=False)
submission_tree.head()