[TITANIC - Neural Networks for Beginners](https://www.kaggle.com/duttasd28/titanic-neural-networks-for-beginners)

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive so the data files stored there can be
# read like local files. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
cd "/content/gdrive/My Drive/Colab Notebooks/Deep_Learning_study/Kaggle/1_Titanic"

## Importing the data

In [None]:
import pandas as pd

In [None]:
# Load both competition CSVs from the current working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Keep the test-set passenger IDs: the column is dropped from test_data in the
# next cell, but it is needed again when the submission file is written.
PassengerId = test_data["PassengerId"]
print(PassengerId)

# Q: how do dataframe["Column"] and dataframe.Column differ?
# A: both return the same Series. Attribute access only works when the column
#    name is a valid Python identifier that does not clash with an existing
#    DataFrame attribute or method; bracket access always works.

In [None]:
# Drop the same four columns from both frames, in place.
for frame in (train_data, test_data):
    frame.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'], inplace=True)


#### DataFrame.drop()

- 특정한 columns이나 rows를 뺀다
- 사용법: `DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')`
- labels: single label or list-like. 뺄 column labels나 Index
- axis: {0 or 'index', 1 or 'columns'}, default 0. index를 뺄건지, columns를 뺄건지
- index: single label or list-like. `labels, axis=0` 은 `index=labels` 와 같다
- columns: single label or list-like. `labels, axis=1` 은 `columns=labels` 와 같다
- inplace: bool, default False. False면 복사본을 반환하고, True면 dataframe 안에서 실행한다
- errors: {'ignore', 'raise'}, default 'raise'. ignore이면 에러를 무시하고, 존재하는 labels 만 뺀다

## Null Value Management

In [None]:
# For every training column, report whether it contains at least one missing value.
train_has_missing = train_data.isna().any()
print(train_has_missing)

#### DataFrame.isnull()
- 없는 값을 찾는다
- boolean 값을 같은 크기의 dataframe으로 반환한다.
- DataFrame.isnull: Alias of isna.
- DataFrame.notna: Boolean inverse of isna.
- DataFrame.dropna: Omit axes labels with missing values.
- isna: Top-level isna.

#### DataFrame.any()
- 어떤 element 라도 True 인지 반환합니다.
- 사용법: `DataFrame.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs)`
- axis: {0 or 'index', 1 or 'columns', None}, default 0. 0이면 column 단위로, 1이면 index 단위로, None 이면 스칼라로 반환
- bool_only: bool, default None. boolean columns만 포함합니다. None이면 모든것을 사용하도록 시도하고, 그 후 boolean data만 사용합니다.
- skipna: bool, default True. NA/null 값을 배제합니다. 
- level: int or level name, default None.
- 참고(See also): numpy.any, Series.any, Series.all, DataFrame.any, DataFrame.all

In [None]:
# For every test column, report whether it contains at least one missing value.
test_has_missing = test_data.isna().any()
print(test_has_missing)

## Split into independent and dependent features

In [None]:
# Target: the Survived column. Features: every other remaining column.
y = train_data["Survived"]
X = train_data.drop(columns=['Survived'])

for preview in (y.head(), X.head()):
    print(preview)

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### 1. BarPlot

In [None]:
# Bar chart of how many passengers fall into each target class.
# Labels and heights are both taken from the same value_counts() result so each
# bar always carries its own label: unique() is ordered by first appearance
# while value_counts() is ordered by frequency, so pairing the two can mislabel.
class_counts = y.value_counts()
sns.barplot(x=class_counts.index, y=class_counts.values);

### 2. PairPlot
- 사용법: `seaborn.pairplot(data, hue=None, hue_order=None, palette=None, vars=None, x_vars=None, y_vars=None, kind='scatter', diag_kind='auto', markers=None, height=2.5, aspect=1, corner=False, dropna=True, plot_kws=None, diag_kws=None, grid_kws=None, size=None)`

In [None]:
# Pairwise scatter plots of the numeric columns (lower triangle only).
# `palette` only colours the levels of `hue`, so without a hue variable the
# 'summer' palette had no effect; colour the points by the target instead.
sns.pairplot(data=train_data, hue='Survived', corner=True, palette='summer');

## Training Phase

In [None]:
# Name the pieces used by the rest of the pipeline.
X_train = X
X_test = test_data
y_train = y
y_test = None  # no labels are available for the competition test set here

In [None]:
# Give all three objects a fresh 0..n-1 index (old index discarded) so the
# index-based joins performed in later cells line up row by row.
for indexed in (X_train, X_test, y_train):
    indexed.reset_index(drop=True, inplace=True)

#### DataFrame.reset_index
- index를 리셋
- 사용법: `DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')`
- level: int, str, tuple, or list, default None


In [None]:
# Split the feature columns by dtype: everything non-numeric is treated as
# categorical. Selecting by "not a number" instead of comparing dtypes to
# 'object' also catches columns stored with pandas' dedicated string dtype,
# which would otherwise be sent to the numeric imputer.
categorical_cols = list(X_train.select_dtypes(exclude='number').columns)

numerical_cols = [col for col in X_train.columns if col not in categorical_cols]
print(numerical_cols)

## Filling NULL Values with KNN

1. KNNImputer imputes (fills in) missing values using the k-nearest-neighbours method.
2. For each sample with a missing value, it finds the k nearest samples (5 by default) and fills the gap with the mean of their values for that feature.

In [None]:
from sklearn.impute import KNNImputer

# One imputer with default settings: fitted on the training numerics only,
# then reused unchanged to fill the test numerics.
nm_imputer = KNNImputer()

train_numeric_filled = nm_imputer.fit_transform(X_train[numerical_cols])
X_train_numerical = pd.DataFrame(train_numeric_filled, columns=numerical_cols)

test_numeric_filled = nm_imputer.transform(X_test[numerical_cols])
X_test_numerical = pd.DataFrame(test_numeric_filled, columns=numerical_cols)

### KNNImputer
- k-Nearest Neighbors 를 이용하여 없는 값 채우기
- fit_transform(X, y=None, **fit_params): imputer를 데이터에 맞추고, 변환하기
- transform(X): 모든 없는 값 채우기

In [None]:
# Swap the original numeric columns for their imputed versions
# (the join aligns rows on the index).
X_train = X_train.drop(columns=numerical_cols).join(X_train_numerical)
X_test = X_test.drop(columns=numerical_cols).join(X_test_numerical)

# Columns that still contain missing values show up as True.
train_still_missing = X_train.isna().any()
print(train_still_missing)

In [None]:
# Show the dtype of each training column after the numeric imputation.
X_train.dtypes

### Simple Imputer
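SimpleImputer fills missing values in each column using one simple strategy; here `most_frequent` replaces them with the column's most common value, which also works for categorical (string) columns.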

In [None]:
from sklearn.impute import SimpleImputer

# Rebinds nm_imputer to a most-frequent-value imputer for the categorical
# columns: fitted on the training set, then applied to the test set.
nm_imputer = SimpleImputer(strategy='most_frequent')

train_categorical_filled = nm_imputer.fit_transform(X_train[categorical_cols])
X_train_categorical = pd.DataFrame(train_categorical_filled, columns=categorical_cols)

test_categorical_filled = nm_imputer.transform(X_test[categorical_cols])
X_test_categorical = pd.DataFrame(test_categorical_filled, columns=categorical_cols)

In [None]:
# Swap the original categorical columns for their imputed versions
# (the join aligns rows on the index).
X_train = X_train.drop(columns=categorical_cols).join(X_train_categorical)
X_test = X_test.drop(columns=categorical_cols).join(X_test_categorical)

# Columns that still contain missing values show up as True.
train_still_missing = X_train.isna().any()
print(train_still_missing)

In [None]:
# Preview the training features after both imputation steps.
print(X_train.head())

## One Hot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding; categories unseen during fit are encoded as all
# zeros instead of raising. The `sparse` argument was renamed `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back to the old one for earlier releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categories only, then apply the same mapping to test.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[categorical_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[categorical_cols]))

# The encoder returns plain arrays, so restore the row index before joining.
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Keep only the non-categorical columns ...
num_X_train = X_train.drop(categorical_cols, axis=1)
num_X_test = X_test.drop(categorical_cols, axis=1)

# ... and append the one-hot columns in their place.
X_train = num_X_train.join(OH_cols_train, how='left')
X_test = num_X_test.join(OH_cols_test, how='left')

print(X_train.head())

## Neural Network

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=10)
X_train_2, X_val, y_train_2, y_val = split_parts

### train_test_split
- 배열이나 행렬을 임의의 train / test 부분집합으로 나눈다
- train_test_split(arrays, options)
- arrays
- test_size: float or int, default=None.
- train_size
- random_state: int or RandomState instance, default=None.
- shuffle: bool, default=True
- stratify: array-like, default=None

## Keras

In [None]:
# Import every Keras symbol from the same package. Taking `keras` from
# TensorFlow but Sequential / layers from the standalone `keras` package can
# pair a model from one implementation with losses and metrics from another.
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout, Dense

### 1. Create Model

In [None]:
# Fully connected binary classifier: three ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
# The input width is read from the prepared feature matrix instead of being
# hard-coded, so the model still builds if the preprocessing yields a
# different number of columns.
n_features = X_train.shape[1]

model = Sequential()

model.add(Dense(128, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

### 2. Compile It

In [None]:
# Adam optimizer, binary cross-entropy loss, AUC reported as the metric.
bce_loss = keras.losses.BinaryCrossentropy()
auc_metric = keras.metrics.AUC()
model.compile(optimizer='adam', loss=bce_loss, metrics=[auc_metric])

In [None]:
# Inspect the validation features before training.
print(X_val)

## Initial Test to see how well the model is performing

In [None]:
# Trial run on the 80% split, scoring the held-out validation rows each epoch.
fit_options = dict(batch_size=8, epochs=25, validation_data=(X_val, y_val))
history = model.fit(X_train_2, y_train_2, **fit_options)

## Train with all

In [None]:
# Recompile with the same settings, then train on the full training set.
# NOTE(review): `model` is the same object trained in the previous cell, so
# this presumably continues from those weights rather than starting fresh —
# confirm that is intended.
final_loss = keras.losses.BinaryCrossentropy()
final_metrics = [keras.metrics.AUC()]
model.compile(optimizer='adam', loss=final_loss, metrics=final_metrics)

history = model.fit(X_train, y_train, batch_size=4, epochs=32)

## Generate Predictions

In [None]:
# Turn the sigmoid probabilities into 0/1 class labels.
# Sequential.predict_classes() was removed in TensorFlow 2.6; thresholding the
# output of predict() at 0.5 is the documented replacement for a
# single-unit sigmoid output layer.
y_preds = (model.predict(X_test) > 0.5).astype('int32')

In [None]:
# Display the predicted labels.
y_preds

## Generating submission File

In [None]:
import os

In [None]:
# Build the two-column submission table (PassengerId, Survived) and write it
# to CSV without the row index.
file_name = "MyTitanicSubmission.csv"

# Collapse the predictions to one dimension so they fit in a Series.
y_pred_series = pd.Series(y_preds.reshape(-1), name='Survived')

submission_columns = [PassengerId, y_pred_series]
file = pd.concat(submission_columns, axis='columns')

file.to_csv(file_name, index=False)