In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# ipython magic function
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


### Acquire data

In [None]:
# Read the training and test CSV files into pandas DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
# Keep both frames in one list so later cells can loop over them together.
combine = [train_df, test_df]

### Analyze by describing data

In [None]:
# List the column names present in the training frame.
column_names = train_df.columns.values
print(column_names)

In [None]:
# Show the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Show the last five rows of the training frame.
train_df.tail(n=5)

In [None]:
# Print the pandas structural summary of each frame, with a rule between them.
separator = '_' * 40
train_df.info()
print(separator)
test_df.info()

In [None]:
# Default descriptive statistics for the training frame.
# Follow-up ideas using the `percentiles` argument of `describe`:
#   - Survived: `percentiles=[.61, .62]` (the problem description mentions a 38% survival rate)
#   - Parch: `percentiles=[.75, .8]`
#   - SibSp: `percentiles=[.68, .69]`
#   - Age and Fare: `percentiles=[.1, .2, .3, .4, .5, .6, .7, .8, .9, .99]`
train_df.describe()

Pandas [describe function documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html).

In [None]:
# Describe only the object-dtype columns ('O' is the NumPy type code for object).
train_df.describe(include=['O'])

In [None]:
# Describe only the string (object-dtype) columns.
# Uses the builtin `object`: the `np.object` alias was deprecated in NumPy 1.20
# and removed in 1.24, where accessing it raises AttributeError.
train_df.describe(include=[object])

In [None]:
# Describe only the numeric columns (`np.number` selects every numeric dtype).
train_df.describe(include=[np.number])

### Analyze by pivoting features

In [None]:
# Mean of Survived for each Pclass value, largest mean first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each Sex value, largest mean first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each SibSp value, largest mean first.
(
    train_df[["SibSp", "Survived"]]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each Parch value, largest mean first.
(
    train_df[["Parch", "Survived"]]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

### Analyze by visualizing data

In [None]:
# One histogram of Age (20 bins) per value of Survived, laid out in columns.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [None]:
# Age histograms on a grid: one row per Pclass, one column per Survived value.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
# Trailing semicolon suppresses the returned object's repr in the cell output.
grid.add_legend();

In [None]:
# Point plot of Survived against Pclass, coloured by Sex, one row per Embarked value.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)

# FacetGrid.map calls the plotting function once per facet on that facet's subset,
# so without a fixed `order`/`hue_order` the category positions and colours can
# differ between facets (seaborn warns about exactly this). Derive both orders
# from the full training frame so every facet is drawn consistently.
pclass_order = sorted(train_df['Pclass'].dropna().unique())
sex_order = sorted(train_df['Sex'].dropna().unique())

grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep',
         order=pclass_order, hue_order=sex_order)
grid.add_legend()

In [None]:
# Bar plot of Fare by Sex on a grid: one row per Embarked, one column per Survived value.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)

# Fix the category order from the full frame so each facet places the Sex
# values at the same x positions, whatever subset that facet receives.
sex_order = sorted(train_df['Sex'].dropna().unique())

# NOTE(review): `ci=None` is kept for compatibility with older seaborn; newer
# releases prefer `errorbar=None` — confirm against the pinned seaborn version.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=sex_order)
grid.add_legend()

### Wrangle data

In [None]:
# Correct the data by dropping features; report the shapes before and after.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

# Remove Ticket and Cabin from both frames. `drop` returns new frames, so
# `combine` is rebuilt to reference them.
dropped_columns = ['Ticket', 'Cabin']
train_df = train_df.drop(columns=dropped_columns)
test_df = test_df.drop(columns=dropped_columns)
combine = [train_df, test_df]

print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

### Feature engineering

Pandas [extract function documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html).

Pandas [cross-tabulation function documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html).

In [None]:
# Add a Title column to each frame: the run of letters that follows a space and
# is immediately followed by a period in Name.
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal it is an invalid escape sequence (DeprecationWarning,
# and SyntaxWarning from Python 3.12).
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against Sex in the training frame.
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Titles that are collapsed into the single 'Rare' category.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Build NewTitle from Title: group the rare titles, then fold the variant
# spellings ('Mlle', 'Ms' -> 'Miss'; 'Mme' -> 'Mrs') into their common forms.
for dataset in combine:
    dataset['NewTitle'] = (
        dataset['Title']
        .replace(rare_titles, 'Rare')
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace('Mme', 'Mrs')
    )

# Mean of Survived for each consolidated title in the training frame.
train_df[['NewTitle', 'Survived']].groupby(['NewTitle'], as_index=False).mean()



In [None]:
# Convert the categorical titles to integer codes (comparable to a factor in R).
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
for dataset in combine:
    # `map` yields NaN for any title missing from `title_mapping`. Fill those
    # with 0 *before* casting: calling `.astype(int)` first raises on NaN, so
    # the fill would never get a chance to run.
    dataset['NewTitle'] = dataset['NewTitle'].map(title_mapping).fillna(0).astype(int)

train_df.head()

In [None]:
# Drop Name from both frames and PassengerId from the training frame only.
# NOTE(review): PassengerId stays on test_df — presumably needed later to label
# predictions; confirm against downstream cells.
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
combine = [train_df, test_df]
(train_df.shape, test_df.shape)

In [None]:
# Encode the categorical Sex column as integers in both frames.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

train_df.head(n=5)