In [None]:
import pandas as pd
import seaborn as sns

# Titanic Data

### Description

The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.  On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew (32% survival rate). 

What sorts of people were more likely to survive?

In [None]:
# Read the passenger records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.head(n=5)

---

# Exploratory Data Analysis

In [None]:
import warnings

# Suppress only deprecation notices from the plotting/data libraries; other
# warning categories stay visible so real problems are not hidden.
warnings.filterwarnings("ignore", category=FutureWarning)

# Age distribution for each outcome (Survived = 0 / 1) on shared axes.
# Rows with a missing Age are dropped by seaborn. `common_norm=False`
# normalises each curve independently, which is what drawing the two groups
# in separate calls produced.
sns.kdeplot(data=df, x="Age", hue="Survived", fill=True, common_norm=False);

## Analyze by grouping features

In [None]:
# Share of survivors within each passenger class

df.groupby('Pclass')[['Survived']].mean()

In [None]:
# Share of survivors for each sex

df.groupby('Sex')[['Survived']].mean()

In [None]:
# Share of survivors by number of parents/children aboard, best rate first

df.groupby('Parch')[['Survived']].mean().sort_values('Survived', ascending=False)

In [None]:
# Share of survivors by number of siblings/spouses aboard, best rate first

df.groupby('SibSp')[['Survived']].mean().sort_values('Survived', ascending=False)


## Data Analysis: 
#### It appears that women, children, upper-class passengers, and those traveling with at least one other person (but no more than 3) had the best chances of surviving the Titanic tragedy.

---

# Feature selection
Check for missing values. Which features can be dropped?

In [None]:
# Summarise each column's dtype and non-null count to spot features with missing values.
df.info()

## Drop poor features (columns)

In [None]:
# Remove the columns that will not be used as features, then preview the result.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])
df.head()

---

# Data cleaning and transformation

## Binary encoding ("Sex")

In [None]:
# List the distinct values present in the "Sex" column before encoding it.
df.Sex.unique()

In [None]:
# Encode "Sex" as a binary feature: male -> 0, female -> 1

df = df.assign(Sex=df["Sex"].replace({'male': 0, 'female': 1}))
df.head()

## One-hot encoding ("Embarked")

In [None]:
# Replace "Embarked" with one integer 0/1 indicator column per distinct value.
df = pd.get_dummies(data=df, columns=["Embarked"], dtype=int)
df.head()

# Impute missing values ("Age")

## Impute using the mean

### The mean age of all passengers

In [None]:
# Mean age over all passengers with a known age, rounded to a whole number

round(df.Age.mean())

### Mean age of males in First Class

In [None]:
# Rounded mean age of first-class passengers with Sex == 0 (male after encoding).
Males_Pclass1_Age_mean = round(df.query("Sex == 0 and Pclass == 1")["Age"].mean())
Males_Pclass1_Age_mean

### Mean age of males in Second Class

In [None]:
# Rounded mean age of second-class passengers with Sex == 0 (male after encoding).
Males_Pclass2_Age_mean = round(df.query("Sex == 0 and Pclass == 2")["Age"].mean())
Males_Pclass2_Age_mean

### Mean age of males in Third Class

In [None]:
# Rounded mean age of third-class passengers with Sex == 0 (male after encoding).
Males_Pclass3_Age_mean = round(df.query("Sex == 0 and Pclass == 3")["Age"].mean())
Males_Pclass3_Age_mean

### Replace missing ages with their respective group's mean

In [None]:
# Fill every missing age with the rounded mean age of the passengers who share
# the same sex and passenger class. Grouping on both columns covers all
# Sex/Pclass combinations, so female passengers are imputed as well as males.
# `transform("mean")` ignores NaN ages and returns one value per row, aligned
# with `df`, which lets `fillna` pick the right group mean for each gap.
df["Age"] = df["Age"].fillna(
    df.groupby(["Sex", "Pclass"])["Age"].transform("mean").round()
)

df.head()

---

# Feature Engineering

## "AgeGroup"
### Perhaps create an "AgeGroup" feature by grouping "Age" within bands (discretization).

In [None]:
# Peek at the first ten ages before binning them.
df["Age"].iloc[:10]

In [None]:
# Discretize "Age" into four labelled bands.
# Bins are right-inclusive: (0, 18], (18, 40], (40, 60], (60, 80].

df["AgeGroup"] = pd.cut(
    df["Age"],
    bins=[0, 18, 40, 60, 80],
    labels=["child", "adult", "middle age", "elder"],
    right=True,
)
df["AgeGroup"].head(10)

### Explore groupings

In [None]:
# Mean survival rate for every AgeGroup/Sex combination.
# "AgeGroup" is categorical, so `observed=False` is passed explicitly: it keeps
# all category combinations in the result and does not depend on the pandas
# default for `observed`, which is deprecated and scheduled to change.
df[["AgeGroup", "Sex", "Survived"]].groupby(["AgeGroup", "Sex"], observed=False).mean().sort_values(by="AgeGroup", ascending=True)

### Converting "AgeGroup" to ordinal numbers

In [None]:
# Relabel the age bands with ordinal numbers (child=0 ... elder=3).
# `.cat.rename_categories` is the supported way to rename the categories of a
# categorical column; `Series.replace` for this purpose is deprecated. Labels
# missing from the mapping pass through unchanged, so re-running the cell is a
# no-op.
df["AgeGroup"] = df["AgeGroup"].cat.rename_categories({'child':0, 'adult':1, "middle age":2, "elder":3})
df.head(10)

## "FamilySize"
### Perhaps create a **"FamilySize"** feature, combining "SibSp" and "Parch"

In [None]:
# "FamilySize" = siblings/spouses + parents/children + the passenger themself

df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)
df.head()

### Explore groupings

In [None]:
# Share of survivors for each family size, best rate first.
df.groupby('FamilySize')[['Survived']].mean().sort_values('Survived', ascending=False)

## "IsAlone"
### Perhaps create an "IsAlone" feature

In [None]:
# Flag passengers travelling by themselves: 1 when FamilySize is exactly 1, else 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype('int64')
df.head()

### Explore groupings

In [None]:
# Share of survivors among passengers travelling alone versus with family.
df.groupby('IsAlone')[['Survived']].mean()

# Final Feature Selection
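Of our original 11 features, 8 are dropped in total: `PassengerId`, `Name`, `Ticket`, `Fare`, and `Cabin` earlier, plus `Age`, `SibSp`, and `Parch` below (together with the engineered `IsAlone`). We keep `Pclass`, `Sex`, the one-hot encoded `Embarked` columns, and the engineered `AgeGroup` and `FamilySize`.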

In [None]:
# Drop the raw columns now represented by engineered features, plus "IsAlone".
df = df.drop(columns=['Age', 'SibSp', 'Parch', 'IsAlone'])
df.head()

---

# Machine Learning ...