# Getting started. Load data and give it a preliminary cleanup

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the titanic3 CSV into a DataFrame; the relative path is resolved against the current working directory.
titanic_csv_path = './data/titanic3.csv'
data = pd.read_csv(titanic_csv_path)

In [None]:
# Show the dtype pandas inferred for each column right after loading.
data.dtypes

In [None]:
# The CSV has "?" placeholders in columns that should be numeric.
# Swap every "?" for NaN first, so that those columns can then be cast to a numeric dtype.
data.replace(to_replace='?', value=np.nan, inplace=True)
float_columns = dict.fromkeys(("age", "fare"), np.float64)
data = data.astype(float_columns)

In [None]:
# Re-check the column dtypes after the cleanup; "age" and "fare" were just cast to float64.
data.dtypes

## Quick look at the data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One row of five panels, every one split by sex:
# violins for age and fare against the survival outcome, and point estimates
# of "survived" against sibsp, parch and pclass.
fig, axs = plt.subplots(ncols=5, figsize=(30,5))
ax_age, ax_sibsp, ax_parch, ax_pclass, ax_fare = axs

# Keyword arguments shared by all five plots.
by_sex = {"hue": "sex", "data": data}

sns.violinplot(x="survived", y="age", ax=ax_age, **by_sex)
sns.pointplot(x="sibsp", y="survived", ax=ax_sibsp, **by_sex)
sns.pointplot(x="parch", y="survived", ax=ax_parch, **by_sex)
sns.pointplot(x="pclass", y="survived", ax=ax_pclass, **by_sex)
sns.violinplot(x="survived", y="fare", ax=ax_fare, **by_sex)

## Use Pandas to calculate correlations between variables
### First, convert the string columns to numeric values so that pandas can calculate the correlations

In [None]:
# Encode sex as 1 (male) / 0 (female) so that it is a numeric column for the correlations.
# Map the "sex" column directly instead of calling DataFrame.replace on the whole frame:
# replace() relies on implicitly downcasting the object column to integers, which
# pandas 2.2 deprecates; without that downcast the column would stay object dtype
# and be left out of numeric correlations.
# Any value other than 'male'/'female' becomes NaN here rather than passing through unnoticed.
data['sex'] = data['sex'].map({'male': 1, 'female': 0})

### Now let's check it out!

In [None]:
# What do your raw attributes say about your chances?
# Restrict the frame to its numeric columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns on its own and raises
# if a text column is still present. select_dtypes works on old and new pandas alike.
# The result is the absolute correlation of every numeric column with "survived".
data.select_dtypes(include='number').corr().abs()[["survived"]]


In [None]:
# Use existing attributes to create new ones: let's see if having a relative aboard impacts survival.

# "relatives" is 1 when sibsp + parch > 0 and 0 otherwise. It is computed on whole
# columns at once rather than with a row-by-row apply, which gives the same values.
data['relatives'] = ((data['sibsp'] + data['parch']) > 0).astype(int)

# Absolute correlation of every numeric column (now including "relatives") with "survived".
# Numeric columns are selected explicitly because DataFrame.corr() raises on text
# columns since pandas 2.0.
data.select_dtypes(include='number').corr().abs()[["survived"]]

In [None]:
# data.to_csv('titanic3_clean.csv')