# Importing dependencies

In [2]:
%matplotlib inline

# Statistics
from scipy.stats import zscore

# ML
from sklearn.preprocessing import MinMaxScaler, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, SimpleImputer
from sklearn.cluster import DBSCAN

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use("seaborn-v0_8-whitegrid")


ImportError: cannot import name 'SimpleImputer' from 'sklearn.preprocessing' (/Users/harrisonhenri/Documents/projetos/ml/ml-notes/python/venv/lib/python3.9/site-packages/sklearn/preprocessing/__init__.py)

# Loading data

In [None]:
data = pd.read_csv('titanic_dataset.csv')
data.head(5)

In [None]:
# Missing values
missingno.matrix(data, figsize = (30,10))

In [None]:
# Data types
data.dtypes

# Feature Cleaning, Engineering, and Imputation

## Feature: Survided
Description: Whether the passenger survived or not.

Key: 0 = did not survive, 1 = survived

In [None]:
# How many people survived?
fig = plt.figure(figsize=(20,1))
sns.countplot(y='Survived', data=data);

## Feature: Pclass
Description: The ticket class of the passenger.

Key: 1 = 1st, 2 = 2nd, 3 = 3rd

In [None]:
sns.distplot(data.Pclass)

In [None]:
# Are there any missing values in the Pclass column?
data.Pclass.isnull().sum()

## Feature: Name
Description: The name of the passenger.

To keep this EDA fast, we won't move forward using the name variable.

## Feature: Sex
    
Description: The sex of the passenger (male or female).

In [None]:
# Let's view the distribution of Sex
plt.figure(figsize=(20, 5))
sns.countplot(y="Sex", data=data);

In [None]:
# Are there any missing values in the Pclass column?
data.Sex.isnull().sum()

In [None]:
# How does the Sex variable look compared to Survival?
# We can see this because they're both binarys.
fig = plt.figure(figsize=(10, 10))
sex_df = data[['Survived', 'Sex']]
sex_df['Sex'] = np.where(sex_df['Sex'] == 'female', 1, 0)

sns.distplot(sex_df.loc[sex_df['Survived'] == 1]['Sex'], kde_kws={'label': 'Survived'})
sns.distplot(sex_df.loc[sex_df['Survived'] == 0]['Sex'], kde_kws={'label': 'Did not survive'})
plt.legend()

## Feature: Age
Description: The age of the passenger.

In [None]:
# How many missing values does age have?
data.Age.isnull().sum()

## Feature: SibSp
Description: The number of siblings/spouses the passenger has aboard the Titanic.

In [None]:
# How many missing values does SibSp have?
data.SibSp.isnull().sum()

In [None]:
# How does the SibSp variable look compared to Survival?
# We can see this because they're both binarys.
fig = plt.figure(figsize=(10, 10))
sibsp_df = data[['Survived', 'SibSp']]

sns.distplot(sibsp_df.loc[sex_df['Survived'] == 1]['SibSp'], kde_kws={'label': 'Survived'})
sns.distplot(sibsp_df.loc[sex_df['Survived'] == 0]['SibSp'], kde_kws={'label': 'Did not survive'})
plt.legend()

## Feature: Parch
Description: The number of parents/children the passenger has aboard the Titanic.

In [None]:
# How many missing values does SibSp have?
data.Parch.isnull().sum()

In [None]:
# How does the Parch variable look compared to Survival?
# We can see this because they're both binarys.
fig = plt.figure(figsize=(10, 10))
parch_df = data[['Survived', 'Parch']]

sns.distplot(parch_df.loc[parch_df['Survived'] == 1]['Parch'], kde_kws={'label': 'Survived'})
sns.distplot(parch_df.loc[parch_df['Survived'] == 0]['Parch'], kde_kws={'label': 'Did not survive'})
plt.legend()

## Feature: Ticket
Description: The ticket number of the boarding passenger.

In [None]:
# How many missing values does Ticket have?
data.Ticket.isnull().sum()

In [None]:
# How many unique kinds of Ticket are there?
print("There are {} unique Ticket values.".format(len(data.Ticket.unique())))

681 unique values is too many for now. So we won't use Ticket in our subset dataframes.

## Feature: Fare
Description: How much the ticket cost.

In [None]:
# How many different values of Fare are there?
sns.countplot(y="Fare", data=data);

## Feature: Cabin
Description: The cabin number where the passenger was staying.

In [None]:
# How many missing values does Cabin have?
data.Cabin.isnull().sum()

Since there are too many missing values, we won't use Cabin.

## Feature: Embarked
Description: The port where the passenger boarded the Titanic.

Key: C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
# What do the counts look like?
sns.countplot(y='Embarked', data=data);

## Heatmap or pairplot

In [None]:
age_fare = data[["Age", "Fare"]]

In [None]:
sns.heatmap(data=age_fare.corr()) 

In [None]:
sns.pairplot(age_fare)

## Outline detection

### Z-score

In [None]:
age = data["Age"].dropna()
age_zscore = zscore(age)

# Confidence level: 99.4% 
is_outlier = age_zscore.apply(
  lambda x: x <= -2.5 or x >= 2.5
)

age[is_outlier]

### DBSCAN

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
age_fare_scaled = scaler.fit_transform(age_fare.dropna())

outlier_detection = DBSCAN(
  eps = 0.5,
  metric="euclidean",
  n_jobs = -1)

clusters = outlier_detection.fit_predict(age_fare_scaled)

age_fare_cleaned = age_fare.dropna()[clusters!=-1]

age_fare_cleaned

In [None]:
## Handling missing data

In [None]:
imp = SimpleImputer(strategy='mean')
age_fare_cleaned = imp.fit_transform(age_fare)
age_fare_cleaned