# Mini project: Exploratory Data Analysis (EDA)

In [None]:
# Exploratory Data Analysis of the Titanic dataset

# We will use the dataprep python package that facilitates EDA.

In [None]:
# Dataset description

# Variables:
# Survived (class): whether survived (0,1)
# pclass: ticket class (A proxy for socio-economic status - 1st = Upper, 2nd = Middle, 3rd = Lower)
# sex
# age
# sibsp: number of siblings/spouses aboard
# parch: number of parents/children aboard
# ticket: the ticket number
# fare: the passenger fare
# cabin: cabin number
# embarked: port of embarkation
# passengerid: the ID of the record
# name

In [None]:
# Load libraries
# Note that we do not load specific libs that we used to load in other examples:
# pandas and plotting libs are hidden in dataprep.eda module

from dataprep.eda import *
from dataprep.datasets import load_dataset
from dataprep.eda import plot, plot_correlation, plot_missing, plot_diff, create_report

In [None]:
# Load the dataset

# This is a well-known dataset that is provided directly by the library.
# We fetch it by name via the load_dataset() function,
# which returns the data as a dataframe.

df = load_dataset("titanic")

In [None]:
# The dataset is now loaded in a pandas dataframe.
# Take a quick look at the first rows only (head) instead of rendering the
# whole frame, which keeps the cell output short.
df.head()

In [None]:
# Overview of the data: plot() is called on the whole dataframe,
# without selecting any particular column
plot(df)

In [None]:
# Initial findings:
# Class distribution not balanced
# No duplicates
# The numeric attributes are: age, sibsp, parch, fare

In [None]:
# Check for missing values across the whole dataframe

plot_missing(df)

In [None]:
# There are 3 attributes with missing values: age, cabin, embarked

In [None]:
# Age is one of the attributes with missing values: plot it against Survived
age_vs_survived = plot(df, "Age", "Survived")
age_vs_survived.show()

In [None]:
# It seems that younger people are more likely to survive.

In [None]:
# Let's try a different plot: plot_missing() restricted to the Age and Survived columns
# NOTE(review): presumably this shows how the missing Age values affect Survived —
# confirm against the dataprep plot_missing documentation
plot_missing(df, "Age", "Survived")

In [None]:
# Repeat the comparison against Survived for the remaining attributes
# that have missing values (Cabin, Embarked)
for column in ("Cabin", "Embarked"):
    comparison = plot(df, column, "Survived")
    comparison.show()


In [None]:
# Comments:
# The effect of Cabin on Survived is not clear
# Embarked seems correlated with Survived: passengers with Embarked = C are more likely to have survived, while those with Embarked = S are more likely not to have survived

# So, what could we do with the above 3 features?
# -> keep Age and Embarked (find a strategy to fill missing values)
# -> remove Cabin

In [None]:
# Next, the features that intuitively should affect the Survived class:
# Pclass, Sex and Fare
for column in ("Pclass", "Sex", "Fare"):
    comparison = plot(df, column, "Survived")
    comparison.show()

In [None]:
# Comments:
# Upper-class passengers, females, and passengers who paid a high Fare are more likely to have survived.
# We should include these three features in our model if we want to predict the Survived class.

In [None]:
# Let's get a correlation overview of the whole dataframe
plot_correlation(df)

In [None]:
# Comments:
# The most correlated attributes are Parch and SibSp.
# No two attributes are highly correlated, so we don't need to worry about correlated features.

# Parch and SibSp are slightly correlated, both numerically and semantically.
# -> we could construct a new feature named Relatives, based on Parch and SibSp,
# that counts the total number of family members aboard for each passenger ;-)


In [None]:
# Another observation is that Survived is (slightly) positively correlated with Fare.
# Let's see the correlation of one attribute (Fare) with the others
plot_correlation(df, "Fare")

In [None]:
# Let's see the relationship between two attributes (Pclass and Fare)
# at the level of individual instances

plot(df, "Pclass", "Fare")

In [None]:
# Let's explore some categorical attributes, starting with Sex
plot(df, "Sex")

In [None]:
# Same single-column exploration, this time for the Name column
plot(df, "Name")

In [None]:
# Create a report with the overall overview of the dataframe
create_report(df)