In [1]:
import os
import sys

import yaml
import pandas as pd
from ydata_profiling import ProfileReport


# Initial configurations
config_filepath = 'config.yml'

# The notebook is expected to start one level below the project root. Only
# step up when the config file is not already visible, so that re-running this
# cell does not climb one more directory each time.
if not os.path.isfile(config_filepath):
    os.chdir('..')

# Expose the project root to the import system (once) so that `src` resolves.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)
from src.data_understanding import describe_numerical, describe_categorical

# Explicit encoding so the config parses the same way on every platform.
with open(config_filepath, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Data Understanding

## 1 Data Loading

In [2]:
# Read the raw dataset from the location declared in the project config.
data_filepath = config['filepaths']['data']
data = pd.read_csv(data_filepath)

In [3]:
# Number of rows in the loaded dataset.
data.shape[0]

891

## 2 Data Description

In [4]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Column dtypes as inferred by read_csv, before any type fixes.
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

| **Column** | **Description** |
| ---------- | --------------- |
| `PassengerId` | ID of the passenger |
| `Survived` | Whether the Titanic passenger survived (0 = no, 1 = yes) |
| `Pclass` | Ticket class |
| `Name` | Name of the passenger |
| `Sex` | Sex |
| `Age` | Age of the passenger in years |
| `SibSp` | Number of siblings or spouses aboard the Titanic |
| `Parch` | Number of parents or children aboard the Titanic |
| `Ticket` | Ticket number of the passenger |
| `Fare` | Passenger fare |
| `Cabin` | Cabin number |
| `Embarked` | Port of Embarkation |

In [6]:
# Column groups passed to the describe_* helpers in the exploration section.
numerical_cols = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
# `Survived` and `Pclass` are stored as integers but are treated as categories.
categorical_cols = [
    'Survived',
    'Pclass',
    'Sex',
    'Embarked',
]

## 3 Data Consistency

In [7]:
# Round ages to whole years so the column can later be cast to an integer dtype.
# NOTE(review): Series.round follows NumPy's round-half-to-even rule, so ages
# recorded as x.5 may round down (e.g. 28.5 -> 28) — confirm this is intended.
data.loc[:, 'Age'] = data['Age'].round()

In [8]:
# Target dtypes per column. The nullable Int64 dtype is used for `Age` because
# the column has missing values, which a plain NumPy int64 cannot hold.
data_types = dict(Age=pd.Int64Dtype())

In [9]:
# Apply the dtype mapping; astype returns a new frame, so the copy flag
# (whose default already is to copy) does not need to be spelled out.
data = data.astype(data_types)

In [10]:
# Preview again to check that `Age` now shows as whole numbers.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [11]:
# Re-check dtypes after the cast; `Age` is expected to be the nullable Int64.
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age              Int64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 4 Data Exploration

In [12]:
# Summarise the numerical columns with the project helper.
# NOTE(review): the helper lives in src/data_understanding and is not visible
# here; judging by the output it reports count, % missing and distribution stats.
numerical_eda = describe_numerical(data, numerical_cols)

In [13]:
# Display the numerical summary table.
numerical_eda

Unnamed: 0,count,% missing,min,25%,mean,50%,75%,max,std
Age,714.0,19.86532,0.0,20.0,29.693277,28.0,38.0,80.0,14.524527
SibSp,891.0,0.0,0.0,0.0,0.523008,0.0,1.0,8.0,1.102743
Parch,891.0,0.0,0.0,0.0,0.381594,0.0,0.0,6.0,0.806057
Fare,891.0,0.0,0.0,7.9104,32.204208,14.4542,31.0,512.3292,49.693429


In [14]:
# Summarise the categorical columns with the project helper (defined in
# src/data_understanding, not visible from this notebook).
categorical_eda = describe_categorical(data, categorical_cols)

In [15]:
# Display the categorical summary table.
# NOTE(review): in the rendered output `2nd mode %` is identical to `mode %`
# on every row (e.g. Survived: 342/891 is ~38.4%, yet 61.6% is shown). This
# looks like a bug in describe_categorical — verify the helper before relying
# on that column.
categorical_eda

Unnamed: 0,count,% missing,nunique,mode,mode freq,mode %,2nd mode,2nd mode freq,2nd mode %
Survived,891,0.0,2,0.0,549,61.616162,1.0,342,61.616162
Pclass,891,0.0,3,3.0,491,55.106622,1.0,216,55.106622
Sex,891,0.0,2,male,577,64.758698,female,314,64.758698
Embarked,889,0.224467,3,S,644,72.278339,C,168,72.278339


In [16]:
# Build the ydata-profiling report for the type-corrected dataset.
profile = ProfileReport(data, title='Data Exploration')

In [17]:
# Render the profiling report inline as the cell's output.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|          | 0/12 [00:00<?, ?it/s]
100%|██████████| 12/12 [00:00<00:00, 59.50it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## 5 Data Quality Assessment

1. Are there any data sources or features we anticipate will be challenging to process? If so, what considerations or precautions should we keep in mind?
    - 👩‍💻 How do you resolve missing values for non-negotiable or important features? Are there existing business definitions that you could use to impute these missing values? Or should you resort to using imputation algorithms?
    - 👩‍💻 How do you resolve features that we suspect contain anomalies or outliers? How should these be addressed -- by filtering them out, applying minimum/maximum caps, or using other methods?
2. What specific data science techniques or methodologies do you envision using for this project? (e.g., statistical analysis, **predictive modeling**, optimization)
   - 👩‍💻 What are the model performance metrics you should focus on? How does this translate to a business scorecard metric or a business priority?