## 1. Import libraries

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import sklearn
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

## 2. Data collection

In [97]:
# define url
url = 'https://en.wikipedia.org/wiki/Passengers_of_the_Titanic'
response = requests.get(url)

# define the attributes for data
Name = []
Age = []
Hometown = []
Boarded = []
Destination = []
Lifeboat = []
Class = []
Survive = []

# check if the url is ok or not
if (response.status_code != 200):
    print('Error')
else:
    url_data = response.text
    soup = BeautifulSoup(url_data, 'html.parser')

    # find tags data
    passenger_tags = soup.find_all('table', {'class':'wikitable sortable'})
    clss = 0
    span_cnt = np.full(6, 0, dtype = int)
    # each passenger_tag is passengers with each class
    for passenger_tag in passenger_tags:
        tr_tags = passenger_tag.find_all('tr')
        
        clss += 1
        check = np.full(6, True, dtype = bool)
        cnt = 0
        for tr_tag in tr_tags:
            # each tr_tag is a passenger data
            td_tags = tr_tag.find_all('td')
            
            if not td_tags:
                continue

            if tr_tag.get('style'):
                Survive.append(1)
            else:
                Survive.append(0)

            # each td_tag is a passenger attributes
            i = 0
            cnt += 1
            for td_tag in td_tags:
                
                row = int(td_tag.get('rowspan')) if td_tag.get('rowspan') else 1

                while i < 6 and check[i] == False:
                    span_cnt[i] -= 1
                    if span_cnt[i] == 0:
                        check[i] = True
                    i += 1

                
                if i >= 6:
                    break
                
                if check[i] == True:
                    span_cnt[i] += row-1
                    if span_cnt[i] > 0:
                        check[i] = False
                    
                    if i == 0:
                        Class.append(clss)
                        while row > 0:
                            Name.append(td_tag.text) if td_tag else Name.append(np.nan)
                            row -= 1
                    elif i == 1:
                        while row > 0:
                            Age.append(td_tag.text) if td_tag else Age.append(np.nan)
                            row -= 1
                    elif i == 2:
                        while row > 0:
                            Hometown.append(td_tag.text) if td_tag else Hometown.append(np.nan)
                            row -= 1
                    elif i == 3:
                        while row > 0:
                            Boarded.append(td_tag.text) if td_tag else Boarded.append(np.nan)
                            row -= 1
                    elif i == 4:
                        while row > 0:
                            Destination.append(td_tag.text) if td_tag else Destination.append(np.nan)
                            row -= 1      
                    elif i == 5:
                        while row > 0:
                            Lifeboat.append(td_tag.text) if td_tag else Lifeboat.append(np.nan)
                            row -= 1
                    i+=1
            
            while i < 6 and check[i] == False:
                    span_cnt[i] -= 1
                    if span_cnt[i] == 0:
                        check[i] = True
                    i += 1

raw_df = pd.DataFrame({'Name':Name,
                       'Age': Age,
                       'Hometown': Hometown,
                       'Boarded': Boarded,
                       'Destination': Destination,
                       'Lifeboat': Lifeboat,
                       'Class': Class,
                       'Survive': Survive})

# export raw data as raw_data.csv
raw_df.to_csv('./data/raw_data.csv', index = False)

## 3. Data cleaning

In [60]:
# read data
df = pd.read_csv('./data/raw_data.csv')

# make a deep copy of df_val
df_cp = pickle.loads(pickle.dumps(df))
df_cp.head()

Unnamed: 0,Name,Age,Hometown,Boarded,Destination,Lifeboat,Class,Survive
0,"Allen, Miss Elizabeth Walton",29,"St Louis, Missouri, US",Southampton,St Louis,\n,1,1
1,"Allison, Mr. Hudson Creighton",30,"Montreal, Quebec, Canada",Southampton,"Montreal, Quebec, Canada",,1,0
2,"and chauffeur, Mr. George Swane[71]",19,,Southampton,"Montreal, Quebec, Canada",,1,0
3,"and cook, Miss Amelia Mary ""Mildred"" Brown[71]",18,"London, England, UK",Southampton,"Montreal, Quebec, Canada",11,1,1
4,"Allison, Mrs. Bessie Waldo (née Daniels)",25,"Montreal, Quebec, Canada",Southampton,"Montreal, Quebec, Canada",,1,0


Let's get to know our data:

* **Name**: the name of the passenger.
* **Age**: the age of the passenger.
* **Hometown**: the hometown of the passenger.
* **Boarded**: the place where passenger get on the ship.
* **Destination**: the place where passenger want to go.
* **Lifeboat**: the lifeboat that the passenger get on.
* **Class**: the class of the passenger.
* **Survive**: whether they survive or not.

In [61]:
df_cp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         1312 non-null   object
 1   Age          1312 non-null   object
 2   Hometown     1311 non-null   object
 3   Boarded      1309 non-null   object
 4   Destination  1312 non-null   object
 5   Lifeboat     1034 non-null   object
 6   Class        1312 non-null   int64 
 7   Survive      1312 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 82.1+ KB


As we can see, there are missing values in **'Hometown'**, **'Boarded'**, and **'Lifeboat'** columns. And **'Age'** must be integers typ instead of object. So we might change it into integers.

In [62]:
df_cp['Age'] = pd.to_numeric(df_cp['Age'], errors = 'coerce').astype('Int64')
df_cp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         1312 non-null   object
 1   Age          1300 non-null   Int64 
 2   Hometown     1311 non-null   object
 3   Boarded      1309 non-null   object
 4   Destination  1312 non-null   object
 5   Lifeboat     1034 non-null   object
 6   Class        1312 non-null   int64 
 7   Survive      1312 non-null   int64 
dtypes: Int64(1), int64(2), object(5)
memory usage: 83.4+ KB


After converting to type integers, there are 12 missing values because these values cannot be converted into integers properly. There are only *12/1312 = 0.91%* in the data, so we can eliminate those **NULL values** by dropping rows contain **NULL values**.

In [63]:
df_cp.dropna(subset = ['Age'], inplace = True)
df_cp.isnull().sum()

Name             0
Age              0
Hometown         1
Boarded          3
Destination      0
Lifeboat       278
Class            0
Survive          0
dtype: int64

**'Lifeboat'** have 278 missing values, but for lifeboat we might only consider that the passenger is on a lifeboat or not. The number or letter that signed on the lifeboat is not so important. So the missing values mean that the passengers do not have lifeboat. We will convert the values of **'Lifeboat'** columns to **0** and **1**.
* **1**: had lifeboat.
* **0**: did not have lifeboat.

In [68]:
df_cp.loc[df_cp['Lifeboat'].notna(), 'Lifeboat'] = 1
df_cp['Lifeboat'] = df_cp['Lifeboat'].fillna(0)
df_cp['Lifeboat'] = df_cp['Lifeboat'].astype('Int64')
df_cp.isnull().sum()

Name           0
Age            0
Hometown       1
Boarded        3
Destination    0
Lifeboat       0
Class          0
Survive        0
dtype: int64

Just like the previous column, there are only 4 missing values so we could eliminate them easily by dropping these rows.

In [72]:
df_cp.dropna(subset = ['Hometown', 'Boarded'], inplace = True)
df_cp.isnull().sum()

Name           0
Age            0
Hometown       0
Boarded        0
Destination    0
Lifeboat       0
Class          0
Survive        0
dtype: int64

In [77]:
df_cp.describe()

Unnamed: 0,Age,Lifeboat,Class,Survive
count,1296.0,1296.0,1296.0,1296.0
mean,29.64429,0.787809,2.290123,0.378858
std,13.538306,0.409017,0.840534,0.48529
min,1.0,0.0,1.0,0.0
25%,21.0,1.0,1.75,0.0
50%,28.0,1.0,3.0,0.0
75%,37.25,1.0,3.0,1.0
max,74.0,1.0,3.0,1.0


We check all the numeric value columns so that if they have outliers. But in this situation, there might not be any outliers in these columns.

So let's take a look on these columns to check if we could do some feature engineerings.

At the **'Name'** column, we can see that the title **'Miss'**, **'Mr'**, **'Master'**, ... may be at **'Master'** level, they might have a higher a chance of surviving. So we will create a new columns called **'Title'** to contain these values.