In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from falsb4mpa.dataset.utils import bucket

In [2]:
column_names = [
    'Survived',
    'Pclass', # ticket class (1, 2, 3)
    'Name',
    'Sex',
    'Age', # Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
    'SibSp', # # of siblings / spouses aboard the Titanic
    'Parch', # # of parents / children aboard the Titanic
    'Ticket', # Ticket number
    'Fare', # Passenger fare
    'Cabin', # Cabin number
    'Embarked' # Port of Embarkation
]

In [3]:
used_columns = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]
target = 'Survived'
sensitive = 'Sex'

# Reading data

In [5]:
train_data = pd.read_csv("../../data/raw/titanic/titanic_train.csv", index_col='PassengerId')
test_data = pd.read_csv("../../data/raw/titanic/titanic_train.csv", index_col='PassengerId')

In [6]:
data = pd.concat([train_data, test_data])
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
data.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [8]:
print(len(data.index))

1782


### Remove unused columns

In [9]:
data = pd.DataFrame(data, columns=used_columns)

# Removing missing values

In [10]:
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         354
SibSp         0
Parch         0
Fare          0
Embarked      4
dtype: int64

In [11]:
data.dropna(subset=['Survived'], inplace=True) 
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         354
SibSp         0
Parch         0
Fare          0
Embarked      4
dtype: int64

In [12]:
print(len(data.index))

1782


In [13]:
data.dropna(subset=['Embarked'], inplace=True) 
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         354
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [14]:
print(len(data.index))

1778


In [15]:
data['Age'].fillna(-1, inplace=True) # indicate we dont have this data, but dont drop this data points
data.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(-1, inplace=True) # indicate we dont have this data, but dont drop this data points


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [16]:
print(len(data.index))

1778


# Normalizing continuous data

In [17]:
continous_attr = ['Fare']

# Age

In [18]:
data['Age'].describe()

count    1778.000000
mean       23.541249
std        17.829469
min        -1.000000
25%         6.000000
50%        24.000000
75%        35.000000
max        80.000000
Name: Age, dtype: float64

In [19]:
data['Age'].apply(np.floor).describe()

count    1778.000000
mean       23.525309
std        17.830493
min        -1.000000
25%         6.000000
50%        24.000000
75%        35.000000
max        80.000000
Name: Age, dtype: float64

In [20]:
data['Age'] = data['Age'].apply(np.floor)

In [21]:
data['Age'] = data['Age'].astype(int)

In [22]:
data[data['Age'] == 0]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
79,1,2,male,0,0,2,29.0,S
306,1,1,male,0,1,2,151.55,S
470,1,3,female,0,2,1,19.2583,C
645,1,3,female,0,2,1,19.2583,C
756,1,2,male,0,1,1,14.5,S
804,1,3,male,0,0,1,8.5167,C
832,1,2,male,0,1,1,18.75,S
79,1,2,male,0,0,2,29.0,S
306,1,1,male,0,1,2,151.55,S
470,1,3,female,0,2,1,19.2583,C


#### Bucketing age

In [23]:
data['Age'].head()

PassengerId
1    22
2    38
3    26
4    35
5    35
Name: Age, dtype: int64

In [24]:
buckets = [-1, 2, 12, 18, 25 ,35, 45, 55, 65, 75, 80]

labels = ['Age_' + str(bucket) for bucket in buckets]
dict_labels = {i:labels[i] for i in range(len(labels))}
dict_labels

{0: 'Age_-1',
 1: 'Age_2',
 2: 'Age_12',
 3: 'Age_18',
 4: 'Age_25',
 5: 'Age_35',
 6: 'Age_45',
 7: 'Age_55',
 8: 'Age_65',
 9: 'Age_75',
 10: 'Age_80'}

In [25]:
bucket_train = data['Age'].apply(bucket, buckets=buckets).to_frame()

In [26]:
bucket_train = pd.get_dummies(bucket_train['Age'])
bucket_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False
5,False,False,False,False,False,True,False,False,False,False,False


In [27]:
bucket_train = bucket_train.rename(columns=dict_labels)
bucket_train.head()

Unnamed: 0_level_0,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False
5,False,False,False,False,False,True,False,False,False,False,False


In [28]:
data = pd.concat([data, bucket_train], axis=1)
data.drop(['Age'], axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,male,1,0,7.25,S,False,False,False,False,True,False,False,False,False,False,False
2,1,1,female,1,0,71.2833,C,False,False,False,False,False,False,True,False,False,False,False
3,1,3,female,0,0,7.925,S,False,False,False,False,False,True,False,False,False,False,False
4,1,1,female,1,0,53.1,S,False,False,False,False,False,True,False,False,False,False,False
5,0,3,male,0,0,8.05,S,False,False,False,False,False,True,False,False,False,False,False


# Binarizing sex and survived

In [29]:
data['Survived'] = data['Survived'].astype(int)

In [30]:
data['Sex'] = pd.get_dummies(data['Sex'])['female']

In [31]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,False,1,0,7.25,S,False,False,False,False,True,False,False,False,False,False,False
2,1,1,True,1,0,71.2833,C,False,False,False,False,False,False,True,False,False,False,False
3,1,3,True,0,0,7.925,S,False,False,False,False,False,True,False,False,False,False,False
4,1,1,True,1,0,53.1,S,False,False,False,False,False,True,False,False,False,False,False
5,0,3,False,0,0,8.05,S,False,False,False,False,False,True,False,False,False,False,False


# One hot encoding categorical data

In [32]:
categorical_attr = ['Embarked']

In [33]:
for attr in categorical_attr:
    column_idx = data.columns.get_loc(attr)
    data = pd.concat([data, pd.get_dummies(data[attr], prefix=attr)], axis=1)

In [34]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age_-1,Age_2,Age_12,...,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,False,1,0,7.25,S,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,1,1,True,1,0,71.2833,C,False,False,False,...,False,False,True,False,False,False,False,True,False,False
3,1,3,True,0,0,7.925,S,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,1,1,True,1,0,53.1,S,False,False,False,...,False,True,False,False,False,False,False,False,False,True
5,0,3,False,0,0,8.05,S,False,False,False,...,False,True,False,False,False,False,False,False,False,True


In [35]:
data.drop(categorical_attr, axis=1, inplace=True)

In [36]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,False,1,0,7.25,False,False,False,False,True,False,False,False,False,False,False,False,False,True
2,1,1,True,1,0,71.2833,False,False,False,False,False,False,True,False,False,False,False,True,False,False
3,1,3,True,0,0,7.925,False,False,False,False,False,True,False,False,False,False,False,False,False,True
4,1,1,True,1,0,53.1,False,False,False,False,False,True,False,False,False,False,False,False,False,True
5,0,3,False,0,0,8.05,False,False,False,False,False,True,False,False,False,False,False,False,False,True


# Reordering the columns

In [37]:
columns_order = [
    'Pclass',
    'Fare',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'Age_-1',
    'Age_2',
    'Age_12',
    'Age_18',
    'Age_25',
    'Age_35',
    'Age_45',
    'Age_55',
    'Age_65',
    'Age_75',
    'Age_80',
    'SibSp',
    'Parch',
    'Sex',
    'Survived'
]

In [38]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,False,1,0,7.25,False,False,False,False,True,False,False,False,False,False,False,False,False,True
2,1,1,True,1,0,71.2833,False,False,False,False,False,False,True,False,False,False,False,True,False,False
3,1,3,True,0,0,7.925,False,False,False,False,False,True,False,False,False,False,False,False,False,True
4,1,1,True,1,0,53.1,False,False,False,False,False,True,False,False,False,False,False,False,False,True
5,0,3,False,0,0,8.05,False,False,False,False,False,True,False,False,False,False,False,False,False,True


In [39]:
data = data[columns_order]
data.head()

Unnamed: 0_level_0,Pclass,Fare,Embarked_C,Embarked_Q,Embarked_S,Age_-1,Age_2,Age_12,Age_18,Age_25,Age_35,Age_45,Age_55,Age_65,Age_75,Age_80,SibSp,Parch,Sex,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,3,7.25,False,False,True,False,False,False,False,True,False,False,False,False,False,False,1,0,False,0
2,1,71.2833,True,False,False,False,False,False,False,False,False,True,False,False,False,False,1,0,True,1
3,3,7.925,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,True,1
4,1,53.1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,1,0,True,1
5,3,8.05,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,False,0


# Saving data

In [40]:
print(len(data.index))

1778


In [43]:
data.to_csv('../../data/processed/titanic/titanic.csv')