In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from data_prep_func import bucket

In [2]:
# Columns of the combined raw frame, in the order shown by the previews below.
# 'PassengerId' is not listed because the loading cell reads it as the index.
column_names = [
    'Survived',
    'Pclass', # Ticket class (1, 2, 3)
    'Name',
    'Sex',
    'Age', # Age is fractional if less than 1. If the age is estimated, it is in the form xx.5
    'SibSp', # Number of siblings / spouses aboard the Titanic
    'Parch', # Number of parents / children aboard the Titanic
    'Ticket', # Ticket number
    'Fare', # Passenger fare
    'Cabin', # Cabin number
    'Embarked' # Port of embarkation
]

In [3]:
# Roles of the two special attributes: the label column and the sensitive attribute.
target = 'Survived'
sensitive = 'Sex'

# Columns kept for the prepared dataset; Name, Ticket and Cabin are left out.
used_columns = [target, 'Pclass', sensitive, 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Reading data

In [4]:
# Read the raw train/test CSVs with PassengerId as the index.
# The paths are relative (matching the '../data/titanic/...' path used when saving the
# result) instead of an absolute path that only exists on one machine.
# NOTE(review): assumes the notebook is run from a directory that is a sibling of
# 'data/' — confirm against the repository layout.
DATA_DIR = '../data/titanic'
train_data = pd.read_csv(DATA_DIR + '/titanic_train.csv', index_col='PassengerId')
test_data = pd.read_csv(DATA_DIR + '/titanic_test.csv', index_col='PassengerId')

In [5]:
# Stack the train and test frames into one frame. The rows coming from the test file
# show up with a missing 'Survived' value in the previews and null counts below.
data = pd.concat([train_data, test_data])
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preview the last rows of the combined frame (these have no 'Survived' value).
data.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [7]:
# Number of rows in the combined train + test frame.
print(data.shape[0])

1309


### Remove unused columns

In [8]:
# Keep only the columns listed in used_columns, in that order.
# Label-based selection raises KeyError when a listed column is missing, whereas
# pd.DataFrame(data, columns=...) would silently add an all-NaN column that the
# dropna cells below would then use to discard every row.
# .copy() makes the result an independent frame so later column assignments are safe.
data = data.loc[:, used_columns].copy()

# Removing missing values

In [9]:
# Count the missing values in each remaining column.
data.isna().sum()

Survived    418
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

In [10]:
# Discard the rows that have no label, then recount the missing values per column.
data.dropna(subset=[target], inplace=True)
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [11]:
# Number of rows left after removing the unlabeled rows.
print(data.shape[0])

891


In [12]:
# Discard the rows with an unknown port of embarkation, then recount the
# missing values per column.
data.dropna(axis=0, subset=['Embarked'], inplace=True)
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [13]:
# Number of rows left after removing rows without 'Embarked'.
print(data.shape[0])

889


In [14]:
# Discard the rows with an unknown age (rows are removed rather than imputed),
# then confirm that no missing values remain.
data.dropna(axis=0, subset=['Age'], inplace=True)
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [None]:
# Number of rows left once every row with a missing value has been removed.
print(data.shape[0])

712


# Normalizing continuous data

In [15]:
# Continuous attributes of the dataset.
# NOTE(review): no scaling is applied to these columns anywhere in the visible notebook,
# even though MinMaxScaler is imported — confirm whether normalization happens downstream.
continuous_attr = ['Fare']
# Backward-compatible alias for the original (misspelled) name.
continous_attr = continuous_attr

# Age

In [16]:
# Summary statistics of Age before rounding; the minimum is fractional (infants).
data['Age'].describe()

count    712.000000
mean      29.642093
std       14.492933
min        0.420000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [None]:
# Summary statistics of Age after rounding down to whole years, to check how little
# the distribution changes before the column is overwritten.
np.floor(data['Age']).describe()

count    712.000000
mean      29.622191
std       14.502891
min        0.000000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [17]:
# Round every age down to whole years (fractional and 'xx.5' estimated ages included).
data['Age'] = np.floor(data['Age'])

In [18]:
# Ages are whole numbers after the floor step, so store them as integers.
data['Age'] = data['Age'].astype(int)

In [19]:
# Show the passengers whose age is now 0, i.e. those younger than one year.
data.loc[data['Age'].lt(1)]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
79,1.0,2,male,0,0,2,29.0,S
306,1.0,1,male,0,1,2,151.55,S
470,1.0,3,female,0,2,1,19.2583,C
645,1.0,3,female,0,2,1,19.2583,C
756,1.0,2,male,0,1,1,14.5,S
804,1.0,3,male,0,0,1,8.5167,C
832,1.0,2,male,0,1,1,18.75,S


# Binarizing sex and survival

In [20]:
# 'Survived' is displayed as 0.0 / 1.0 in the earlier previews; with the unlabeled
# rows removed it can be stored as plain 0 / 1 integers.
data['Survived'] = data['Survived'].astype(int)

In [21]:
# Encode the sensitive attribute as an integer flag: 1 = female, 0 = male.
# dtype=int is passed explicitly because pandas >= 2.0 makes get_dummies return
# boolean columns by default, which would store True/False instead of the 0/1 values
# the rest of the notebook (and the saved CSV) expects.
data['Sex'] = pd.get_dummies(data['Sex'], dtype=int)['female']

In [22]:
# Preview the frame after 'Survived' and 'Sex' have been converted to 0 / 1.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,0,22,1,0,7.25,S
2,1,1,1,38,1,0,71.2833,C
3,1,3,1,26,0,0,7.925,S
4,1,1,1,35,1,0,53.1,S
5,0,3,0,35,0,0,8.05,S


# One hot encoding categorical data

In [23]:
# Categorical columns that are expanded into one indicator column per category below.
categorical_attr = ['Embarked']

In [24]:
# Append one indicator column per category ('<attr>_<value>') for every categorical
# attribute; the original columns are dropped in a later cell.
# dtype=int keeps the indicators as 0/1 on pandas >= 2.0, where get_dummies would
# otherwise return boolean columns.
# NOTE: re-running this cell appends the indicator columns a second time.
for attr in categorical_attr:
    dummies = pd.get_dummies(data[attr], prefix=attr, dtype=int)
    data = pd.concat([data, dummies], axis=1)

In [25]:
# Preview the frame with the new indicator columns next to the original 'Embarked'.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,0,22,1,0,7.25,S,0,0,1
2,1,1,1,38,1,0,71.2833,C,1,0,0
3,1,3,1,26,0,0,7.925,S,0,0,1
4,1,1,1,35,1,0,53.1,S,0,0,1
5,0,3,0,35,0,0,8.05,S,0,0,1


In [26]:
# Remove the original categorical columns now that their indicator columns exist.
data.drop(columns=categorical_attr, inplace=True)

In [27]:
# Preview the frame after the original categorical columns were removed.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,0,22,1,0,7.25,0,0,1
2,1,1,1,38,1,0,71.2833,1,0,0
3,1,3,1,26,0,0,7.925,0,0,1
4,1,1,1,35,1,0,53.1,0,0,1
5,0,3,0,35,0,0,8.05,0,0,1


# Reordering the columns

In [28]:
# Final column layout of the saved dataset: ordinary features first, then the
# sensitive attribute ('Sex') and finally the label ('Survived').
columns_order = (
    ['Pclass', 'Fare']
    + ['Embarked_C', 'Embarked_Q', 'Embarked_S']
    + ['Age', 'SibSp', 'Parch']
    + ['Sex', 'Survived']
)

In [29]:
# Preview the column layout before it is rearranged.
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,0,22,1,0,7.25,0,0,1
2,1,1,1,38,1,0,71.2833,1,0,0
3,1,3,1,26,0,0,7.925,0,0,1
4,1,1,1,35,1,0,53.1,0,0,1
5,0,3,0,35,0,0,8.05,0,0,1


In [30]:
# Rearrange the columns into the layout given by columns_order and preview the result.
data = data.loc[:, columns_order]
data.head()

Unnamed: 0_level_0,Pclass,Fare,Embarked_C,Embarked_Q,Embarked_S,Age,SibSp,Parch,Sex,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,7.25,0,0,1,22,1,0,0,0
2,1,71.2833,1,0,0,38,1,0,1,1
3,3,7.925,0,0,1,26,0,0,1,1
4,1,53.1,0,0,1,35,1,0,1,1
5,3,8.05,0,0,1,35,0,0,0,0


# Saving data

In [31]:
# Number of rows that will be written to disk.
print(data.shape[0])

712


In [32]:
# Write the prepared frame to CSV; the PassengerId index is written as the first column.
# NOTE(review): the 'post_prep' directory must already exist — to_csv does not create it.
data.to_csv('../data/titanic/post_prep/titanic.csv')