# Imports

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from data_gadgets.cleaning import Cleaner

# Reading Data

In [3]:
# Locate the raw CSV relative to this notebook (two directories up, then
# data/raw) and load it into a DataFrame.
path = os.path.join(
    '..', '..',
    'data', 'raw',
    'data_task1.csv',
)
data = pd.read_csv(filepath_or_buffer=path)

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Cleaning Names

In [5]:
# Cleaner is the project-local helper imported from data_gadgets.cleaning;
# the same instance is reused by the three cleaning cells that follow.
cleaner = Cleaner()

In [6]:
# Normalise the column headers. In the preview further down, the headers
# appear lower-cased (e.g. PassengerId -> passengerid).
# NOTE(review): any other header changes made by Cleaner.headers are not
# visible from this notebook - confirm against its implementation.
data = cleaner.headers(data)

In [7]:
# Presumably splits the columns around the 'survived' target column -
# TODO confirm against Cleaner.separate_data.
# NOTE(review): `cols` is not referenced again in this notebook; check whether
# this call is needed for a side effect on `cleaner` before removing it.
cols = cleaner.separate_data(data, 'survived')

In [8]:
# Normalise categorical labels. In the preview below, `sex` changes from
# male/female to Male/Female.
# NOTE(review): which other columns Cleaner.categories touches cannot be
# told from this notebook - confirm against its implementation.
data = cleaner.categories(data)

In [9]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,892,0,3,"Kelly, Mr. James",Male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",Female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",Male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",Male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",Female,22.0,1,1,3101298,12.2875,,S


# Filling Missing Values

In [10]:
# Impute missing numeric values with the median of the same column.
for column in ('age', 'fare'):
    data[column] = data[column].fillna(data[column].median())

In [11]:
# Reduce each cabin code to its first character, title-cased, and mark
# passengers with no recorded cabin with the explicit placeholder 'N'.
# The earlier Python loop only produced 'N' for missing cabins because
# str(NaN) is 'nan'; filling the gap explicitly keeps the same result without
# depending on that string representation, and the .str accessor replaces the
# element-by-element loop.
data['cabin'] = (
    data['cabin']
    .fillna('N')
    .astype(str)
    .str[0]
    .str.title()
)

In [12]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,892,0,3,"Kelly, Mr. James",Male,34.5,0,0,330911,7.8292,N,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",Female,47.0,1,0,363272,7.0,N,S
2,894,0,2,"Myles, Mr. Thomas Francis",Male,62.0,0,0,240276,9.6875,N,Q
3,895,0,3,"Wirz, Mr. Albert",Male,27.0,0,0,315154,8.6625,N,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",Female,22.0,1,1,3101298,12.2875,N,S


# Merging Values

In [13]:
# Collapse the sibsp count into a 0/1 flag: every value above 1 becomes 1,
# all other values are kept. Series.clip performs this cap in one vectorized
# step instead of a Python-level loop over the column.
data['sibsp'] = data['sibsp'].clip(upper=1)

In [14]:
# Collapse the parch count into a 0/1 flag: every value above 1 becomes 1,
# all other values are kept. Series.clip performs this cap in one vectorized
# step instead of a Python-level loop over the column.
data['parch'] = data['parch'].clip(upper=1)

In [15]:
# family = 1 when the passenger has a non-zero sibsp or parch value, else 0.
# The comparison is vectorized rather than looped. Using >= 1 (instead of
# == 1) gives the same answer once the two counts have been capped at 1 by
# the preceding cells, and still gives the right answer if this cell is run
# before them.
data['family'] = (
    (data['sibsp'] >= 1) | (data['parch'] >= 1)
).astype('int64')

In [16]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,family
0,892,0,3,"Kelly, Mr. James",Male,34.5,0,0,330911,7.8292,N,Q,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",Female,47.0,1,0,363272,7.0,N,S,1
2,894,0,2,"Myles, Mr. Thomas Francis",Male,62.0,0,0,240276,9.6875,N,Q,0
3,895,0,3,"Wirz, Mr. Albert",Male,27.0,0,0,315154,8.6625,N,S,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",Female,22.0,1,1,3101298,12.2875,N,S,1


# Dropping Values

In [17]:
# Drop the columns that are not carried forward; sibsp and parch have already
# been folded into the `family` flag.
data = data.drop(
    columns=['name', 'passengerid', 'ticket', 'sibsp', 'parch'],
)

In [18]:
data.head()

Unnamed: 0,survived,pclass,sex,age,fare,cabin,embarked,family
0,0,3,Male,34.5,7.8292,N,Q,0
1,1,3,Female,47.0,7.0,N,S,1
2,0,2,Male,62.0,9.6875,N,Q,0
3,0,3,Male,27.0,8.6625,N,S,0
4,1,3,Female,22.0,12.2875,N,S,1


# Encoding Categories

In [19]:
# One-hot encode the remaining categorical columns as 0/1 integers. The first
# level of each column is dropped, so k categories yield k-1 indicator columns.
data = pd.get_dummies(
    data=data,
    dtype=int,
    drop_first=True,
)

In [20]:
data.head()

Unnamed: 0,survived,pclass,age,fare,family,sex_Male,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_N,embarked_Q,embarked_S
0,0,3,34.5,7.8292,0,1,0,0,0,0,0,0,1,1,0
1,1,3,47.0,7.0,1,0,0,0,0,0,0,0,1,0,1
2,0,2,62.0,9.6875,0,1,0,0,0,0,0,0,1,1,0
3,0,3,27.0,8.6625,0,1,0,0,0,0,0,0,1,0,1
4,1,3,22.0,12.2875,1,0,0,0,0,0,0,0,1,0,1


# Saving Data

In [21]:
# Would write the encoded frame (without its index) to the interim data
# folder. The call is commented out, so running this notebook does not
# currently save anything.
# NOTE(review): re-enable the line below or remove this cell.
# data.to_csv('../../data/interim/data_task1.csv', index=False)