# Read Dataset

In [1]:
import pandas as pd

# Render every column when a frame is displayed; the row limit is not changed here.
pd.set_option("display.max_columns", None)

# Raw training split; PassengerId becomes the row index instead of a regular column.
raw_train_path = "../datasets/raw/train.csv"
df = pd.read_csv(raw_train_path, index_col="PassengerId")
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Information: normalize column names to lowercase

In [2]:
# Lower-case every column label, mutating `df` in place (no new frame is created).
df.columns = df.columns.str.lower()

df.columns

Index(['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked'],
      dtype='object')

# Transformation

## title

In [3]:
# Independent deep copy: columns added to `df_transform` do not appear on `df`.
df_transform = df.copy(deep=True)

In [4]:
def _title_from_name(full_name):
    """Pull the honorific out of a passenger name.

    Takes the part of ``full_name`` before its first period, then the segment
    that follows the first comma in that part, with surrounding whitespace
    removed (e.g. "Braund, Mr. Owen Harris" -> "Mr").
    """
    before_period = full_name.split(".")[0]
    after_comma = before_period.split(",")[1]
    return after_comma.strip()


df_transform["title"] = df_transform["name"].apply(_title_from_name)

# Rare honorifics and the common title each one is folded into.
_TITLE_ALIASES = {
    **dict.fromkeys(("Don", "Major", "Capt", "Jonkheer", "Rev", "Col", "Sir"), "Mr"),
    **dict.fromkeys(("the Countess", "Mme", "Lady", "Dona"), "Mrs"),
    **dict.fromkeys(("Mlle", "Ms"), "Miss"),
}


def replace_title(df):
    """Collapse a row's honorific into one of the common titles.

    Parameters
    ----------
    df : mapping-like row (despite the name, a single row, not a whole frame)
        Must expose ``df["title"]``; ``df["sex"]`` is read only when the
        title is "Dr".

    Returns
    -------
    The replacement title: "Mr", "Mrs" or "Miss" for the known rare titles,
    "Mr"/"Mrs" for "Dr" depending on whether sex equals "male", and the
    original title unchanged for anything else.
    """
    raw_title = df["title"]

    # "Dr" carries no gender information, so it is resolved through the sex column.
    if raw_title == "Dr":
        return "Mr" if df["sex"] == "male" else "Mrs"

    return _TITLE_ALIASES.get(raw_title, raw_title)
    
# Row-wise pass: every row is handed to replace_title and its result overwrites the title column.
normalized_titles = df_transform.apply(replace_title, axis="columns")
df_transform["title"] = normalized_titles

## Deck

In [5]:
def deck_category(cabin):
    """Map a raw cabin value to a deck label.

    Parameters
    ----------
    cabin : str or missing value
        Cabin text such as "C85" or "F G73". Any non-string value
        (float NaN, None, pd.NA, ...) is treated as missing.

    Returns
    -------
    str
        "unknown" when the cabin is missing or contains none of the known
        deck letters; otherwise the first matching label in this priority
        order: A, B, C, D, FE, FG, E, F, G, T.

    Notes
    -----
    The combined labels "FE" and "FG" are tested before the single letters
    E, F and G. If they were tested afterwards they could never be returned,
    because any cabin containing both letters already matches a single-letter
    test.
    """
    # Missing cabins arrive as NaN from pandas; accept any non-string as missing.
    if not isinstance(cabin, str):
        return "unknown"

    # Each label doubles as the set of letters that must all occur in the cabin text.
    for label in ("A", "B", "C", "D", "FE", "FG", "E", "F", "G", "T"):
        if all(letter in cabin for letter in label):
            return label

    # No recognised deck letter: return an explicit category rather than None.
    return "unknown"
    
# Derive the deck label of every passenger from the cabin column, then preview two rows.
df_transform["deck"] = df_transform["cabin"].apply(deck_category)
df_transform.head(2)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,unknown
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C


## Fare category and ticket prefix

In [6]:
def transform_fare(fare):
    """Bucket a fare into one of three ordinal categories.

    Returns 1 for fares up to and including 50, 2 for fares above 50 up to
    and including 150, and 3 for everything else.

    Note: a NaN fare fails both comparisons and therefore lands in bucket 3.
    """
    if fare <= 50:
        return 1
    return 2 if fare <= 150 else 3

# Element-wise bucketing of the fare column into the categories 1 / 2 / 3.
fare_buckets = df_transform["fare"].apply(transform_fare)
df_transform["fare_category"] = fare_buckets

In [7]:
def _ticket_prefix(ticket):
    """Return the text before the first space of ``ticket`` (the whole string if it has no space)."""
    return ticket.partition(" ")[0]


# Despite the column name, this holds the leading token of the ticket:
# a prefix such as "PC" when one exists, otherwise the full ticket string.
df_transform["no_ticket"] = df_transform["ticket"].apply(_ticket_prefix)
df_transform.head()

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,unknown,1,A/5
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,2,PC
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,unknown,1,STON/O2.
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,C,2,113803
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,unknown,1,373450


## is_alone

In [8]:
import numpy as np

# Flag rows where both `sibsp` and `parch` are zero, stored as unsigned 8-bit 0/1.
sibsp_is_zero = df_transform["sibsp"] == 0
parch_is_zero = df_transform["parch"] == 0
df_transform["is_alone"] = (sibsp_is_zero & parch_is_zero).astype(np.uint8)

## family size

In [9]:
# Sum of the two relative-count columns; the passenger is not included in this count.
df_transform["family_size"] = df_transform["sibsp"].add(df_transform["parch"])
df_transform.head(3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,unknown,1,A/5,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,2,PC,0,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,unknown,1,STON/O2.,1,0


## Age * PClass

In [10]:
# Interaction feature: element-wise product of age and passenger class
# (rows with a missing age stay missing).
df_transform["age*class"] = df_transform["age"].mul(df_transform["pclass"])
df_transform.head(3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size,age*class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,unknown,1,A/5,0,1,66.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,2,PC,0,1,38.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,unknown,1,STON/O2.,1,0,78.0


## Fare per Person

In [11]:
# family_size excludes the passenger, so 1 is added before dividing
# (presumably to count the passenger themselves — this also avoids division by zero).
party_size = df_transform["family_size"] + 1
df_transform["fare_per_person"] = df_transform["fare"].div(party_size)
df_transform.head(3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size,age*class,fare_per_person
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,unknown,1,A/5,0,1,66.0,3.625
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,2,PC,0,1,38.0,35.64165
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,unknown,1,STON/O2.,1,0,78.0,7.925


In [12]:
# Persist the engineered frame; the row index is written as the first CSV column.
df_transform.to_csv("../datasets/feature_engineering/train.csv", index=True)