# Read Dataset

In [1]:
import pandas as pd
# Never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None

# Raw training data, indexed by the PassengerId column.
df = pd.read_csv(
    "../datasets/raw/train.csv",
    index_col="PassengerId",
)
df.sample(n=3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S
65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C
177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S


# Transformation

## Rename Columns to Lowercase

In [2]:
# Lowercase every column label; str.lower is applied to each label and the
# frame is modified in place.
df.rename(columns=str.lower, inplace=True)

df.columns

Index(['survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked'],
      dtype='object')

In [3]:
# Independent copy: the feature columns below are added to it, not to df.
df_transform = df.copy(deep=True)

## title

In [4]:
# Names look like "Surname, Title. Given names": take the text before the
# first period, then the part after the comma, stripped of whitespace.
df_transform["title"] = df_transform["name"].apply(
    lambda full_name: full_name.split(".")[0].split(",")[1].strip()
)

def replace_title(df):
    """Collapse a passenger's raw title into one of the common titles.

    Parameters
    ----------
    df : mapping-like row
        A single row (despite the parameter name). It must provide
        ``"title"`` and, when the title is ``"Dr"``, also ``"sex"``.

    Returns
    -------
    ``"Mr"``, ``"Mrs"`` or ``"Miss"`` for the rare titles listed below;
    any other title is returned unchanged.
    """
    raw_title = df["title"]

    # Rare titles and the common title each group is folded into.
    aliases = (
        ("Mr", ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', "Sir")),
        ("Mrs", ('the Countess', 'Mme', "Lady", "Dona")),
        ("Miss", ('Mlle', 'Ms')),
    )
    for common, rare in aliases:
        if raw_title in rare:
            return common

    # "Dr" carries no gender, so the sex column decides; it is only read here.
    if raw_title == "Dr":
        return "Mr" if df["sex"] == "male" else "Mrs"

    return raw_title
    
# Row-wise apply: replace_title reads the title and, for "Dr", the sex column.
df_transform["title"] = df_transform.apply(replace_title, axis="columns")
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
211,0,3,"Ali, Mr. Ahmed",male,24.0,0,0,SOTON/O.Q. 3101311,7.05,,S,Mr
469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q,Mr
740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S,Mr


## deck

In [5]:
def deck_category(cabin):
    """Map a raw cabin value to a deck label.

    Parameters
    ----------
    cabin : str or missing value
        Cabin string such as ``"C123"`` or ``"F G73"``. Any non-string
        value (e.g. float NaN for a missing cabin, or None) is treated as
        missing.

    Returns
    -------
    str
        ``"FE"`` or ``"FG"`` when the cabin mentions deck F together with
        deck E or G, otherwise the first of ``A, B, C, D, E, F, G, T``
        found in the string, otherwise ``"unknown"``.
    """
    # Missing cabins arrive as float NaN; treat every non-string the same way
    # instead of failing on the substring checks below.
    if not isinstance(cabin, str):
        return "unknown"

    # The combined decks must be tested before the single letters: otherwise
    # "F E69" already matches "E" and "F G73" already matches "F", and these
    # two labels can never be produced.
    if "F" in cabin and "E" in cabin:
        return "FE"
    if "F" in cabin and "G" in cabin:
        return "FG"

    # Single decks, checked in the same priority order as before.
    for deck in "ABCDEFGT":
        if deck in cabin:
            return deck

    # No recognised deck letter: return an explicit label rather than
    # falling off the end of the function and yielding None.
    return "unknown"
    
# Derive the deck label for every passenger from the cabin column.
df_transform["deck"] = df_transform["cabin"].apply(deck_category)
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
516,0,1,"Walker, Mr. William Anderson",male,47.0,0,0,36967,34.0208,D46,S,Mr,D
869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S,Mr,unknown
177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S,Master,unknown


## fare

In [6]:
def transform_fare(fare):
    """Bucket a fare into an ordinal category.

    Returns 1 for fares up to and including 50, 2 for fares above 50 and up
    to and including 150, and 3 for everything else. A NaN fare fails both
    comparisons and therefore also yields 3.
    """
    if fare <= 50:
        return 1
    # Reaching this point means the fare is not <= 50.
    if fare <= 150:
        return 2
    return 3

# Ordinal fare bucket (1, 2 or 3) for every passenger.
df_transform["fare_category"] = df_transform["fare"].apply(transform_fare)
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
377,1,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.25,,S,Miss,unknown,1
220,0,2,"Harris, Mr. Walter",male,30.0,0,0,W/C 14208,10.5,,S,Mr,unknown,1
625,0,3,"Bowen, Mr. David John ""Dai""",male,21.0,0,0,54636,16.1,,S,Mr,unknown,1


In [7]:
# Keep the text before the first space of the ticket: the prefix (e.g. "PC")
# when there is one, otherwise the whole ticket string.
df_transform["no_ticket"] = df_transform["ticket"].apply(
    lambda ticket: ticket.partition(" ")[0]
)
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C,Miss,D,2,35273
725,1,1,"Chambers, Mr. Norman Campbell",male,27.0,1,0,113806,53.1,E8,S,Mr,E,2,113806
650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23.0,0,0,CA. 2314,7.55,,S,Miss,unknown,1,CA.


## is_alone

In [8]:
import numpy as np

# 1 when both sibsp and parch are zero, 0 otherwise (stored as uint8).
df_transform["is_alone"] = (
    df_transform["sibsp"].eq(0) & df_transform["parch"].eq(0)
).astype(np.uint8)
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
386,0,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.5,,S,Mr,unknown,2,S.O.C.,1
573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36.0,0,0,PC 17474,26.3875,E25,S,Mr,E,1,PC,1
833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C,Mr,unknown,1,2671,1


## family size

In [9]:
# Sum of sibsp and parch; the passenger themself is not included.
df_transform["family_size"] = df_transform["sibsp"].add(df_transform["parch"])
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C,Mrs,E,2,16966,0,2
430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S,Mr,E,1,SOTON/O.Q.,1,0
223,0,3,"Green, Mr. George Henry",male,51.0,0,0,21440,8.05,,S,Mr,unknown,1,21440,1,0


## age * pclass

In [10]:
# Interaction term; it is NaN wherever age is missing.
df_transform["age*class"] = df_transform["age"].mul(df_transform["pclass"])
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size,age*class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S,Mr,unknown,1,312991,1,0,
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,C,2,113803,0,1,35.0
537,0,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.55,B38,S,Mr,B,1,113050,1,0,45.0


## fare per person

In [11]:
# Divide the fare by family_size + 1 (the +1 keeps the divisor non-zero when
# family_size is 0).
df_transform["fare_per_person"] = df_transform["fare"].div(
    df_transform["family_size"] + 1
)
df_transform.sample(n=3)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,deck,fare_category,no_ticket,is_alone,family_size,age*class,fare_per_person
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q,Mr,F,1,383121,1,0,,7.75
65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C,Mr,unknown,1,PC,1,0,,27.7208
566,0,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.15,,S,Mr,unknown,1,A/4,0,2,72.0,8.05


In [12]:
# Persist the engineered features; the PassengerId index is written too.
df_transform.to_csv(path_or_buf="../datasets/feature_engineering/train.csv")