In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the raw training set; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/train.csv')

## Numerical features

The only issue with the numerical features is their NaN values.

In [None]:
# Preview the numeric columns.
df.select_dtypes('number')

In [None]:
# Remove the PassengerId column before any further processing.
df = df.drop(columns=['PassengerId'])

In [None]:
# Count the missing values in each numeric column.
df.select_dtypes('number').isna().sum()

In [None]:
def fill_na_num(df):
    """Fill missing values in the numeric columns with each column's mean.

    Only numeric columns that contain at least one NaN are touched. The frame
    is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be imputed.

    Returns
    -------
    pandas.DataFrame
        The input frame, after imputation.
    """
    missing_per_column = df.select_dtypes('number').isna().sum()
    columns_to_fill = missing_per_column.index[missing_per_column.values != 0]

    for column in columns_to_fill:
        # Each gap receives the mean of that same column.
        df[column] = df[column].fillna(df[column].mean())
    return df

# Impute the missing numeric values with column means and rebind df to the result.
df = fill_na_num(df)

In [None]:
# Inspect the Age and Fare columns.
df.loc[:, ['Age', 'Fare']]

## Object features

In [None]:
# Preview the object-dtype columns.
df.select_dtypes('object')

**Cabin feature**

In [None]:
# Frequency of each distinct Cabin value.
df['Cabin'].value_counts()

In [None]:
def cabin_processor(df):
    """Split ``Cabin`` into a leading character and a cabin number.

    ``CabinChar`` is the first character of the cabin string and stays NaN
    where ``Cabin`` is missing. ``CabinNum`` is the first run of digits found
    after that character, as an integer; it is ``-1`` when ``Cabin`` is
    missing or contains no digits.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Cabin`` and with ``CabinChar`` and
        ``CabinNum`` appended as the last two columns.
    """
    cabin = df['Cabin']
    cabin_char = cabin.str[:1]

    # Pull out the first digit run instead of casting the raw remainder:
    # values such as "D" (no number) or "C23 C25 C27" (several cabins)
    # would otherwise make the integer conversion raise ValueError.
    digits = cabin.str[1:].str.extract(r'(\d+)', expand=False)
    cabin_num = pd.to_numeric(digits, errors='coerce').fillna(-1).astype('int')

    return df.drop(columns=['Cabin']).assign(CabinChar=cabin_char, CabinNum=cabin_num)

# Split Cabin into CabinChar / CabinNum and drop the original column.
df = cabin_processor(df)

In [None]:
# Inspect the two columns derived from Cabin.
df.loc[:, ['CabinChar', 'CabinNum']]

**Name feature**

In [None]:
# Frequency of each distinct Name value.
df['Name'].value_counts()

In [None]:
def name_processor(df):
    """Split ``Name`` on commas into ``firstName`` and ``secondName``.

    ``secondName`` is the text before the first comma and ``firstName`` is
    the text after the last comma. No whitespace is stripped, so a space
    following the comma stays at the start of ``firstName``. A name without
    a comma ends up unchanged in both columns.

    The two new columns are written onto ``df`` itself; the returned frame
    is a new one with ``Name`` removed.
    """
    name_parts = df['Name'].str.split(',')
    df['firstName'] = name_parts.str[-1]
    df['secondName'] = name_parts.str[0]

    return df.drop(columns=['Name'])

# Replace Name with the derived firstName / secondName columns.
df = name_processor(df)

In [None]:
# Inspect the two columns derived from Name.
df.loc[:, ['firstName', 'secondName']]

**Ticket feature**

In [None]:
# Frequency of each two-character Ticket prefix.
df['Ticket'].str.slice(stop=2).value_counts()

In [None]:
def ticket_processor(df):
    """Shorten every ``Ticket`` value to its first two characters.

    Values shorter than two characters are kept as they are and missing
    values stay missing. The column is overwritten on ``df`` itself, and
    that same frame is returned.
    """
    ticket_prefix = df['Ticket'].str.slice(stop=2)
    df['Ticket'] = ticket_prefix
    return df

# Keep only the first two characters of each Ticket value.
df = ticket_processor(df)

In [None]:
# Inspect the shortened Ticket column.
df.loc[:, ['Ticket']]

**NaN values**

In [None]:
# Count the missing values in each object-dtype column.
df.select_dtypes('object').isna().sum()

In [None]:
# Impute Embarked and Ticket with the most frequent value of each column.
for column in ('Embarked', 'Ticket'):
    most_frequent = df[column].mode().iloc[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Frequency of each CabinChar value (missing entries are not counted).
df['CabinChar'].value_counts()

In [None]:
def cabinChar_processor(df, choices=('A', 'B', 'C', 'D'), random_state=42):
    """Fill missing ``CabinChar`` values with letters drawn at random.

    Each missing entry receives one value drawn uniformly, with replacement,
    from ``choices``. Existing values are left as they are. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CabinChar`` column.
    choices : sequence of str, optional
        Candidate fill values. Defaults to ``('A', 'B', 'C', 'D')``.
    random_state : int or None, optional
        Seed for the local random generator. The fixed default makes a full
        re-run of the notebook produce the same imputed column; pass ``None``
        for a fresh, non-repeatable draw.

    Returns
    -------
    pandas.DataFrame
        The input frame, with no missing ``CabinChar`` values.
    """
    missing = df['CabinChar'].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute, so skip the draw entirely.
        return df

    # A dedicated generator keeps the draw repeatable without touching
    # NumPy's global random state.
    rng = np.random.default_rng(random_state)
    df.loc[missing, 'CabinChar'] = rng.choice(list(choices), size=n_missing)
    return df

# Fill the missing CabinChar entries with randomly drawn letters.
df = cabinChar_processor(df) 

In [None]:
# Check dtypes and non-null counts once the NaN handling is done.
df.info()

**Encoding**

In [None]:
# Look at five random rows of the object-dtype columns before encoding.
df.select_dtypes('object').sample(5)

In [None]:
def encoding_processor(df):
    """Encode every object-dtype column of ``df`` with an ``OrdinalEncoder``.

    The selected columns are overwritten on ``df`` itself with the output of
    ``OrdinalEncoder.fit_transform``, and that same frame is returned. The
    fitted encoder is not kept, so the category mapping is not available to
    the caller afterwards.
    """
    object_columns = df.select_dtypes('object').columns
    encoded_values = OrdinalEncoder().fit_transform(df[object_columns])

    df[object_columns] = encoded_values
    return df

# Encode every object-dtype column with OrdinalEncoder.
df = encoding_processor(df)

**Save new data**

In [None]:
# Write the processed frame into the same folder as the raw data, without the index column.
folder_path = '../data/'
df.to_csv(folder_path + 'train_df_cleaned.csv', index=False)