In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load train.csv; the relative path is resolved against the current working directory.
df = pd.read_csv('../data/train.csv')

### Numerical features

The only issue with the numerical features is NaN values.

In [3]:
df[df.select_dtypes('number').columns]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,1,1,,2,0,27.14
1,1,0,3,,0,0,13.35
2,2,0,3,0.33,1,2,71.29
3,3,0,3,19.00,0,0,13.04
4,4,1,3,25.00,0,0,7.76
...,...,...,...,...,...,...,...
99995,99995,1,2,62.00,0,0,14.86
99996,99996,0,2,66.00,0,0,11.15
99997,99997,0,3,37.00,0,0,9.95
99998,99998,0,3,51.00,0,1,30.92


In [4]:
# Remove the PassengerId identifier column
df = df.drop(columns='PassengerId')

In [5]:
df[df.select_dtypes('number').columns].isna().sum()

Survived       0
Pclass         0
Age         3292
SibSp          0
Parch          0
Fare         134
dtype: int64

In [6]:
def fill_na_num(df):
    """Impute missing values in numeric columns with the column mean.

    Only numeric columns that actually contain NaN are rewritten; every
    other column is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each NaN of a numeric column is replaced
        by that column's mean (NaN values are skipped when averaging).
    """
    # Work on a copy so re-running the cell never depends on earlier mutation.
    filled = df.copy()
    numeric = filled.select_dtypes('number')
    nan_features = [col for col in numeric.columns if numeric[col].isna().any()]

    for feature in nan_features:
        filled[feature] = filled[feature].fillna(numeric[feature].mean())
    return filled

df = fill_na_num(df)

In [7]:
df[['Age', 'Fare']]

Unnamed: 0,Age,Fare
0,38.355472,27.14
1,38.355472,13.35
2,0.330000,71.29
3,19.000000,13.04
4,25.000000,7.76
...,...,...
99995,62.000000,14.86
99996,66.000000,11.15
99997,37.000000,9.95
99998,51.000000,30.92


## Object features

In [8]:
df[df.select_dtypes('object').columns]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Oconnor, Frankie",male,209245,C12239,S
1,"Bryan, Drew",male,27323,,S
2,"Owens, Kenneth",male,CA 457703,,S
3,"Kramer, James",male,A. 10866,,S
4,"Bond, Michael",male,427635,,S
...,...,...,...,...,...
99995,"Bell, Adele",female,PC 15008,D17243,C
99996,"Brown, Herman",male,13273,,S
99997,"Childress, Charles",male,,,S
99998,"Caughlin, Thomas",male,458654,,S


**Cabin feature**

In [9]:
df.Cabin.value_counts()

Cabin
C19828    6
C6906     5
C13699    5
C14534    5
B11110    5
         ..
A21234    1
A4905     1
C7208     1
A5788     1
D17243    1
Name: count, Length: 26992, dtype: int64

In [10]:
def cabin_processor(df):
    """Split ``Cabin`` into its first character and its numeric remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Cabin`` column (NaN allowed). It is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Cabin`` and with two extra columns:
        ``CabinChar`` (first character of the cabin, NaN when the cabin is
        missing) and ``CabinNum`` (the rest of the cabin as an integer, or
        -1 when the cabin is missing or the rest is not a number).
    """
    cabin = df['Cabin']
    # errors='coerce' maps an empty or non-numeric remainder (e.g. cabin 'T')
    # to NaN, so it receives the same -1 sentinel as a missing cabin instead
    # of raising during the integer cast.
    cabin_num = pd.to_numeric(cabin.str[1:], errors='coerce').fillna(-1).astype('int')

    return df.assign(CabinChar=cabin.str[:1], CabinNum=cabin_num).drop('Cabin', axis=1)

df = cabin_processor(df)

In [11]:
df[['CabinChar', 'CabinNum']]

Unnamed: 0,CabinChar,CabinNum
0,C,12239
1,,-1
2,,-1
3,,-1
4,,-1
...,...,...
99995,D,17243
99996,,-1
99997,,-1
99998,,-1


**Name feature**

In [12]:
df.Name.value_counts()

Name
Johnson, John        31
Smith, James         24
Smith, John          24
Johnson, William     22
Smith, Michael       20
                     ..
Davis, Andre          1
Rosenblum, Ronald     1
Landry, Ernesto       1
Mcdonald, Frances     1
Enciso, Tyler         1
Name: count, Length: 92144, dtype: int64

In [13]:
# split the name feature into first name and second name
def name_processor(df):
    """Split ``Name`` ("Second, First") into ``firstName`` and ``secondName``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Name`` and with ``firstName`` (text after the
        last comma) and ``secondName`` (text before the first comma), both
        stripped of surrounding whitespace.
    """
    parts = df['Name'].str.split(',')

    # Strip so the blank that follows the comma does not end up in the value.
    return df.assign(
        firstName=parts.str[-1].str.strip(),
        secondName=parts.str[0].str.strip(),
    ).drop('Name', axis=1)

df = name_processor(df)

In [14]:
df[['firstName', 'secondName']]

Unnamed: 0,firstName,secondName
0,Frankie,Oconnor
1,Drew,Bryan
2,Kenneth,Owens
3,James,Kramer
4,Michael,Bond
...,...,...
99995,Adele,Bell
99996,Herman,Brown
99997,Charles,Childress
99998,Thomas,Caughlin


**Ticket feature**

In [15]:
df.Ticket.str[:2].value_counts()

Ticket
PC    6375
A.    3932
43    3012
44    2961
42    2802
      ... 
54      41
SW      38
53      33
W/      28
A4      19
Name: count, Length: 111, dtype: int64

In [16]:
def ticket_processor(df, n_chars=2):
    """Reduce ``Ticket`` to its leading characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Ticket`` column (NaN allowed). It is left
        unmodified.
    n_chars : int, default 2
        Number of leading characters of each ticket to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``Ticket`` column holds only the first
        ``n_chars`` characters of the original value; NaN stays NaN.
    """
    # keep only the first n_chars characters of Ticket
    return df.assign(Ticket=df['Ticket'].str[:n_chars])

df = ticket_processor(df)

In [17]:
df[['Ticket']]

Unnamed: 0,Ticket
0,20
1,27
2,CA
3,A.
4,42
...,...
99995,PC
99996,13
99997,
99998,45


**NaN values**

In [18]:
df[df.select_dtypes('object').columns].isna().sum()

Sex               0
Ticket         4623
Embarked        250
CabinChar     67866
firstName         0
secondName        0
dtype: int64

In [19]:
# Fill the gaps in Embarked and Ticket with each column's most frequent value

df = df.fillna({name: df[name].mode().iloc[0] for name in ('Embarked', 'Ticket')})

In [20]:
df.CabinChar.value_counts()

CabinChar
C    11825
B     7439
A     6307
D     3637
E     1749
F      663
G      482
T       32
Name: count, dtype: int64

In [21]:
def cabinChar_processor(df, choices=('A', 'B', 'C', 'D'), seed=42):
    """Replace missing ``CabinChar`` values with random draws from ``choices``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CabinChar`` column. It is left unmodified.
    choices : sequence of str, default ('A', 'B', 'C', 'D')
        Values to draw from, each with equal probability.
    seed : int or None, default 42
        Seed for the local random generator, so repeated runs give the same
        imputation. Pass ``None`` for a fresh, non-reproducible draw.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with no missing values left in ``CabinChar``.
    """
    out = df.copy()
    missing = out['CabinChar'].isnull()
    n_missing = int(missing.sum())

    # Nothing to draw when the column is already complete.
    if n_missing:
        # A local Generator keeps the result independent of global np.random state.
        rng = np.random.default_rng(seed)
        out.loc[missing, 'CabinChar'] = rng.choice(list(choices), size=n_missing)

    return out

df = cabinChar_processor(df) 

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Survived    100000 non-null  int64  
 1   Pclass      100000 non-null  int64  
 2   Sex         100000 non-null  object 
 3   Age         100000 non-null  float64
 4   SibSp       100000 non-null  int64  
 5   Parch       100000 non-null  int64  
 6   Ticket      100000 non-null  object 
 7   Fare        100000 non-null  float64
 8   Embarked    100000 non-null  object 
 9   CabinChar   100000 non-null  object 
 10  CabinNum    100000 non-null  int64  
 11  firstName   100000 non-null  object 
 12  secondName  100000 non-null  object 
dtypes: float64(2), int64(5), object(6)
memory usage: 9.9+ MB


**Encoding**

In [23]:
df[df.select_dtypes('object').columns].sample(5)

Unnamed: 0,Sex,Ticket,Embarked,CabinChar,firstName,secondName
44983,female,PC,S,C,Deborah,Mcneill
53650,male,43,S,A,Charles,Lin
93112,female,22,S,A,Sara,Donnelly
22770,male,S.,S,D,Herman,Ellis
38272,male,PC,S,D,Jason,Melchior


In [24]:
def encoding_processor(df, encoder=None, fit=True):
    """Ordinal-encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is left unmodified.
    encoder : OrdinalEncoder or None, default None
        Encoder to use. Pass your own instance to keep a handle on the
        fitted mapping so it can be reused on other data; when ``None`` a
        fresh ``OrdinalEncoder()`` is created and discarded after the call.
    fit : bool, default True
        When True the encoder is fitted on ``df`` before transforming.
        When False ``encoder`` must already be fitted and is only applied.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose object columns hold the encoder's codes.
        If ``df`` has no object columns the copy is returned unchanged.
    """
    encoded = df.copy()
    encoding_features = encoded.select_dtypes('object').columns

    # The encoder cannot be fitted on zero columns, so skip it entirely.
    if len(encoding_features) == 0:
        return encoded

    if encoder is None:
        encoder = OrdinalEncoder()

    if fit:
        codes = encoder.fit_transform(encoded[encoding_features])
    else:
        codes = encoder.transform(encoded[encoding_features])

    encoded[encoding_features] = codes
    return encoded

df = encoding_processor(df)

**Save new data**

In [25]:
# Write the cleaned frame next to the raw data, without the index column
folder_path = '../data/'
df.to_csv(f'{folder_path}train_df_cleaned.csv', index=False)