## Column Transformer

Column tranformers are used to apply different preprocessing to different columns


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv(r"C:\Users\sunny\Desktop\bitly\titanic_train.csv")

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# we select four columns as under
d_abr = df.loc[:, ['Fare', 'Embarked', 'Sex', 'Age']]

In [7]:
d_abr

Unnamed: 0,Fare,Embarked,Sex,Age
0,7.2500,S,male,22.0
1,71.2833,C,female,38.0
2,7.9250,S,female,26.0
3,53.1000,S,female,35.0
4,8.0500,S,male,35.0
...,...,...,...,...
886,13.0000,S,male,27.0
887,30.0000,S,female,19.0
888,23.4500,S,female,
889,30.0000,C,male,26.0


In [32]:
# check for missing values
d_abr.isna().sum()

Fare          0
Embarked      2
Sex           0
Age         177
dtype: int64

In [43]:
d_abr.tail()

Unnamed: 0,Fare,Embarked,Sex,Age
886,13.0,S,male,27.0
887,30.0,S,female,19.0
888,23.45,S,female,
889,30.0,C,male,26.0
890,7.75,Q,male,32.0


In [47]:
d_abr.loc[d_abr.Embarked != 'NaN', :].shape

(891, 4)

In [62]:
# drop the two Nan values in the Embarked column manually
X =  d_abr.loc[d_abr.Embarked.notna(), :]

In [55]:
X.isna().sum()

Fare          0
Embarked      0
Sex           0
Age         177
dtype: int64

In [56]:
# imports
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [57]:
# making instances of the classes imported 
ohe = OneHotEncoder()
imp = SimpleImputer() # Using default parameters 


In [63]:
# make instances of the make_column_transformer class and pass the other two calsses 
# in the form of tuples each tuple has the name of the object and a list of columns 
# we want to one hot encode the Embarked and sex columns
# so we pass it in a list inside the ohe tuple
# and want to impute the age so pased it in a list inside the imp tuple
ct = make_column_transformer(
(ohe, ['Embarked','Sex']), 
(imp, ['Age']),
remainder = 'passthrough'
)

In [60]:
# by using fit_transform the function learns the behaviour we want and transform transforms it
ct.fit_transform(X)

array([[ 0.       ,  0.       ,  1.       , ...,  1.       , 22.       ,
         7.25     ],
       [ 1.       ,  0.       ,  0.       , ...,  0.       , 38.       ,
        71.2833   ],
       [ 0.       ,  0.       ,  1.       , ...,  0.       , 26.       ,
         7.925    ],
       ...,
       [ 0.       ,  0.       ,  1.       , ...,  0.       , 29.6420927,
        23.45     ],
       [ 1.       ,  0.       ,  0.       , ...,  1.       , 26.       ,
        30.       ],
       [ 0.       ,  1.       ,  0.       , ...,  1.       , 32.       ,
         7.75     ]])