In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Read the passenger table; the path is resolved relative to the working directory.
titanic_data = pd.read_csv(filepath_or_buffer="titanic_dataset.csv")

## Dividing the dataset into training and validation data (80% / 20%)

In [3]:
# Target is the 'Survived' column; the features are every remaining column.
y = titanic_data['Survived']
X = titanic_data.drop(columns='Survived')

# Hold out 20% of the rows for validation. The fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## Separating columns based on data type

In [4]:
# Split the feature columns by dtype so each group can get its own preprocessing.
#
# Numeric columns: any integer/float dtype (not just int64/float64, so int32,
# float32 or nullable Int64 columns are no longer silently dropped). Boolean
# columns are excluded, matching the original int/float-only selection.
numerical_cols = [
    col for col in X.columns
    if pd.api.types.is_numeric_dtype(X[col]) and not pd.api.types.is_bool_dtype(X[col])
]

# Text columns: classic object dtype, plus pandas' dedicated string dtype, which
# newer pandas versions can infer for CSV text instead of object.
# NOTE(review): every column selected here is one-hot encoded downstream, including
# any free-text / high-cardinality columns the CSV may contain — confirm that is intended.
categorical_cols = [
    col for col in X.columns
    if pd.api.types.is_object_dtype(X[col]) or pd.api.types.is_string_dtype(X[col])
]

## Handling missing values and encoding categorical columns

In [5]:
# Numeric columns: fill gaps with the column mean.
numerical_transform = SimpleImputer(strategy='mean')

# Text columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps categories that were not seen during fit
# from raising an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transform = Pipeline(steps=categorical_steps)

## Combining the preprocessing for the different column types

In [6]:
# Route each column group to its own transformer. The categorical block is listed
# first, so its encoded columns come first in the transformed output.
column_routes = [
    ('cat', categorical_transform, categorical_cols),
    ('num', numerical_transform, numerical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## Pipeline creation and training

In [7]:
# Random forest with 100 trees; the seed is fixed so repeated runs give the same forest.
# NOTE(review): a regressor is fit on 'Survived'; if that column is a binary label,
# a classifier may be the better choice — confirm.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and the estimator so both are fit on the training rows only
# and applied consistently at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_model = Pipeline(steps=pipeline_steps)
my_model.fit(X_train, y_train)

## Prediction and mean absolute error (MAE)

In [8]:
# Score the held-out validation rows with the fitted pipeline.
pred = my_model.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so the
# value is unchanged, but the documented order stays correct if sample weights
# or an asymmetric metric are introduced later.
print("Mean absolute error is ",mean_absolute_error(y_valid, pred))

Mean absolute error is  0.17
