# Pipeline And Model Building

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns' , None)

In [2]:
df = pd.read_csv('../data/raw/train.csv')

### 1. Train Test Split (Before Doing Anything)

**Here Stratified Shuffle Splitting is used to mainting the proportions of categories across train and test**

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits= 2 , test_size=0.2)
for trainIndices , testIndices in split.split(df , df[['Pclass' , 'Sex' , 'Survived']]):
    stratified_train_set = df.iloc[trainIndices , :]
    stratified_test_set = df.iloc[testIndices , :]

In [4]:
stratified_train_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0000,,S
290,291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.8500,,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0000,C23 C25 C27,S
177,178,0,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,PC 17595,28.7125,C49,C
98,99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
228,229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18.0,0,0,236171,13.0000,,S
231,232,0,3,"Larsson, Mr. Bengt Edvin",male,29.0,0,0,347067,7.7750,,S
399,400,1,2,"Trout, Mrs. William H (Jessie L)",female,28.0,0,0,240929,12.6500,,S
348,349,1,3,"Coutts, Master. William Loch ""William""",male,3.0,1,1,C.A. 37671,15.9000,,S


### Custom Functions For Pipeline

In [5]:
from sklearn.base import BaseEstimator , TransformerMixin

### 1. Creating Title

In [6]:
class createTitle(BaseEstimator , TransformerMixin):
    
    def fit(self , X , y = None):
        return self
    
    def transform(self , X , y=None):

        X_transformed = X.copy()
        # Extract the title
        X_transformed['Title'] = X_transformed['Name'].apply(lambda x : x.split(',')[1].split('.')[0].replace(' ' , '').casefold())

        # merge the synonms
        X_transformed['Title'] = X_transformed['Title'].replace(['mlle', 'ms'], 'miss')
        X_transformed['Title'] = X_transformed['Title'].replace('mme', 'mrs')

        # Create a other category
        rare_titles = ['don', 'rev', 'dr', 'major', 'lady', 'sir', 'col', 'capt', 'thecountess', 'jonkheer']
        X_transformed['Title'] = X_transformed['Title'].replace(rare_titles, 'Rare')

        return X_transformed

### 2. Creating Family Column

In [7]:
class createFamily(BaseEstimator , TransformerMixin):

    def binFamily(self , x : int) -> str:
        if x == 1:
            return 'alone'
        if x >=2 and x <= 4:
            return 'small'
        return 'large'

    def fit(self , X , y = None):
        return self
    
    def transform(self , X , y = None):

        X_transformed = X.copy()
        # Create the family size column
        X_transformed['FamilySize'] = X_transformed['SibSp'] + X_transformed['Parch'] + 1

        X_transformed['FamilyGroup'] = X_transformed['FamilySize'].apply(self.binFamily)

        return X_transformed

### 3. Creating Deck Feature

In [8]:
class createDeck(BaseEstimator , TransformerMixin):

    def fit(self, X , y=None):
        return self
    
    def transform(self , X , y = None):

        X_transformed = X.copy()

        X_transformed['Deck'] = X_transformed['Cabin'].apply(lambda x : x[0] if type(x) == str else 'U')

        return X_transformed

Creating a pipeline for above custom transformations

In [9]:
from sklearn.pipeline import Pipeline

initial_feature_creation = Pipeline(
    steps=[
        ('create_title' , createTitle()),
        ('create_family' , createFamily()),
        ('create_deck' , createDeck()),
    ]
)

In [10]:
initial_feature_creation.fit_transform(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,FamilyGroup,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,mr,2,small,U
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,mrs,2,small,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,miss,1,alone,U
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,mrs,2,small,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,mr,1,alone,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rare,1,alone,U
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,miss,1,alone,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,miss,4,small,U
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,mr,1,alone,C


### Create a column tranformer to impute values 

In [11]:
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn import set_config

imputation = ColumnTransformer(
    transformers = [
        ('impute_age' , SimpleImputer(strategy='median') , ['Age']),
        ('impute_embarked' , SimpleImputer(strategy='most_frequent') , ['Embarked'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)
imputation.set_output(transform='pandas')

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


### Create a column Transformer to Encode Categorical features

In [12]:
from sklearn.preprocessing import OrdinalEncoder , OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer


nominal_features = ['Embarked', 'Sex', 'Title', 'FamilyGroup', 'Deck']
ordinal_features = ['Pclass']
numerical_features = ['Age', 'Fare'] 


preprocessor = ColumnTransformer(
    transformers=[
        ('OHE', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features),
        ('Ordinal', OrdinalEncoder(), ordinal_features),
    ],
    remainder='passthrough', # Ensures any other columns are kept
    verbose_feature_names_out=False
)

# 3. Set the output to pandas DataFrame
preprocessor.set_output(transform='pandas')


0,1,2
,transformers,"[('OHE', ...), ('Ordinal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(self.columns, axis=1 , errors = 'ignore')

In [14]:
cols_to_drop = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize']

final_pipeline = Pipeline(
    steps=[
        ('initial_features' , initial_feature_creation),
        ('imputation' , imputation),
        ('preprocessor' , preprocessor),
        ('column_dropper' , ColumnDropper(columns=cols_to_drop))
    ]
)


In [15]:
X_train = stratified_train_set.drop('Survived' , axis = 1)
y_train = stratified_train_set['Survived']
X_test = stratified_test_set.drop('Survived' , axis=1)
y_test = stratified_test_set['Survived']

In [16]:
X_train_transformed = final_pipeline.fit_transform(X_train)
X_test_transformed = final_pipeline.transform(X_test)



In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 200 , max_depth=None , random_state=42)
rf.fit(X_train_transformed , y_train)
rf.score(X_test_transformed , y_test)

0.8379888268156425

In [18]:
# Include the classifier as the last step
all_in_one_pipeline = Pipeline(
    steps=[
        ('initial_features' , initial_feature_creation),
        ('imputation' , imputation),
        ('preprocessor' , preprocessor),
        ('column_dropper' , ColumnDropper(columns=cols_to_drop)),
        ('classifier', RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)) # Model is inside
    ]
)

# Now the workflow is even simpler:
# 1. Fit the entire pipeline (preprocessing + model) on the training data
all_in_one_pipeline.fit(X_train, y_train)

# 2. Score the entire pipeline on the test data
accuracy = all_in_one_pipeline.score(X_test, y_test)

print(f"Final Accuracy: {accuracy}")

Final Accuracy: 0.8379888268156425




In [19]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300 , 400 , 500],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_samples_split': [2, 5, 10]
}

In [20]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=all_in_one_pipeline,
    param_grid=param_grid,
    cv=5,  
    scoring='accuracy',
    verbose=1, 
    n_jobs=-1 
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits




0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__max_depth': [None, 10, ...], 'classifier__min_samples_leaf': [1, 2, ...], 'classifier__min_samples_split': [2, 5, ...], 'classifier__n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,steps,"[('create_title', ...), ('create_family', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('OHE', ...), ('Ordinal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,columns,"['Name', 'Ticket', ...]"

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Print the best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Print the best cross-validated score
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

Best parameters found:  {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best cross-validation score: 0.8259


In [22]:
# Use the fitted grid_search object to score on the test data
final_accuracy = grid_search.score(X_test, y_test)

print(f"Final accuracy of the best model on the test set: {final_accuracy:.4f}")

Final accuracy of the best model on the test set: 0.8380




In [23]:
best_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_leaf': 4,
    'min_samples_split': 2,
    'random_state': 42 
}

final_model = Pipeline(
    steps=[
        ('initial_features', initial_feature_creation),
        ('imputation', imputation),
        ('drop_features', ColumnDropper(columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize'])),
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(**best_params)) 
    ]
)

## Use the Entire Dataset to train

In [24]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

In [25]:
X_train = train.drop('Survived' , axis= 1 )
y_train = train['Survived']

In [26]:
final_model.fit(X_train , y_train)

0,1,2
,steps,"[('initial_features', ...), ('imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('create_title', ...), ('create_family', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,columns,"['Name', 'Ticket', ...]"

0,1,2
,transformers,"[('OHE', ...), ('Ordinal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
pred = final_model.predict(test)



In [28]:
final_predictions = pd.DataFrame(test['PassengerId'])

In [29]:
final_predictions['Survived'] = final_model.predict(test)



In [30]:
final_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [31]:
final_predictions.to_csv('../data/processed/predictions.csv' , index=False)

In [32]:
pd.read_csv('../data/raw/gender_submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [34]:
# Export the model
import joblib
titanic_best_model = grid_search.best_estimator_
joblib.dump(titanic_best_model , '../models/model.pkl')

['../models/model.pkl']