<a href="https://colab.research.google.com/github/ichhitsapkota143/Machine-Learning/blob/main/Day21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Pipelines**
  * It is a mechanism which chains together multiple steps so that the output of each step is used as input to the next step.
  * It makes it easy to apply the same preprocessing to train and test.


# **🔧 Why use a pipeline?**

1.   Keeps code clean and organized.
2.   Avoids data leakage by applying transformations only on training data.
3. Makes the model easily reproducible
4. Helps in hyperparameter tuning using tools like GridSearchCV

# ***Titanic Code without Pipeline***

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
df=pd.read_csv('/content/train (3).csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
#step-1 train_test split
X_train,X_test,y_train,y_split=train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [7]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [8]:
#step-2 applying imputation(filling null value with mean for age and most repeated value for embarked)
si_age=SimpleImputer()
si_embarked=SimpleImputer(strategy='most_frequent')
X_train_age=si_age.fit_transform(X_train[['Age']])
X_train_embarked=si_embarked.fit_transform(X_train[['Embarked']])
X_test_age=si_age.fit_transform(X_test[['Age']])
X_test_embarked=si_embarked.fit_transform(X_test[['Embarked']])

In [9]:
#step 3 OneHotEncoding for Sex and Embarked
ohe_sex=OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_embarked=OneHotEncoder(sparse_output=False,handle_unknown='ignore')

X_train_sex=ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked=ohe_embarked.fit_transform(X_train_embarked)

X_test_sex=ohe_sex.fit_transform(X_test[['Sex']])
X_test_embarked=ohe_embarked.fit_transform(X_test_embarked)

In [10]:
X_train_rem=X_train.drop(columns=['Sex','Age','Embarked'])
X_test_rem=X_test.drop(columns=['Sex','Age','Embarked'])

In [11]:
X_train_transformed=np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)
X_test_transformed=np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)

In [12]:
clf=DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [13]:
X_train_transformed.shape

(712, 10)

In [14]:
y_pred=clf.predict(X_test_transformed)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0])

In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_split,y_pred)

0.7374301675977654

dumping the above model for future use

In [17]:
import pickle
pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))

loading the model from the model directory

In [18]:
#importing values
ohe_sex=pickle.load(open('models/ohe_sex.pkl','rb'))
ohe_embarked=pickle.load(open('models/ohe_embarked.pkl','rb'))
clf=pickle.load(open('models/clf.pkl','rb'))

In [19]:
#testing
test_input=np.array([2,'male',31.0,0,0,10.5,'S'],dtype=object).reshape(1,7)

In [20]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [21]:
test_input_sex=ohe_sex.transform(test_input[:,1].reshape(1,1))



In [22]:
test_input_embarked=ohe_embarked.transform(test_input[:,1].reshape(1,1))

In [23]:
test_input_age=test_input[:,2].reshape(1,1)

In [24]:
test_input_transformed=np.concatenate((test_input[:,[0,3,4,5]],test_input_age,test_input_sex,test_input_embarked),axis=1)

In [25]:
test_input_transformed.shape

(1, 10)

In [26]:
clf.predict(test_input_transformed)

array([1])

In [27]:
#this predicts that the passenger will die

In [28]:
#if we dont use pipeline, we have to repeate all the proces of training the model to test the model

# ***Using Pipelining Concept***

In [29]:
df=pd.read_csv('/content/train (3).csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [32]:
#step-1 train/test split
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [33]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [34]:
y_train.sample(5)

Unnamed: 0,Survived
135,0
3,1
386,0
735,0
271,1


In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [42]:
#step-2 imputation transformer
trf1=ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]), #2 is index of age
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6]) #6 is index of embarked
],remainder='passthrough')

In [43]:
#step-2 OneHotEncoding
trf2=ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
    #1 is index of sex and 6 is that of embarked
],remainder='passthrough')

In [46]:
#step-3 scaling
trf3=ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])
#minmax for all columns, slice function is used to apply transformation
#here minmaxscaler is applied for all 0 to 10 index after applying above transformer
#0 included 10 excluded

In [52]:
#step-4 Feature Scaling
from sklearn.feature_selection import SelectKBest,chi2
trf4=SelectKBest(score_func=chi2,k=8)

In [53]:
#step-5 train the model
trf5=DecisionTreeClassifier()

# **Create Pipeline**

In [57]:
from sklearn.pipeline import Pipeline
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])
#Pipeline([(transformation_name,transformer_object)])

# **Pipeline vs make_pipeline**
In scikit-learn, both Pipeline and make_pipeline are used to build machine learning pipelines, but they differ slightly in how they are constructed and when to use each.


  * Pipeline: You manually assign names to each step.

  * make_pipeline: It automatically assigns names based on the class names.

In [59]:
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [61]:
#train model
pipe.fit(X_train,y_train)
#pipe.fit_transform() is used when we are not training the algorithm

In [62]:
#Explore the pipeline
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x7a051236cb80>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [70]:
pipe.named_steps['columntransformer-1']

In [71]:
pipe.named_steps['columntransformer-1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [74]:
pipe.named_steps['columntransformer-1'].transformers_[1][1]

In [75]:
pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [81]:
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_

array([29.49884615])

In [82]:
#Display pipeline
from sklearn import set_config
set_config(display='diagram')

In [83]:
#predict
y_pred=pipe.predict(X_test)

In [86]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# **Exporting Pipelining**

In [87]:
#using pickel
import pickle

In [96]:
pickle.dump(pipe,open('pipe.pkl','wb'))

In [98]:
pipe=pickle.load(open('pipe.pkl','rb'))

In [99]:
#assume user input
test_input=np.array([2,'male',31.0,0,0,10.5,'S'],dtype=object).reshape(1,7)

In [100]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [102]:
pipe.predict(test_input)
#0 means the person died



array([0])