## PIPELINE

A pipeline is a way to connect multiple steps of a machine learning process (like cleaning data, transforming it, and training a model) into one single object.



##### Uses of a pipeline

Makes code cleaner

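Avoids mistakes (for example, the preprocessing steps are fitted on the training data only, so nothing from the test set leaks into them)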

Easy to reuse or share

Works well with model tuning tools like GridSearchCV



In [53]:
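# Plan: split the columns into categorical and numerical groups, then
#   categorical columns -> SimpleImputer (fill missing values) -> encode
#   numerical columns   -> SimpleImputer (fill missing values) -> standardize
#
# Finally, apply the algorithm (Logistic Regression) to the preprocessed data.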

In [42]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [43]:
# Directory holding the CSV datasets. It defaults to the original local folder and
# can be redirected with the DATASET_DIR environment variable, so the notebook
# also runs on machines with a different directory layout.
DATA_DIR = Path(os.environ.get("DATASET_DIR", "C:\\Users\\HP\\OneDrive\\Desktop\\DATASET"))

# Load the COVID toy dataset into a DataFrame.
df = pd.read_csv(DATA_DIR / "covid_toy - covid_toy.csv")

In [44]:
# Show the loaded DataFrame as the cell output (the rendered preview elides the middle rows).
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [45]:
from sklearn.compose import ColumnTransformer #Combines the numeric and categorical transformers:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #Fills missing categorical values with the most frequent value.
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.linear_model import LogisticRegression# Logistic Regression is a machine learning algorithm used to predict categories,like yes / no ,pass / fail etc
from sklearn.model_selection import train_test_split

In [46]:
# Target: the `has_covid` label; features: every remaining column.
y = df["has_covid"]
X = df.drop(columns="has_covid")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Columns to preprocess, grouped by the kind of transformation they need.
# NOTE(review): `cough` appears in the data preview but is in neither list —
# confirm that leaving it out of the model is intended.
categorical_features = ["gender", "city"]
numeric_features = ["age", "fever"]


In [48]:
# Numeric branch: replace missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a logistic-regression classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])

# Fit the whole pipeline on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out split (scored in a later cell).
y_pred = clf.predict(X_test)

In [49]:
#y_pred

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
# Fraction of held-out rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc

0.65

In [24]:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [25]:
# Directory holding the CSV datasets. It defaults to the original local folder and
# can be redirected with the DATASET_DIR environment variable, so the notebook
# also runs on machines with a different directory layout.
DATA_DIR = Path(os.environ.get("DATASET_DIR", "C:\\Users\\HP\\OneDrive\\Desktop\\DATASET"))

# Load the Social Network Ads dataset and preview its first rows.
df = pd.read_csv(DATA_DIR / "Social_Network_Ads - Social_Network_Ads.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [30]:
# Remove the columns that are not used as model inputs.
# errors="ignore" keeps this cell re-runnable: because `df` is reassigned, a second
# execution would otherwise raise KeyError for the already-dropped columns.
df = df.drop(columns=["User ID", "Gender"], errors="ignore")

In [31]:
# Target: the `Purchased` label; features: every remaining column.
y = df["Purchased"]
x = df.drop(columns="Purchased")

In [33]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize -> project onto 2 principal components -> random forest (100 trees, seeded).
# NOTE(review): if `x` holds only two feature columns, n_components=2 keeps every
# component, so PCA reduces nothing — confirm this is intended.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [34]:
# Display the (not yet fitted) pipeline object as the cell output.
pipe

In [35]:
# Fit every step of the pipeline on the training split.
pipe.fit(X=x_train, y=y_train)

In [36]:
# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipe.predict(X=x_test)

In [39]:
# Fraction of held-out rows whose predicted label matches the true label.
acc = accuracy_score(y_test, y_pred)
# The bare last expression already renders the value; the extra print() only
# duplicated it in the cell output.
acc

0.8875


0.8875