<a href="https://colab.research.google.com/github/harshitlohani04/medium-pipeline-tut/blob/master/medium_pipeline_tut.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Essential Imports

In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Creating a simple pipeline

In [2]:
# This walkthrough assumes "train_data" and "test_data" datasets exist and that
# a linear model is a reasonable fit for the training data.
from sklearn.linear_model import LinearRegression

# A Pipeline is described by an ordered list of (name, estimator) tuples;
# the steps are applied in the order they are listed here.
steps = [
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("linear_estimator", LinearRegression()),
]

# Chain the steps into a single estimator object.
pipe = Pipeline(steps=steps)

In [3]:
# Ask scikit-learn to render estimators as a diagram, which shows the
# pipeline's structure more intuitively than the plain-text repr.
from sklearn import set_config

set_config(display="diagram")

# A bare trailing expression makes the notebook display the pipeline.
pipe

# Creating a more advanced pipeline

Importing the dataset

In [4]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 10 through ucimlrepo.
automobile = fetch_ucirepo(id=10)

# Features and targets, exposed by ucimlrepo as pandas DataFrames.
X, y = automobile.data.features, automobile.data.targets

In [6]:
# Numerical columns: replace NaN entries with the column mean, then scale.
steps_num_cols = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("scaler", StandardScaler()),
]

# Object/categorical columns: fill gaps with the constant "missing", then
# one-hot encode (handle_unknown="ignore" makes unknown categories not raise).
steps_obj_cols = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
]

In [7]:
# One preprocessing pipeline per column type.
pipe1 = Pipeline(steps=steps_num_cols)  # numerical columns
pipe2 = Pipeline(steps=steps_obj_cols)  # object/categorical columns

In [8]:
# Route each group of columns through its own pipeline via ColumnTransformer.
from sklearn.compose import ColumnTransformer

# Column labels of X, split by dtype: numeric vs. object (categorical).
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=object).columns

# Each transformer entry is a (name, transformer, columns) tuple.
combined_pipe = ColumnTransformer(
    transformers=[
        ("int_cols", pipe1, num_cols),
        ("obj_cols", pipe2, cat_cols),
    ]
)

In [9]:
# Make sure the diagram representation is active, then display the combined
# preprocessing transformer.
from sklearn import set_config

set_config(display="diagram")

combined_pipe

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Rather than creating the regressor on its own and handling preprocessing
# separately, chain the preprocessor and the regressor into one Pipeline so a
# single object covers both stages.
final_pipe = Pipeline(
    steps=[
        ("preprocessor", combined_pipe),
        (
            "Estimator",
            # random_state fixes the randomness used when building the forest,
            # so repeated runs of the notebook produce the same model.
            RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42),
        ),
    ]
)

In [11]:
# Display the complete pipeline (preprocessor + estimator) as the cell output.
final_pipe