<a href="https://colab.research.google.com/github/harshitlohani04/regression-using-pipeline/blob/master/ML_pipeline_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task : To create the ML pipeline for regression model
Step 1: Importing the important libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import set_config

Step 2: Import the dataset

In [2]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
automobile = fetch_ucirepo(id=10)

# data (as pandas dataframes)
X = automobile.data.features
y = automobile.data.targets

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.shape, X_train.shape

((164, 1), (164, 25))

# Step 3: Creating the pipeline

Creating a custom transformer class for dropping the non-important columns from the dataset

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class drop_cols(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.dropped_columns_ = []

    def fit(self, X, y=None):
        # Reset dropped columns during fit
        self.dropped_columns_ = []

        # Identify columns to drop based on threshold
        X = pd.DataFrame(X)
        for col in X.columns:
            max_val = X[col].value_counts(normalize=True).max()
            if max_val > self.threshold:
                self.dropped_columns_.append(col)

        return self

    def transform(self, X):
        # Drop identified columns from the DataFrame
        X_new = pd.DataFrame(X)
        X_new = X_new.drop(columns=self.dropped_columns_, axis=1)
        return X_new

    def get_dropped_columns(self):
        # Method to get the list of dropped columns
        return self.dropped_columns_

Creating a Custom Label Encoder to encode the data

In [8]:
class custom_LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.encoded_cols = []

    def fit(self, X, y=None):
        return self

    def transform(self, X):
      # We have to convert the type of the dataset to dataframe because sklearn internally changes the type of the dataset to numpy
        X_new = X.copy()
        X_new = pd.DataFrame(X_new)
        for col in X_new.select_dtypes(include = object).columns:
            X_new[col] = self.label_encoder.fit_transform(X_new[col])
        return X_new

Using the drop_cols and custom_LabelEncoder Custom transformers

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# create an object of the above class
dropCols = drop_cols()

# Creating a function that would contain the object if the above class so that we can keep track of the dropped columns.
def create_pipeline(dataset):
    transformedDs = dropCols.fit_transform(dataset)
    numCols = transformedDs.select_dtypes(include = np.number).columns
    objCols = transformedDs.select_dtypes(include = object).columns

    # Creating the pipeline inside this method itself
    steps_int = [("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")), ("scaler", StandardScaler())]
    steps_obj = [("imputer", SimpleImputer(strategy = "constant", fill_value="missing")), ("custom_labelEncoder", custom_LabelEncoder())]

    pipe1 = Pipeline(steps_int)
    pipe2 = Pipeline(steps_obj)

    # Combining the 2 pipelines
    ct = ColumnTransformer([("int", pipe1, list(numCols)),
                            ("obj", pipe2, list(objCols))])

    return ct, dropCols.get_dropped_columns()

Visualizing the pipeline

In [10]:
from sklearn import set_config
set_config(display='diagram')

preprocessor_pipeline , colsDrop= create_pipeline(X_train)
preprocessor_pipeline


Creating another custom Transformer that would apply the transforms to the data

In [19]:
class DynamicPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.dropped_cols = []

    def fit(self, X, y=None):
        self.pipeline, self.droppped_cols = create_pipeline(X)
        self.pipeline.fit(X)
        return self

    def transform(self, X):
        X_new = X.drop(columns = self.dropped_cols, axis = 1)
        return self.pipeline.transform(X_new)


Final pipe

In [20]:
from sklearn.ensemble import RandomForestRegressor
final_pipe = Pipeline([("dynamic_preprocessor", DynamicPreprocessor()), ("regressor", RandomForestRegressor())])
final_pipe

Predicting the values

In [23]:
final_pipe.fit(X_train, y_train)
y_pred = final_pipe.predict(X_test)
y_pred

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


array([ 0.54,  1.75,  0.14,  2.47, -0.58,  1.09,  0.06,  0.  ,  0.1 ,
        1.27,  0.26,  1.25, -0.01, -0.65, -0.56,  0.09,  1.34,  2.75,
        0.59,  0.2 ,  0.12,  0.04,  1.46,  1.59,  0.99,  1.48,  1.98,
       -0.65,  0.78, -0.84, -0.54,  1.  ,  0.1 , -0.49,  1.82,  1.2 ,
        0.18,  2.82,  0.66,  2.75, -0.03])

Evaluating the RMSE score for the predictions

In [24]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.5807312463504998