# Initial Setup

In [1]:
# This is needed only for the purpose of the notebook
!pip install ipytest




[notice] A new release of pip is available: 24.2 -> 24.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Importing required libraries
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pytest
import ipytest
# NOTE(review): presumably this call is what makes the %%ipytest cell magic used by
# the test cells further down available — confirm against the ipytest documentation.
ipytest.autoconfig()

In [3]:
# Load the iris dataset from sklearn.datasets; the returned object exposes
# .data, .feature_names and .target, which the next cell uses.
iris = datasets.load_iris()

In [4]:
# Tabular view of the measurements, with the class label appended as a 'target' column
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

### Setting up the classes to build a simple model

In [5]:
class SimplePipeline:
    """Minimal load -> split -> train -> score pipeline for the iris dataset.

    The dataset is loaded and split as soon as the object is created. The
    model stays ``None`` until ``train`` (or ``run_pipeline``) is called, so
    ``predict`` and ``get_accuracy`` must only be used after training.
    """

    def __init__(self):
        self.frame = None
        self.feature_names = None
        # Every split starts as None; load_dataset() below fills them in.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Load the iris dataset and (re)create the train/test split.

        Sets ``feature_names``, ``frame``, ``X_train``, ``X_test``,
        ``y_train`` and ``y_test``. The split uses a fixed ``random_state``,
        so calling this method again reproduces the same split.
        """
        dataset = datasets.load_iris()

        # Drop the trailing ' (cm)' unit (5 characters) from each header.
        self.feature_names = [fn[:-5] for fn in dataset.feature_names]
        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)
        self.frame['target'] = dataset.target

        # 65% of the rows are held out for testing.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)

    def train(self, algorithm=LogisticRegression):
        """Instantiate ``algorithm`` and fit it on the training split.

        ``algorithm`` is called as ``algorithm(solver='lbfgs')``, so it must
        accept that keyword. The fitted estimator is stored in ``self.model``.
        """
        # multi_class='auto' is intentionally not passed: it is already the
        # default for LogisticRegression, and passing it explicitly triggers a
        # deprecation warning on recent scikit-learn releases.
        self.model = algorithm(solver='lbfgs')
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the trained model's predictions for ``input_data``."""
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the trained model's score on the held-out test split."""
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Execution method for running the pipeline several times."""
        self.load_dataset()
        self.train()

## Adding a Preprocessing step

In [6]:
class PipelineWithFeatureEngineering(SimplePipeline):
    """SimplePipeline variant that standardizes the features before modelling."""

    def __init__(self):
        # The parent constructor loads and splits the data, so X_train exists below.
        super().__init__()

        # Fit the scaler (not the model) once, on the training split only.
        self.scaler = StandardScaler().fit(self.X_train)

    def apply_scaler(self):
        """Replace X_train and X_test with their scaled versions.

        The scaler fitted in __init__ is reused as-is; it is not re-fitted here.
        """
        scale = self.scaler.transform
        self.X_train = scale(self.X_train)
        self.X_test = scale(self.X_test)

    def predict(self, input_data):
        """Scale ``input_data`` with the fitted scaler, then predict with the model."""
        return self.model.predict(self.scaler.transform(input_data))

    def run_pipeline(self):
        """Reload the data, scale it, and train the model, in that order."""
        for step in (self.load_dataset, self.apply_scaler, self.train):
            step()

In [8]:
# Build the pipeline, run load -> scale -> train, then report the score on the test split.
# NOTE: the global name `pipeline` is rebound later by the pytest fixture of the same
# name in the Testing section, so this instance is only valid up to that cell.
pipeline = PipelineWithFeatureEngineering()
pipeline.run_pipeline()
accuracy_score = pipeline.get_accuracy()
print(f'The Accuracy of the model is: {accuracy_score}')

The Accuracy of the model is: 0.9591836734693877




# Testing

In [9]:
@pytest.fixture
def pipeline():
    """Return a newly built, untrained pipeline whose dataset has just been (re)loaded."""
    fresh_pipeline = PipelineWithFeatureEngineering()
    fresh_pipeline.load_dataset()
    return fresh_pipeline

### Creating the test

In [8]:
%%ipytest

def test_scaler_preprocessing_brings_x_train_mean_near_zero(pipeline):
    original_mean = pipeline.X_train.stack().mean()
    
    pipeline.apply_scaler()
    
    assert original_mean > pipeline.X_train.mean()
    assert np.isclose(pipeline.X_train.mean(), 0.0, atol=1e-3)

    print(f'The mean of the original X train is: {original_mean}')
    print(f'The mean of the transformed X train is: {pipeline.X_train.mean()}')

def test_scaler_preprocessing_brings_x_train_std_near_one(pipeline):
    pipeline.apply_scaler()
    
    assert np.isclose(pipeline.X_train.std(), 1.0, atol=1e-3)
    print(f'The SD of the transformed X train is : {pipeline.X_train.std()}')

..                                                                                           [100%]
2 passed in 0.43s


### Making the tests fail

In [9]:
%%ipytest

def test_scaler_preprocessing_brings_x_train_mean_near_zero(pipeline):
    original_mean = pipeline.X_train.stack().mean()
    
    pipeline.apply_scaler()

    # Changing the assertion, so it will fail
    assert original_mean < pipeline.X_train.mean()

    # Changing the value in isclose to make it fail
    assert not np.isclose(pipeline.X_train.mean(), 1.0, atol=1e-3)

F                                                                                            [100%]
_____________________ test_scaler_preprocessing_brings_x_train_mean_near_zero _____________________

pipeline = <__main__.PipelineWithDataEngineering object at 0x000001C183E5A210>

    def test_scaler_preprocessing_brings_x_train_mean_near_zero(pipeline):
        original_mean = pipeline.X_train.stack().mean()
    
        pipeline.apply_scaler()
    
        # Cambiamos la aserción para que falle
>       assert original_mean < pipeline.X_train.mean()
E       assert 3.5889423076923075 < -5.978123978750843e-17
E        +  where -5.978123978750843e-17 = <built-in method mean of numpy.ndarray object at 0x000001C183EDB870>()
E        +    where <built-in method mean of numpy.ndarray object at 0x000001C183EDB870> = array([[ 0.04967733, -2.13501958,  0.56638299,  0.24421145],\n       [ 1.52580366,  0.49581023,  1.16990584,  0.6583091...    [-0.19634373, -1.08268765, -0.03713987, -0.16988623],\

In [10]:
%%ipytest

def test_scaler_preprocessing_brings_x_train_std_near_one(pipeline):
    # Adding huge variation in the data
    pipeline.X_train *= 1000

    pipeline.apply_scaler()

    # Testing with the original tolerance
    assert np.isclose(pipeline.X_train.std(), 1.0, atol=1e-3)

[31mF[0m[31m                                                                                            [100%][0m
[31m[1m______________________ test_scaler_preprocessing_brings_x_train_std_near_one ______________________[0m

pipeline = <__main__.PipelineWithDataEngineering object at 0x000002A4464BAE10>

    [0m[94mdef[39;49;00m [92mtest_scaler_preprocessing_brings_x_train_std_near_one[39;49;00m(pipeline):[90m[39;49;00m
        [90m# Adding huge variation in the data[39;49;00m[90m[39;49;00m
        pipeline.X_train *= [94m1000[39;49;00m[90m[39;49;00m
    [90m[39;49;00m
        pipeline.apply_scaler()[90m[39;49;00m
    [90m[39;49;00m
        [90m# Testing with the original tolerance[39;49;00m[90m[39;49;00m
>       [94massert[39;49;00m np.isclose(pipeline.X_train.std(), [94m1.0[39;49;00m, atol=[94m1e-3[39;49;00m)[90m[39;49;00m
[1m[31mE       assert np.False_[0m
[1m[31mE        +  where np.False_ = <function isclose at 0x000002A427256230>(np.flo