# Univariate Outliers Detection

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np


## Load data

In [27]:
from sklearn.datasets import load_iris

# fetch the iris bunch and keep its raw pieces under short names
iris = load_iris()
X, target, names = iris.data, iris.target, iris.target_names
# feature table plus the numeric class code as an extra 'species' column
data = pd.DataFrame(X, columns=iris.feature_names).assign(species=iris.target)
# translate the class codes into readable species labels
species_codes = [0, 1, 2]
species_labels = ['setosa', 'versicolor', 'virginica']
data['species'] = data['species'].replace(to_replace=species_codes, value=species_labels)
# show (rows, columns) as the cell output
data.shape

(150, 5)

# Univariate outlier detection for a DataFrame

In [62]:
## replace outliers of a 1D array by NaN according to the Inter Quartile Range (IQR)
def remove_outliers_IQR(v:np.array, verbose:bool = False)->np.array:
    """
    Replace outliers of a 1D array by NaN according to the Inter Quartile Range (IQR).
    A value is an outlier when it is below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    Quartiles are estimated only with the non-NaN values; existing NaN are kept as NaN.
    v -- array of values to be analyzed (it is never modified).
    verbose -- display extra information (default, False).
    return -- new float array, same shape as v, where outliers were replaced by NaN.
    """
    # work on a float copy: NaN cannot be stored in integer arrays and
    # the data of the caller must stay untouched
    values = np.array(v, dtype=float)
    # quartiles need at least one valid value (covers empty and all-NaN input)
    is_valid = ~np.isnan(values)
    if not is_valid.any():
        return values
    valid_values = values[is_valid]
    # estimate boundary thresholds (NaN excluded, otherwise the quantiles would be NaN)
    Q1, Q3 = np.quantile(valid_values, [0.25, 0.75])
    IQR = Q3 - Q1
    t_lower = Q1 - 1.5*IQR
    t_upper = Q3 + 1.5*IQR
    # display
    if verbose:
        print('Thresholds: lower = %.5f / upper = %.5f'%(t_lower, t_upper))
    # flag values outside of the thresholds (only valid values are compared)
    is_outlier = np.zeros(values.shape, dtype=bool)
    is_outlier[is_valid] = (valid_values < t_lower) | (valid_values > t_upper)
    values[is_outlier] = np.nan
    # return
    return values


## replace outliers of a 1D array by NaN according to standard deviation rule of Normal Distribution
def remove_outliers_Z(v:np.array, threshold:int = 3, verbose:bool = False)->np.array:
    """
    Replace outliers of a 1D array by NaN according to standard deviation rule of Normal Distribution.
    A value is an outlier when the absolute value of its z-score, (x - mean) / std, is greater than threshold.
    Mean and standard deviation are estimated only with the non-NaN values; existing NaN are kept as NaN.
    v -- array of values to be analyzed (it is never modified).
    threshold -- number of standard deviations used to decide if a value is an outlier or not (default, 3 sigmas).
    verbose -- display extra information (default, False).
    return -- new float array, same shape as v, where outliers were replaced by NaN.
    """
    # work on a float copy: NaN cannot be stored in integer arrays and
    # the data of the caller must stay untouched
    values = np.array(v, dtype=float)
    # statistics need at least one valid value (covers empty and all-NaN input)
    is_valid = ~np.isnan(values)
    if not is_valid.any():
        return values
    valid_values = values[is_valid]
    # estimate mean and standard deviation (NaN excluded)
    v_mean = np.mean(valid_values)
    v_std = np.std(valid_values)
    # display the thresholds expressed in the units of the data
    if verbose:
        print('Thresholds: lower = %.5f / upper = %.5f'%(v_mean - threshold*v_std, v_mean + threshold*v_std))
    # without dispersion the z-score is undefined and no value can be an outlier
    if v_std == 0:
        return values
    # estimate z score
    z_scores = (valid_values - v_mean) / v_std
    # flag values whose z-score is beyond the threshold (not the raw values)
    is_outlier = np.zeros(values.shape, dtype=bool)
    is_outlier[is_valid] = np.abs(z_scores) > threshold
    values[is_outlier] = np.nan
    # return
    return values


## univariate outliers detection for all numerical variables in a df
def univariate_outliers_detection(data:pd.DataFrame, 
                                  is_remove:bool = True,  
                                  methodology:'function' = remove_outliers_IQR, 
                                  verbose:bool = False)->pd.DataFrame:
    """
    Univariate outliers detection for all numerical variables (float64 / int64 columns) in a df.
    data -- dataframe to be analyzed (it is never modified).
    is_remove -- if removing outliers or just detect (default, True).
    methodology -- function that receives a 1D float array and returns an array of the same
                   length where outliers were replaced by NaN (default, remove_outliers_IQR()).
    verbose -- display extra information (default, False).
    return -- if is_remove, copy of data where outliers were replaced by NaN; otherwise a boolean
              df with the same index and columns as data, True where an outlier was detected.
    """
    # copy data
    df = data.copy()
    # columns of numerical variables
    cols_num = df.select_dtypes(include=['float64', 'int64']).columns.values 
    # initialize if just detection (same index as data, so the mask stays aligned with it)
    if not is_remove:
        df_mask = pd.DataFrame(False, index = df.index, columns = df.columns)
    # loop of numerical columns
    for col in cols_num:
        # get data as an independent float array: methodology may write NaN in place,
        # which is impossible on integer arrays and unsafe on views of the df
        v = df[col].to_numpy(dtype=float, copy=True)
        # nothing to analyze in an empty column
        if v.size == 0:
            continue
        # remember values missing beforehand so they are not reported as outliers
        was_missing = np.isnan(v)
        # outliers detection
        v_cleaned = np.asarray(methodology(v), dtype=float)
        # outliers are only the NaN introduced by methodology
        is_outlier = np.isnan(v_cleaned) & ~was_missing
        # count detected outliers
        noutliers = int(np.sum(is_outlier))
        # validate if outliers was found
        if noutliers == 0:
            continue
        # display
        if verbose:
            print(f'In "{col}" was detected {noutliers} outliers.')
        # if removing
        if is_remove:
            df[col] = v_cleaned
        # if just detection (positional assignment, independent of the index labels)
        else:
            df_mask[col] = is_outlier
    # return 
    if is_remove:
        return df
    else:
        return df_mask

In [63]:
# run the default detector on the iris data; the returned df is discarded, only the report is wanted
_ = univariate_outliers_detection(data=data, is_remove=True, verbose=True)

In "sepal width (cm)" was detected 4 outliers.
