# Handling outliers

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

**An outlier is a data point that is significantly different from the remaining data.** 


## Trimming outliers from the dataset

**Trimming, or truncating, is the process of removing observations that show outliers in one
or more variables in the dataset.** 



In [None]:
# Load the Boston dataset from the local CSV file and preview its first rows.
boston = pd.read_csv("data/boston.csv")
boston.head()

In [None]:
# The outliers are the individual markers drawn beyond the whiskers; the whiskers reach
# out to the last observation that still lies inside the inter-quartile range proximity
# rule boundaries.
# The series is passed as `x=` explicitly: a bare positional argument is interpreted
# differently across seaborn versions (deprecated `x` in 0.11, `data` from 0.12 on).
sns.boxplot(x=boston['RM'])
plt.show()

In [None]:
def find_boundaries(df, variable, distance, rule="inter-quartile"):
    """Return the limits beyond which values of a variable count as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column to examine.
    variable : str
        Name of the column in ``df``.
    distance : float
        Multiplier applied to the spread measure (the IQR or the standard
        deviation, depending on ``rule``).
    rule : {"inter-quartile", "meanstddev"}, default "inter-quartile"
        "inter-quartile": limits are Q1 - distance * IQR and Q3 + distance * IQR.
        "meanstddev": limits are mean -/+ distance * standard deviation.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note that the upper limit
        comes first.

    Raises
    ------
    ValueError
        If ``rule`` is not one of the supported names. Without this check the
        function would fall through and return ``None``, which only surfaces
        later as an unpacking error at the call site.
    """
    values = df[variable]

    if rule == "inter-quartile":
        # compute each quartile once and reuse it
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        IQR = third_quartile - first_quartile
        lower_boundary = first_quartile - (IQR * distance)
        upper_boundary = third_quartile + (IQR * distance)
    elif rule == "meanstddev":
        center = values.mean()
        spread = values.std() * distance
        lower_boundary = center - spread
        upper_boundary = center + spread
    else:
        raise ValueError(
            "rule must be 'inter-quartile' or 'meanstddev', got {!r}".format(rule)
        )

    return upper_boundary, lower_boundary

In [None]:
# Limits for RM under the inter-quartile range proximity rule: 1.5 x IQR beyond the quartiles.
RM_upper_limit, RM_lower_limit = find_boundaries(boston, 'RM', 1.5)

In [None]:
# Display the upper and lower limits; RM values beyond them will be treated as outliers.
RM_upper_limit, RM_lower_limit

In [None]:
# Boolean array that is True for rows whose RM lies above the upper or below the lower limit.
outliers_RM = ((boston['RM'] > RM_upper_limit) | (boston['RM'] < RM_lower_limit)).to_numpy()
# Keep only the rows that were not flagged.
boston_trimmed = boston.loc[~outliers_RM]

In [None]:
# Boxplot of RM after trimming. The series is passed as `x=` explicitly because a bare
# positional argument is interpreted differently across seaborn versions.
sns.boxplot(x=boston_trimmed['RM'])
plt.show()

In [None]:
# Limits for RM under the mean -/+ 3 standard deviations rule, for comparison.
# They get their own names so the IQR-based limits read by the trimming cell are not
# overwritten (re-running that cell would otherwise trim with different limits).
RM_upper_limit_std, RM_lower_limit_std = find_boundaries(boston, 'RM', 3, rule="meanstddev")
RM_upper_limit_std, RM_lower_limit_std

## Performing winsorization

**Winsorization, or winsorizing, is the process of transforming the data by limiting the
extreme values, that is, the outliers, to a certain arbitrary value, closer to the mean of the distribution**. 


In [None]:
# Read the dataset again from the CSV so this section starts from the file contents.
boston = pd.read_csv("data/boston.csv")
boston.head()

In [None]:
import scipy.stats as stats

def diagnostic_plots(df, variable):
    """Draw a histogram, a Q-Q plot and a boxplot of one variable side by side.

    Parameters
    ----------
    df : dataframe holding the data.
    variable : name of the column in ``df`` to plot.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # one wide figure holding the three panels
    plt.figure(figsize=(16, 4))

    # histogram with a kernel density estimate overlaid
    plt.subplot(1, 3, 1)
    sns.histplot(df[variable], bins=30, kde=True)
    plt.title('Histogram')

    # Q-Q plot against the normal distribution
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    # label the axis with the variable actually plotted (it was hard-coded to 'RM')
    plt.ylabel('{} quantiles'.format(variable))

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')

    plt.show()

In [None]:
# Histogram, Q-Q plot and boxplot of RM before any capping.
diagnostic_plots(boston, 'RM')

In [None]:
# Histogram, Q-Q plot and boxplot of LSTAT before any capping.
diagnostic_plots(boston, 'LSTAT')

In [None]:
# Histogram, Q-Q plot and boxplot of CRIM before any capping.
diagnostic_plots(boston, 'CRIM')

In [None]:
from feature_engine.outliers import Winsorizer

# Cap RM, LSTAT and CRIM at their 5th and 95th percentiles.
# With capping_method='quantiles', `fold` is the percentile used for the caps. It is set
# explicitly because feature_engine releases that default `fold` to 3 reject that value
# for the quantiles method (it must lie between 0 and 0.20).
windsorizer = Winsorizer(capping_method='quantiles',
                          tail='both', # cap left, right or both tails
                          fold=0.05, # 5th percentile on the left, 95th on the right
                          variables=['RM', 'LSTAT', 'CRIM'])

# Learn the caps from the data.
windsorizer.fit(boston)

In [None]:
# Apply the fitted winsorizer to the data and keep the result under a separate name.
boston_t = windsorizer.transform(boston)

In [None]:
# Same diagnostics for RM, now on the transformed data.
diagnostic_plots(boston_t, 'RM')

In [None]:
# Inspect the minimum (left-tail) cap stored by the fitted winsorizer for each variable.
windsorizer.left_tail_caps_

In [None]:
# Inspect the maximum (right-tail) cap stored by the fitted winsorizer for each variable.
windsorizer.right_tail_caps_

In [None]:
# Same diagnostics for LSTAT, now on the transformed data.
diagnostic_plots(boston_t, 'LSTAT')

In [None]:
# Same diagnostics for CRIM, now on the transformed data.
diagnostic_plots(boston_t, 'CRIM')

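## Capping the variable at arbitrary maximum and minimum values

**Note:** the cells below do not use hand-picked limits; they use `Winsorizer` with `capping_method='gaussian'` and `fold=3`, so the caps are derived from each variable's mean and standard deviation.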

In [None]:
# Read the dataset again from the CSV so this section starts from the file contents.
boston = pd.read_csv("data/boston.csv")
boston.head()

In [None]:
from feature_engine.outliers import Winsorizer

# Configure capping for RM, LSTAT and CRIM on both tails with the gaussian method and fold=3.
windsorizer = Winsorizer(capping_method='gaussian', # 'gaussian' for roughly normal variables, 'iqr' for skewed ones
                          tail='both', # cap left, right or both tails
                          fold=3,
                          variables=['RM', 'LSTAT', 'CRIM'])

# Learn the caps from the data.
windsorizer.fit(boston)

In [None]:
# Apply the fitted winsorizer to the data and keep the result under a separate name.
boston_t = windsorizer.transform(boston)

In [None]:
# Inspect the minimum (left-tail) cap stored by the fitted winsorizer for each variable.
windsorizer.left_tail_caps_

In [None]:
# Inspect the maximum (right-tail) cap stored by the fitted winsorizer for each variable.
windsorizer.right_tail_caps_