In [1]:
import os
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from utils import AutoBins

In [2]:
# Initialise Plotly's offline notebook mode (connected=True) before any figure is rendered.
# NOTE(review): `iplot` is imported but not used in the visible cells.
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
# Load the bank CSV (path relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('./data/bank.csv')

### Outlier: numeric feature

### This plot shows
- The distribution of numeric feature.
- When values exceed the given criterion, a mask is added over the exceeding range and the affected bins change color.

#### Main trace
We need to plot two traces: one for the outlier data and one for the remaining data.
- Histogram
```python
trace = go.Histogram(
    x=bin_edges,
    y=counts,
    histfunc='sum',
    xbins=go.histogram.XBins(start=min(bin_edges), end=max(bin_edges), size=bin_width),
)
```

#### Layout: shape
Add a mask over the range where the data exceed the criterion.
```python
mask = dict(
    fillcolor= "rgba(246,178,107,0.5)",
    line={"width": 0},
    type="rect",
    layer='below',
    x0=x0, x1=x1, xref="x",
    y0=0, y1=1, yref="paper"
)
```
- layer: either `below` or `above`. We use `below` here so the mask does not hide the trace.

In [4]:
def sample_hist_data(data, col, max_row=5000):
    """Estimate histogram counts for ``col`` from a fixed-seed sample of rows.

    At most ``max_row`` rows are drawn without replacement (every row when the
    frame has ``max_row`` rows or fewer). The sample is histogrammed and the
    counts are scaled by ``len(data) / sample_size``.

    Returns
    -------
    tuple
        ``(scaled_counts, bin_edges, sample_bin_width)``.
    """
    n_rows = data.shape[0]
    sample_size = n_rows if n_rows <= max_row else max_row

    # Factor that scales sample counts back up to the size of the full frame.
    scale = len(data) / sample_size
    subset = data.sample(sample_size, replace=False, random_state=42)

    # NOTE(review): get_num_data is not defined in the visible cells; it is
    # assumed to return (<unused>, bin_width, nbins) -- confirm.
    _, width, n_bins = get_num_data(subset, col)
    counts, edges = np.histogram(subset[col], bins=n_bins)
    return counts * scale, edges, width

In [5]:
def sliding_mask(x0=None, x1=None):
    """Build a Plotly layout shape that shades the x-range ``[x0, x1]``.

    The rectangle is anchored to the x axis horizontally and to the paper
    vertically (0 to 1), has no border, and is placed on the ``below`` layer.
    """
    mask = {
        "fillcolor": "rgba(246,178,107,0.5)",
        "line": {"width": 0},
        "type": "rect",
        "layer": 'below',
        # Horizontal extent in data coordinates.
        "x0": x0,
        "x1": x1,
        "xref": "x",
        # Vertical extent in paper coordinates.
        "y0": 0,
        "y1": 1,
        "yref": "paper",
    }
    return mask

In [6]:
def plot_numeric_outlier(data, col: str, lower_bound: int = None, upper_bound: int = None, width=722, height=448):
    """Plot a log-scale histogram of a numeric column with outlier ranges highlighted.

    Values at or below ``lower_bound`` and at or above ``upper_bound`` are drawn
    as a separate red "outlier" trace stacked with the blue "data" trace, and
    each flagged range is shaded with a ``sliding_mask`` rectangle.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Column to plot; missing values are dropped first.
    lower_bound, upper_bound : number, optional
        Outlier thresholds (any real number works, not only ints). A bound
        that is ``None`` or lies outside the observed ``[min, max]`` range is
        ignored.
    width, height : int
        Figure size in pixels.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``col`` contains no non-missing values.
    """
    arr = data[col].dropna()
    if arr.empty:
        raise ValueError(f"Column {col!r} has no non-missing values to plot.")

    autobins = AutoBins()
    nbins = np.nanmin([
        autobins.get_len_step(arr),
        autobins.get_mean_diff_step(arr),
        autobins.get_power_step(arr)
    ])
    # np.nanmin yields a NumPy float whenever a candidate is a float/NaN, but
    # np.histogram only accepts an integer bin count. Normalise once (at least
    # one bin) and reuse the same value for every histogram and the bin width.
    nbins = max(int(nbins), 1)

    min_x = arr.min()
    max_x = arr.max()
    bin_width = (max_x - min_x) / nbins
    if bin_width <= 0:
        # All values are identical: fall back to a unit width so the traces
        # still get a positive bin size.
        bin_width = 1.0

    # A bound outside the observed range flags nothing, so treat it as unset.
    if lower_bound is not None and lower_bound < min_x:
        lower_bound = None
    if upper_bound is not None and upper_bound > max_x:
        upper_bound = None

    shapes = []
    main_arr = arr
    bound_arr = []
    if lower_bound is not None:
        shapes.append(sliding_mask(x0=min_x, x1=lower_bound))
        main_arr = main_arr[main_arr > lower_bound]
        bound_arr = list(arr[arr <= lower_bound])
    if upper_bound is not None:
        # Extend the mask one bin past the maximum so the last bar is covered.
        shapes.append(sliding_mask(x0=upper_bound, x1=max_x + bin_width))
        main_arr = main_arr[main_arr < upper_bound]
        bound_arr.extend(list(arr[arr >= upper_bound]))

    # Counts are pre-aggregated here; the Plotly traces below re-bin them with
    # histfunc='sum' on a grid of width `bin_width`.
    counts, bin_edges = np.histogram(main_arr, bins=nbins)

    fig = go.Figure()
    if len(bound_arr) > 0:
        bound_counts, bound_bin_edges = np.histogram(bound_arr, bins=nbins)
        # Bin edges are sorted, so the first/last entries are the min/max.
        _start = min(bin_edges[0], bound_bin_edges[0])
        _end = max(bin_edges[-1], bound_bin_edges[-1])
        fig.add_trace(
            go.Histogram(
                x=bound_bin_edges,
                y=bound_counts,
                histfunc='sum',
                marker=dict(color="rgba(236,78,78,0.8)"),
                xbins=go.histogram.XBins(start=_start, end=_end, size=bin_width),
                hovertemplate=col + " (%{x}): %{y} <extra></extra>",
                name="outlier",
            )
        )

    fig.add_trace(
        go.Histogram(
            x=bin_edges,
            y=counts,
            histfunc='sum',
            marker=dict(color="rgba(53,77,204,1)"),
            xbins=go.histogram.XBins(start=bin_edges[0], end=bin_edges[-1], size=bin_width),
            hovertemplate=col + " (%{x}): %{y} <extra></extra>",
            name="data",
        )
    )

    fig.update_layout(
        title=dict(
            text=f'Numeric outlier: {col}',
            font=dict(size=22),
            y=0.99,
            x=0.0,
            xanchor='left',
            yanchor='top',
        ),
        width=width,
        height=height,
        barmode='stack',
        shapes=shapes,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,1)',
        modebar=dict(
            bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68,0.7)', color='rgba(68,68,68,0.3)',
            remove=['zoom', 'lasso', 'select'],
        ),
    )

    fig.update_xaxes(title=col)
    fig.update_yaxes(
        title='Count',
        type='log',
        showticklabels=True,
        dtick=1,
        tickformat="f",
        tickmode='auto',
        ticklabeloverflow='allow',
        mirror='allticks',
    )

    return fig

In [7]:
# Demo: plot the 'balance' column with 80000 as the upper outlier threshold and no lower threshold.
fig = plot_numeric_outlier(data=data, col='balance', lower_bound=None, upper_bound=80000)
fig.show(config={'displaylogo': False})
# Optional: export the same figure as an HTML fragment instead of displaying it inline.
# fig.write_html('./automl_plot/outlier_numeric.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)