In [1]:
import os
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from utils import AutoBins

In [2]:
# Initialise plotly's offline notebook mode so figures can render inline.
# Per plotly's documentation, connected=True loads plotly.js from the online
# CDN instead of embedding it in the notebook.
# NOTE(review): `iplot` is imported here but not used in the visible cells.
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
# Load the dataset that the plotting cells below read from; the path is
# relative to the notebook's working directory.
data = pd.read_csv('./data/bank.csv')

### Numeric feature histogram

### This plot shows
- Histogram of numeric feature.
- Use two buttons. One for feature selection, and the other for log scale.
- Show each bin's range and count when hovering over it.

#### Main traces
- Histogram
```python
trace = go.Histogram(
    x=x_data,
    autobinx=False,
    xbins=go.histogram.XBins(start=min(x_data), end=max(x_data)+bin_width, size=bin_width),
    hovertemplate="Bin range (%{x}): %{y}<extra></extra>",
)
```

#### Layout: button
Use button to update data and layout of the plot.
- fig.update_layout: pass a list of dictionaries to `updatemenus`, one dictionary per menu.
- A single dictionary in updatemenus:
    * type: "dropdown" or "buttons".
    * direction: the direction in which the buttons are laid out. "left", "right", "up" and "down".
    * buttons: a list or a tuple of dicts of properties or instances of Button
        * method: it has three methods, **restyle**, **relayout** and **update**.
            - restyle: only update data in trace.
            - relayout: only update layout of the plot.
            - update: update data and layout at the same time.
        * args: a list or tuple of up to 3 elements. Note that **the x data and y data should be given in a list**. If `type` in data properties is not provided, the trace will follow the main trace's type.
            - restyle method: `[{data properties}]`
            - relayout method: `[{layout properties}]`
            - update method: `[{data properties}, {layout properties}]`
          
        * args2: the same as args. Use this to create toggle button.
        * label: the name shown on the button.
        * active: determines which button is considered active.

**log scale** toggle button example:  
This button aims to change the scale of yaxis between type "-" and type "log". We need to create a toggle button and only need to update the layout of the plot; hence, we use **relayout** method.
```python
dict(
    type="buttons",
    buttons=[
    dict(
        args=[{'yaxis': origin_config}],
        args2=[{'yaxis': log_config}], # create a toggle button
        method='relayout',
        label='log scale'
    )],
    active=0, # it means yaxis with type "-" is active
)
```

**feature selection** dropdown button example:  
This button is to select different feature and show its histogram. We need to update data and layout(xaxis label) at the same time; hence, we use **update** method.
```python
dict(
    type="dropdown",
    direction="down",
    buttons=[
    dict(
        args=[{'type': 'histogram', 'x':[...], 'y':[...], 'histfunc': 'sum', 'xbins': ..., ...}, {'xaxis':{...}}],
        method='update',
        label=feature_name,
    )],
    active=0, # it means the first feature in the dropdown is selected
)
```

In [4]:
def get_num_data(data, col):
    """Return the non-null values of ``data[col]`` together with its binning.

    The bin count is the smallest of the three estimates produced by
    ``AutoBins`` (``get_len_step``, ``get_mean_diff_step`` and
    ``get_power_step``); NaN estimates are skipped by ``np.nanmin``.
    NOTE(review): assumes each ``AutoBins`` method returns a candidate number
    of bins -- confirm against ``utils.AutoBins``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature.
    col : str
        Name of the numeric column to bin.

    Returns
    -------
    arr : pandas.Series
        Values of ``data[col]`` with missing entries removed.
    bin_width : float
        Width of one bin, ``(max - min) / nbins``; falls back to ``1`` when
        that width is not strictly positive (e.g. a constant column).
    nbins : int
        Number of bins, always at least 1.
    """
    arr = data[col].dropna()
    autobins = AutoBins()
    nbins = np.nanmin([
        autobins.get_len_step(arr),
        autobins.get_mean_diff_step(arr),
        autobins.get_power_step(arr)
    ])
    # Cast once so the width and the returned count are based on the same
    # integer (np.histogram rejects float bin counts), and never allow 0 bins.
    nbins = max(int(nbins), 1)

    min_x = arr.min()
    max_x = arr.max()
    bin_width = (max_x - min_x) / nbins
    # A zero (constant column) or undefined width cannot be used as a
    # histogram bin size, so fall back to a unit-width bin.
    if not bin_width > 0:
        bin_width = 1

    return arr, bin_width, nbins

In [5]:
def plot_numeric_histogram(data, num_cols: list, fig=None, width=700, height=450):
    """Build an interactive histogram for a set of numeric features.

    The figure shows the histogram of the first feature in ``num_cols`` and
    carries two controls: a dropdown that swaps the plotted feature (trace data
    plus x-axis title) and a toggle button that switches the y-axis between
    linear and log scale.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features.
    num_cols : list of str
        Names of the numeric columns offered in the dropdown; the first one is
        plotted initially.
    fig : plotly.graph_objects.Figure, optional
        Figure to draw on. A new figure is created when ``None``.
    width, height : int
        Figure size passed to the layout.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with the histogram trace, annotation and update menus.
    """
    if fig is None:
        fig = go.Figure()

    # Bin every feature exactly once; the result feeds both the initial trace
    # and the dropdown buttons.
    binned = []
    for col in num_cols:
        values, bin_width, _ = get_num_data(data, col)
        bin_spec = dict(start=values.min(), end=values.max() + bin_width, size=bin_width)
        binned.append((col, values, bin_spec))

    first_col, first_values, first_bin_spec = binned[0]
    fig.add_trace(
        go.Histogram(
            x=first_values,
            autobinx=False,
            xbins=go.histogram.XBins(**first_bin_spec),
            hovertemplate="Bin range (%{x}): %{y}<extra></extra>",
        )
    )

    fig.update_layout(
        title=dict(
            text='Numeric Feature',
            font=dict(size=22),
            y=0.99,
            x=0.00,
            xanchor='left',
            yanchor='top',
        ),
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell"),
        width=width,
        height=height,
        barmode='overlay',
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,1)',
        modebar=dict(
            bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68, 0.7)', color='rgba(68,68,68,0.3)',
            remove=['zoom', 'lasso', 'select'],
        ),
    )

    # Axis configurations used initially and by the log-scale toggle.
    linear_config = dict(title='Counts', type='-', ticksuffix=' ')
    log_config = dict(title='log(Counts)', type='log', showticklabels=True, dtick=1, tickformat="f",
                      tickmode='auto', ticklabeloverflow='allow', ticksuffix=' ', mirror='allticks')

    fig.update_xaxes(title=first_col)
    fig.update_yaxes(**linear_config)

    # One dropdown entry per feature; trace properties are wrapped in a list
    # because button args address traces by position.
    buttons = [
        dict(
            args=[
                {'type': 'histogram',
                 'x': [values],
                 'xbins': [go.histogram.XBins(**bin_spec)],
                 },
                {'xaxis': {'title': str(col)}},
            ],
            label=str(col),
            method='update',
        )
        for col, values, bin_spec in binned
    ]

    fig.add_annotation(text="Feature name: ", showarrow=False, font={'size': 13},
                       x=0, xref="paper", y=1.16, yref="paper", align="left",)

    fig.update_layout(
        updatemenus=[
            dict(
                type="dropdown",
                direction="down",
                buttons=buttons,
                active=0,
                x=0.2, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(255,255,255,1)',
            ),
            dict(
                type='buttons',
                buttons=[dict(
                    args=[{'yaxis': linear_config}],
                    args2=[{'yaxis': log_config}],  # second click toggles to log scale
                    method='relayout',
                    label='log scale',
                )],
                active=0,
                x=0.4, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(238,238,238,1)',
            ),
        ]
    )

    return fig

In [6]:
# Plot a sample of at most 5000 rows. The sample size is capped at the frame
# length so small datasets do not raise, and the seed is fixed so re-running
# the notebook reproduces the same figure.
fig = plot_numeric_histogram(data.sample(min(5000, len(data)), random_state=42), num_cols=['age', 'balance', 'day', 'duration', 'campaign'], fig=None, width=700, height=450)
fig.show(config={'displaylogo':False})
# fig.write_html('numeric_feature.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)

### Numeric feature histogram with sampling 

### Sample data and compute its histogram
- Let `max_row` be the maximum number of rows to sample for the plot.
- Use `np.histogram` to compute the count of each bin (count) and the bin edges (bin_edges).  
    * The number of bins is calculated with `AutoBins`.
- Scale each bin's count by a weight, $\text{weight} = \frac{\text{# data}}{\text{max_row}}$

### This plot shows
- Histogram of numeric feature.
- Use two buttons. One for feature selection, and the other for log scale.
- Show each bin's range and count when hovering over it.

#### Main traces
- Histogram
```python
trace = go.Histogram(
    x=bin_edges,
    y=count,
    histfunc='sum',
    xbins=go.histogram.XBins(start=min(bin_edges), end=max(bin_edges), size=bin_width),
    customdata=customdata,
    hovertemplate="Bin range (%{customdata[0]:.3f} - %{customdata[1]:.3f}) : %{y} <extra></extra>"
)
```
    - Customize hovertemplate  
    `hovertemplate` defines the text showing on the hover.  
    `customdata` defines data that `hovertemplate` receives. Its format is 
    ```python
    [(bin_edge[0], bin_edge[1]), (bin_edge[1], bin_edge[2]), ..., (bin_edge[n-1], bin_edge[n])]
    ```

In [7]:
def sample_hist_data(data, col, max_row=5000):
    """Estimate the histogram of ``data[col]`` from a fixed-seed row sample.

    At most ``max_row`` rows are drawn without replacement (``random_state=42``),
    the sample is binned with ``np.histogram`` and the counts are scaled by
    ``len(data) / sample size`` to approximate counts for the full frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature.
    col : str
        Name of the numeric column to bin.
    max_row : int, default 5000
        Upper bound on the number of sampled rows; capped at ``len(data)``.

    Returns
    -------
    counts : numpy.ndarray
        Sample counts per bin multiplied by the sampling weight.
    bin_edges : numpy.ndarray
        Bin edges returned by ``np.histogram`` (one more entry than ``counts``).
    bin_width : float
        Bin width reported by ``get_num_data`` for the sampled column.
    """
    n_sample = min(max_row, data.shape[0])
    weight = len(data) / n_sample

    sampled_data = data.sample(n_sample, replace=False, random_state=42)
    # Use the NaN-free values returned by get_num_data: np.histogram cannot
    # derive a finite range when the column still contains missing values.
    values, sample_bin_width, sample_nbins = get_num_data(sampled_data, col)
    # np.histogram only accepts an integer bin count.
    sample_counts, bin_edges = np.histogram(values, bins=int(sample_nbins))
    return sample_counts * weight, bin_edges, sample_bin_width

In [8]:
def plot_sampling_numeric_histogram(data, num_cols: list, fig=None, width=700, height=450, max_row=5000):
    """Build an interactive histogram from pre-computed, sample-based counts.

    Each feature is binned by ``sample_hist_data`` and drawn as a histogram
    trace with ``histfunc='sum'``: one point per bin carrying that bin's
    (scaled) count. Points are placed at the bin centres rather than on the
    bin edges, so there is exactly one ``x`` per count and floating-point
    rounding at a bin boundary cannot push a count into the neighbouring bin.
    Hover text shows the bin's edges via ``customdata``.

    The figure carries two controls: a dropdown that swaps the plotted feature
    and a toggle button that switches the y-axis between linear and log scale.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features.
    num_cols : list of str
        Names of the numeric columns offered in the dropdown; the first one is
        plotted initially.
    fig : plotly.graph_objects.Figure, optional
        Figure to draw on. A new figure is created when ``None``.
    width, height : int
        Figure size passed to the layout.
    max_row : int, default 5000
        Forwarded to ``sample_hist_data`` as the sampling cap.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure with the histogram trace, annotation and update menus.
    """
    if fig is None:
        fig = go.Figure()

    # Compute the sampled histogram of every feature exactly once; the result
    # feeds both the initial trace and the dropdown buttons.
    binned = []
    for col in num_cols:
        counts, edges, bin_width = sample_hist_data(data, col, max_row=max_row)
        binned.append(dict(
            col=col,
            x=(edges[:-1] + edges[1:]) / 2,  # bin centres, one per count
            y=counts,
            bin_spec=dict(start=edges.min(), end=edges.max(), size=bin_width),
            customdata=list(zip(edges[:-1], edges[1:])),  # (left edge, right edge) per bin
        ))

    first = binned[0]
    fig.add_trace(
        go.Histogram(
            x=first['x'],
            y=first['y'],
            histfunc='sum',
            xbins=go.histogram.XBins(**first['bin_spec']),
            customdata=first['customdata'],
            hovertemplate="Bin range (%{customdata[0]:.3f} - %{customdata[1]:.3f}) : %{y} <extra></extra>"
        )
    )

    fig.update_layout(
        title=dict(
            text='Numeric Feature',
            font=dict(size=22),
            y=0.99,
            x=0.00,
            xanchor='left',
            yanchor='top',
        ),
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell"),
        width=width,
        height=height,
        barmode='overlay',
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,1)',
        modebar=dict(
            bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68, 0.7)', color='rgba(68, 68, 68, 0.3)',
            remove=['zoom', 'lasso', 'select'],
        ),
    )

    # Axis configurations used initially and by the log-scale toggle.
    linear_config = dict(title='Counts', type='-', ticksuffix=' ')
    log_config = dict(title='log(Counts)', type='log', showticklabels=True, dtick=1, tickformat="f",
                      tickmode='auto', ticklabeloverflow='allow', ticksuffix=' ', mirror='allticks')

    fig.update_xaxes(title=first['col'])
    fig.update_yaxes(**linear_config)

    # One dropdown entry per feature; trace properties are wrapped in a list
    # because button args address traces by position.
    buttons = [
        dict(
            args=[
                {'type': 'histogram',
                 'x': [entry['x']],
                 'y': [entry['y']],
                 'histfunc': ['sum'],
                 'xbins': [go.histogram.XBins(**entry['bin_spec'])],
                 'customdata': [entry['customdata']],
                 },
                {'xaxis': {'title': str(entry['col'])}}
            ],
            label=str(entry['col']),
            method='update',
        )
        for entry in binned
    ]

    fig.add_annotation(text="Feature name: ", showarrow=False, font={'size': 13},
                       x=0, xref="paper", y=1.16, yref="paper", align="left",)

    fig.update_layout(
        updatemenus=[
            dict(
                type="dropdown",
                direction="down",
                buttons=buttons,
                active=0,
                x=0.2, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(255,255,255,1)',
            ),
            dict(
                type='buttons',
                buttons=[dict(
                    args=[{'yaxis': linear_config}],
                    args2=[{'yaxis': log_config}],  # second click toggles to log scale
                    method='relayout',
                    label='log scale',
                )],
                active=0,
                x=0.4, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(238,238,238,1)',
            ),
        ]
    )

    return fig

In [9]:
# Draw the sampling-based histogram for the selected numeric features and
# display it without the plotly logo in the mode bar.
fig = plot_sampling_numeric_histogram(
    data,
    num_cols=['age', 'balance', 'day', 'duration', 'campaign'],
    fig=None,
    width=700,
    height=450,
    max_row=50000,
)
fig.show(config={'displaylogo': False})
# fig.write_html('./example_plots/numeric_feature.html', config={'displaylogo':False}, include_plotlyjs='cdn', full_html=False)