In [1]:
import os
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from utils import AutoBins

In [2]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
data = pd.read_csv('./data/bank.csv')

In [4]:
def get_num_data(data, col):
    arr = data[~data[col].isna()][col]
    autobins = AutoBins()
    nbins = np.nanmin([
        autobins.get_len_step(arr),
        autobins.get_mean_diff_step(arr),
        autobins.get_power_step(arr)
    ])
    min_x = min(arr)
    max_x = max(arr)
    step = (max_x - min_x)/int(nbins) + 1
    
    return arr, step, nbins

In [5]:
def plot_numeric_histogram(data, num_cols: str, fig=None, width=700, height=450):
    if fig is None:
        fig = go.Figure()
    

    fig.add_trace(
        go.Histogram(x=data[num_cols[0]],
                     autobinx=False,
                     xbins=go.histogram.XBins(start=min(get_num_data(data, num_cols[0])[0]), end=max(get_num_data(data, num_cols[0])[0]), size=get_num_data(data, num_cols[0])[1]),
                     opacity=1,
                     hovertemplate="Bin range (%{x}): %{y}<extra></extra>",
                    )
    )

    
    fig.update_layout(
        title=dict(
                text=f'Numeric Feature',
                font=dict(size=22),
                y=0.99,
                x=0.00,
                xanchor='left',
                yanchor='top',
            ),
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell"),
        width=width,
        height=height,
        barmode='overlay',
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,1)',
        modebar=dict(
            bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68, 0.7)', color='rgba(68, 68, 68, 0.3)',
            remove=['zoom', 'zoom', 'lasso', 'select'],
        ),
    )

    fig.update_xaxes(title=num_cols[0])
    fig.update_yaxes(title='Counts', type='-', ticksuffix=' ')
    
    
    log_config = dict(title='Counts', type='log', showticklabels=True, dtick=1, tickformat="f", 
                      tickmode='auto', ticklabeloverflow='allow', ticksuffix=' ', mirror='allticks')
        
    buttons = []
    for col in num_cols:
        _button = dict(args=[{'type': 'histogram', 
                                 'x': [get_num_data(data, col)[0]], 
                                 'xbins': [go.histogram.XBins(start=min(get_num_data(data, col)[0]), end=max(get_num_data(data, col)[0]), size=get_num_data(data, col)[1])],
                                },
                                {'xaxis': {'title': str(col)},
                                }
                               ],
                       label=str(col), method='update',)
        buttons.append(_button)

    fig.add_annotation(text="Feature name: ", showarrow=False, font={'size':13},
                       x=0, xref="paper", y=1.16, yref="paper", align="left",)

    fig.update_layout(
        updatemenus=[
            dict(
                type = "dropdown",
                direction = "down",
                buttons=buttons,
                active=0,
                x=0.2, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(255,255,255,1)',
            ),
            dict(
                type='buttons',
                buttons=[dict(
                    args=[{'yaxis': dict(title='Counts', type='-', ticksuffix=' ')}], 
                    args2=[{'yaxis': log_config}], 
                    method='relayout',
                    label='log scale',
                )],
                active=0,
                x=0.4, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(238,238,238,1)',
            ),
        ]
    )

    return fig

In [6]:
# fig = plot_numeric_histogram(data, num_cols=['age', 'balance', 'day', 'duration', 'campaign'], fig=None, width=700, height=450)
fig = plot_numeric_histogram(data.sample(5000), num_cols=['age', 'balance', 'day', 'duration', 'campaign'], fig=None, width=700, height=450)
fig.show(config={'displaylogo':False})
# fig.write_html('numeric_feature.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)

### Ploting with sampling

In [7]:
*_, nbins = get_num_data(data.sample(5000), 'age')
nbins

69

In [8]:
*_, nbins = get_num_data(data, 'age')
counts, _ = np.histogram(data['age'], bins=nbins)

*_, sample_nbins = get_num_data(data.sample(5000), 'age')
sample_counts, _ = np.histogram(data['age'].sample(5000), bins=sample_nbins)

weights = counts[np.argmax(counts)]/sample_counts[np.argmax(counts)]

In [9]:
counts, bin_edges = np.histogram(data['age'].sample(5000), bins=sample_nbins, weights=[weights]*5000)

fig = go.Figure()
fig.add_trace(
    go.Histogram(x=bin_edges, y=counts, histfunc='sum', nbinsx=len(bin_edges))
)

fig.show()
# fig.write_html('test_hist.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)

In [10]:
counts, bin_edges

(array([  93.35820896,  124.47761194,  134.85074627,  217.8358209 ,
         248.95522388,  674.25373134,  892.08955224,  881.71641791,
        1182.53731343, 1379.62686567, 2105.74626866, 2333.95522388,
        2614.02985075, 2603.65671642, 2167.98507463, 2043.50746269,
        2085.        , 2167.98507463, 1721.94029851, 1493.73134328,
        1431.49253731, 1535.2238806 , 1296.64179104, 1462.6119403 ,
        1089.17910448, 1421.11940299, 1421.11940299, 1120.29850746,
        1130.67164179, 1058.05970149, 1244.7761194 , 1006.19402985,
        1109.92537313,  809.10447761,  985.44776119, 1701.19402985,
         902.46268657,  933.58208955,  663.88059701,  736.49253731,
         228.20895522,  145.2238806 ,  114.10447761,  114.10447761,
          82.98507463,   62.23880597,   41.49253731,   41.49253731,
          51.86567164,   82.98507463,  103.73134328,  103.73134328,
          72.6119403 ,  114.10447761,   41.49253731,   20.74626866,
          41.49253731,   20.74626866,   20.74626

In [11]:
counts, bin_edges = np.histogram(data['age'].sample(5000), bins=76)
counts, bin_edges

(array([  1,   3,   4,   5,  20,  24,  24,  51,  80,  99, 104, 104, 205,
        215, 226, 217, 202, 217, 199, 187, 180, 153, 132, 138, 145, 133,
        144, 145, 136, 120, 108, 106, 115,  93,  84, 132,  86,  85, 101,
        111,  81,  88,  61,  17,   8,   6,  11,   7,   7,  14,   4,   4,
          3,   6,   9,   4,   3,   6,   3,   3,   5,   0,   4,   3,   2,
          2,   1,   0,   0,   0,   0,   2,   0,   0,   0,   2]),
 array([18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30.,
        31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43.,
        44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56.,
        57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
        70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80., 81., 82.,
        83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93., 94.]))