In [1]:
import os
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from automl.analysis.ploting import AutoBins

In [2]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
data = pd.read_csv('./data/bank.csv')

In [4]:
def get_num_data(data, col):
    arr = data[~data[col].isna()][col]
    autobins = AutoBins()
    nbins = np.nanmin([
        autobins.get_len_step(arr),
        autobins.get_mean_diff_step(arr),
        autobins.get_power_step(arr)
    ])
    min_x = min(arr)
    max_x = max(arr)
    step = (max_x - min_x)/int(nbins) + 1
    
    return arr, step, nbins

In [5]:
def plot_numeric_histogram(data, num_cols: str, fig=None, width=700, height=450):
    if fig is None:
        fig = go.Figure()
    

    fig.add_trace(
        go.Histogram(x=data[num_cols[0]],
                     autobinx=False,
                     xbins=go.histogram.XBins(start=min(get_num_data(data, num_cols[0])[0]), end=max(get_num_data(data, num_cols[0])[0]), size=get_num_data(data, num_cols[0])[1]),
                     opacity=1,
                     hovertemplate="Bin range (%{x}): %{y}<extra></extra>",
                    )
    )

    
    fig.update_layout(
        title=dict(
                text=f'Numeric Feature',
                font=dict(size=22),
                y=0.99,
                x=0.00,
                xanchor='left',
                yanchor='top',
            ),
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell"),
        width=width,
        height=height,
        barmode='overlay',
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(238,238,238,1)',
        modebar=dict(
            bgcolor='rgba(0,0,0,0)', activecolor='rgba(68,68,68, 0.7)', color='rgba(68, 68, 68, 0.3)',
            remove=['zoom', 'zoom', 'lasso', 'select'],
        ),
    )

    fig.update_xaxes(title=num_cols[0])
    fig.update_yaxes(title='Counts', type='-', ticksuffix=' ')
    
    
    log_config = dict(title='Counts', type='log', showticklabels=True, dtick=1, tickformat="f", 
                      tickmode='auto', ticklabeloverflow='allow', ticksuffix=' ', mirror='allticks')
        
    buttons = []
    for col in num_cols:
        _button = dict(args=[{'type': 'histogram', 
                                 'x': [get_num_data(data, col)[0]], 
                                 'xbins': [go.histogram.XBins(start=min(get_num_data(data, col)[0]), end=max(get_num_data(data, col)[0]), size=get_num_data(data, col)[1])],
                                },
                                {'xaxis': {'title': str(col)},
                                }
                               ],
                       label=str(col), method='update',)
        buttons.append(_button)

    fig.add_annotation(text="Feature name: ", showarrow=False, font={'size':13},
                       x=0, xref="paper", y=1.16, yref="paper", align="left",)

    fig.update_layout(
        updatemenus=[
            dict(
                type = "dropdown",
                direction = "down",
                buttons=buttons,
                active=0,
                x=0.2, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(255,255,255,1)',
            ),
            dict(
                type='buttons',
                buttons=[dict(
                    args=[{'yaxis': dict(title='Counts', type='-', ticksuffix=' ')}], 
                    args2=[{'yaxis': log_config}], 
                    method='relayout',
                    label='log scale',
                )],
                active=0,
                x=0.4, xanchor="left",
                y=1, yanchor="top",
                pad={"r": 0, "t": -50},
                bgcolor='rgba(238,238,238,1)',
            ),
        ]
    )

    return fig

In [6]:
# fig = plot_numeric_histogram(data, num_cols=['age', 'balance', 'day', 'duration', 'campaign'], fig=None, width=700, height=450)
fig = plot_numeric_histogram(data.sample(5000), num_cols=['age', 'balance', 'day', 'duration', 'campaign'], fig=None, width=700, height=450)
fig.show(config={'displaylogo':False})
# fig.write_html('numeric_feature.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)

### Ploting with sampling

In [7]:
*_, nbins = get_num_data(data.sample(5000), 'age')
nbins

73

In [8]:
*_, nbins = get_num_data(data, 'age')
counts, _ = np.histogram(data['age'], bins=nbins)

*_, sample_nbins = get_num_data(data.sample(5000), 'age')
sample_counts, _ = np.histogram(data['age'].sample(5000), bins=sample_nbins)

weights = counts[np.argmax(counts)]/sample_counts[np.argmax(counts)]

In [9]:
counts, bin_edges = np.histogram(data['age'].sample(5000), bins=sample_nbins, weights=[weights]*5000)

fig = go.Figure()
fig.add_trace(
    go.Histogram(x=bin_edges, y=counts, histfunc='sum', nbinsx=len(bin_edges))
)

fig.show()
# fig.write_html('test_hist.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)

In [10]:
counts, bin_edges

(array([  84.24242424,   73.71212121,   52.65151515,  189.54545455,
         157.95454545,  358.03030303,  579.16666667,  968.78787879,
         958.25757576, 1221.51515152, 1253.10606061, 1863.86363636,
        2537.8030303 , 2558.86363636, 2221.89393939, 2211.36363636,
        1874.39393939, 1958.63636364, 1874.39393939, 1800.68181818,
        1916.51515152, 1569.01515152, 1621.66666667, 1611.13636364,
        1347.87878788, 1442.65151515, 1632.1969697 , 1358.40909091,
        1137.27272727, 1010.90909091, 1158.33333333, 1021.43939394,
        1063.56060606, 1042.5       , 1158.33333333,  842.42424242,
         916.13636364, 2042.87878788,  905.60606061,  905.60606061,
         768.71212121,  105.3030303 ,  105.3030303 ,  105.3030303 ,
         105.3030303 ,  136.89393939,  126.36363636,   63.18181818,
          42.12121212,   52.65151515,   73.71212121,   52.65151515,
          21.06060606,   52.65151515,   31.59090909,   10.53030303,
          42.12121212,   42.12121212,   42.12121

In [11]:
counts, bin_edges = np.histogram(data['age'].sample(5000), bins=76)
counts, bin_edges

(array([  1,   8,   2,   6,  14,  17,  30,  52,  86, 106, 107, 110, 204,
        226, 238,   0, 230, 231, 235, 201, 201, 141, 148, 138, 145, 150,
        138, 134, 135, 135,   0, 111, 100, 107, 113, 110,  91,  98,  88,
         88,  83,  88,  85,  79,  59,   0,  17,   9,   5,   9,   8,   7,
          6,   3,   5,   2,   4,   8,   5,   3,   0,   5,   2,  10,   4,
          3,   5,   2,   3,   3,   0,   0,   0,   1,   1,   1]),
 array([18.        , 18.93421053, 19.86842105, 20.80263158, 21.73684211,
        22.67105263, 23.60526316, 24.53947368, 25.47368421, 26.40789474,
        27.34210526, 28.27631579, 29.21052632, 30.14473684, 31.07894737,
        32.01315789, 32.94736842, 33.88157895, 34.81578947, 35.75      ,
        36.68421053, 37.61842105, 38.55263158, 39.48684211, 40.42105263,
        41.35526316, 42.28947368, 43.22368421, 44.15789474, 45.09210526,
        46.02631579, 46.96052632, 47.89473684, 48.82894737, 49.76315789,
        50.69736842, 51.63157895, 52.56578947, 53.5       ,