In [1]:
import pandas as pd
import numpy as np

In [3]:
import plotly
import plotly.graph_objs as go

In [12]:
def hist_density(arr, bins, offset = 0.0001):
    """Approximate the probability mass per bin with a normalised histogram.

    Parameters
    ----------
    arr : array-like of numbers
        Samples. Any shape is accepted; the values are flattened before counting.
    bins : array-like of numbers
        Strictly increasing bin edges (at least two).
    offset : float, optional
        Amount added to every edge before counting. np.histogram treats all
        but the last bin as half-open on the right, so shifting the edges
        slightly upwards lets a sample that equals an edge be counted in the
        bin that ends at that edge (a "weak" percentile).

    Returns
    -------
    numpy.ndarray
        Fraction of all samples that fall in each bin. The length is
        len(bins) - 1, or len(bins) when an extra leading bin has been
        prepended to cover samples below the first shifted edge. Samples
        above the last shifted edge are not counted in any bin but still
        count towards the denominator.

    Raises
    ------
    ValueError
        If `arr` or `bins` cannot be converted to a float array.
    AssertionError
        If `arr` is empty, fewer than two edges are given, or the edges are
        not strictly increasing.
    """
    # Fail loudly on unconvertible input instead of printing and carrying on
    # with the raw object, which only produced an unrelated error further down.
    try:
        # np.histogram counts over the flattened data, so flatten up front to
        # make the sample count used for normalisation match.
        arr = np.array(arr, dtype = np.float64).ravel()
    except (TypeError, ValueError) as err:
        raise ValueError("Invalid input array!") from err
    try:
        bins = np.array(bins, dtype = np.float64)
    except (TypeError, ValueError) as err:
        raise ValueError("Invalid bins!") from err
    assert arr.size > 0, "Empty array!"
    assert len(bins) > 1, "Invalid bins size!"
    assert (bins[:-1] < bins[1:]).all(), "bins are not monotonic increasing!"
    # offset is added to achieve weak percentile, since np.histogram excludes
    # the right boundary of each bin (except the last one).
    bins = bins + offset
    # Samples below the first edge get their own leading bin so that they are
    # not silently dropped from the result.
    lowest = arr.min()
    if lowest < bins.min():
        bins = np.append(lowest, bins)
    counts = np.histogram(arr, bins)[0]
    return counts.astype(np.float64) / arr.size

In [13]:
def trace_gen(data, bins, legend_name = ''):
    """Build a plotly Scatter trace of the empirical CDF of `data` over `bins`.

    The per-bin fractions from hist_density are accumulated into a running
    total and plotted against the bin edges; `legend_name` becomes the trace
    name and hovering shows the x and y values.
    """
    cumulative = hist_density(data, bins, offset = 0.0001).cumsum()
    # hist_density returns either len(bins) - 1 values or, when it prepends a
    # bin for samples below the first edge, len(bins) values. Skip as many
    # leading edges as needed so that x and y have the same length.
    skip = len(bins) - len(cumulative)
    return go.Scatter(
        x = bins[skip:],
        y = cumulative,
        name = legend_name,
        hoverinfo = 'x+y',
    )


def layout_gen(title, xtitle = '', ytitle = '', fontsize = 24, xfontsize = 18, yfontsize = 18, showlegend = False):
    """Assemble a layout dictionary for a figure.

    `title` / `fontsize` describe the figure title, `xtitle` / `xfontsize`
    and `ytitle` / `yfontsize` describe the two axis titles, and `showlegend`
    is stored as given. Returns a new plain dict on every call.
    """
    def _axis(label, size):
        # One axis entry: its title text plus the font size for that title.
        return {'title': label, 'titlefont': dict(size = size)}

    layout = dict(title = title, titlefont = dict(size = fontsize))
    layout['xaxis'] = _axis(xtitle, xfontsize)
    layout['yaxis'] = _axis(ytitle, yfontsize)
    layout['showlegend'] = showlegend
    return layout

def plot_gen(traces, layout, filename):
    """Write the given traces to a standalone HTML file via plotly's offline mode.

    Parameters
    ----------
    traces : a single trace, or a list/tuple of traces
        A lone trace is wrapped in a list; a list or tuple is used as the
        collection of traces.
    layout : dict or layout object
        Figure layout. When more than one trace is plotted the legend is
        switched on; for a dict layout this is done on a copy so the caller's
        dict is left unchanged.
    filename : str
        Output path without extension; '.html' is appended.
    """
    # isinstance (rather than an exact type check) so that a tuple of traces
    # is treated as several traces instead of being wrapped as a single one.
    if isinstance(traces, (list, tuple)):
        traces = list(traces)
    else:
        traces = [traces]
    if len(traces) > 1:
        if isinstance(layout, dict):
            # Override on a shallow copy: the caller may reuse the same layout
            # dict for a later single-trace figure.
            layout = dict(layout, showlegend = True)
        else:
            # Non-dict layout objects keep the original in-place behaviour.
            layout['showlegend'] = True
    plotly.offline.plot(dict(data = traces, layout = layout), show_link = False,
                        filename = filename + '.html')

In [17]:
# Draw from a locally seeded generator so that a fresh "Restart & Run All"
# reproduces the same figure without touching numpy's global random state.
rng = np.random.RandomState(42)
data1 = rng.randn(10000) # 10000 samples of the standard normal
data2 = 0.3*rng.randn(9000) + 1 # 9000 samples of N(mean = 1, sigma = 0.3)
bins = np.linspace(-5, 5, 1001) # 1001 edges, i.e. 1000 equal-width bins on [-5, 5]

trace1 = trace_gen(data1, bins, legend_name = 'mean = 0, sigma = 1')
trace2 = trace_gen(data2, bins, legend_name = 'mean = 1, sigma = 0.3')
traces = [trace1, trace2]

title = 'CDF of Two Normal Distributions'
xtitle = 'x values'
# The y value of a CDF is the share of samples that do not exceed x.
ytitle = 'Proportion no greater than x'
layout = layout_gen(title, xtitle, ytitle)

# plot_gen appends '.html' to this name.
filename = 'cdf-of-two-normal-distribution'
plot_gen(traces, layout, filename)