In [1]:
%matplotlib inline
from ipywidgets import interactive
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [31]:
from scipy.stats import beta

# Chapter 9

## Beta-distribution

Beta-distribution 
$$ \text{beta.pdf}(x, a, b) = \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)} x^{a-1}(1-x)^{b-1} $$
is conjugate to the binomial distribution
$$ \text{binom.pmf}(k, N, x) = C_N^k x^k (1-x)^{N-k} $$

In [2]:
# Relative path of the Excel workbook that the following cells read with pd.read_excel.
data_file = "data/Chapter-9-Beta-Dist-Example-for-three-industries-1.xlsx"

In [3]:
# Read a single data row from the first column only; the cell below renders
# its text as the notebook's description.
header = pd.read_excel(data_file, usecols=[0], nrows=1)

In [30]:
from IPython.display import Markdown as md

# NOTE(review): `fr` is not referenced by any visible cell and looks like a
# leftover from an unrelated notebook; kept in case a cell outside this view
# still reads it.
fr = 2  # GHz

# Render the description text from the sheet as Markdown, renaming the
# spreadsheet's “alpha”/“beta” labels to the bold a/b names used for the
# slider parameters further down.
# `.iloc[0, 0]` addresses the cell purely by position. The previous
# `.iloc[0][0]` used an integer key on a label-indexed Series, a positional
# fallback that pandas has deprecated.
md(header.iloc[0, 0]
   .replace("“alpha”", "**“a”**")
   .replace("“beta”", "**“b”**"))

This shows how the beta distribution could be used to compare breach frequencies based on a few breaches in an industry.  Data from 2014 to the end of 2015 is shown.  You can set **“a”** and **“b”** as shown in the book to reflect “hits” and “misses” (i.e., breaches and non-breaches per company per year) to see how the estimate of breach frequency will change with even a single new breach reported.  In rows 4 to 7 (in yellow) you can enter the start year of the breach data for an industry, the end year, the number of breaches in that period, and the number of organizations in the sample.  These assume the organizations are either randomly sampled from some industry or a complete census of the industry.  Data shown was based on a particular random sample and was accurate until the beginning of 2016.

In [12]:
# Skip the first two rows of the sheet, then read four data rows from
# columns 0-3, using column 0 as the row labels.
data = pd.read_excel(
    data_file,
    skiprows=2,
    nrows=4,
    usecols=[0, 1, 2, 3],
    index_col=0,
)

In [13]:
# Swap rows and columns so each industry becomes a row.
# NOTE: this rebinds `data` to its own transpose, so running the cell a second
# time flips the table back; re-run the loading cell first if that happens.
data = data.transpose()

In [15]:
# Number of years between the start and end of each industry's data.
data['duration'] = data['Data up until year'].sub(data['Data since year'])

In [16]:
# Display the per-industry table, including the derived `duration` column.
data

Unnamed: 0,Data since year,Data up until year,Breaches since then,Companies in Population,duration
Healthcare,2014,2016,2,38,2
Retail,2014,2016,3,98,2
Finance,2014,2016,2,60,2


## Healthcare as an example

In [22]:
# Pull the Healthcare row out as a Series keyed by column name.
hc = data.loc['Healthcare', :]

In [23]:
# Display the Healthcare row selected above.
hc

Data since year            2014
Data up until year         2016
Breaches since then           2
Companies in Population      38
duration                      2
Name: Healthcare, dtype: int64

In [38]:
def beta_plot(a=1, b=1):
    """Plot the beta density of the Healthcare breach rate for a Beta(a, b) prior.

    The prior is updated with the breach counts in the module-level ``hc`` row:
    every company-year (companies * duration) is treated as one trial, each
    recorded breach is a hit, and the remaining company-years are misses. The
    plotted curve is ``beta.pdf(x, a + hits, b + misses)``.

    Parameters
    ----------
    a : int or float, default 1
        Prior first shape parameter of the beta distribution ("hits").
    b : int or float, default 1
        Prior second shape parameter of the beta distribution ("misses").

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    hits = hc['Breaches since then']
    company_years = hc['Companies in Population'] * hc['duration']
    # Misses are the trials WITHOUT a breach. Using the full company-year count
    # here would count every breach as both a hit and a miss.
    misses = company_years - hits

    x = np.linspace(0, 0.22, 100)
    y = beta.pdf(x, a + hits, b + misses)

    # Same figure number and current-axes lookup as a plain plt.plot call,
    # but with explicit handles so the axes can be labelled.
    fig = plt.figure(2)
    ax = fig.gca()
    ax.plot(x, y)
    ax.set_ylim(0, y.max() + 1)
    ax.set_xlabel('Breach frequency (per company per year)')
    ax.set_ylabel('Probability density')
    ax.set_title('Healthcare: Beta(a + hits, b + misses)')
    plt.show()


# Build one integer slider per prior parameter: a in 1..5 and b in 1..30, step 1.
interactive_plot = interactive(beta_plot, a=(1, 5, 1), b=(1, 30, 1))
# The last child of the interactive container is taken as the plot's output
# area; giving it a fixed height keeps the layout from jumping on redraw.
output = interactive_plot.children[-1]
output.layout.height = '350px'
interactive_plot

interactive(children=(IntSlider(value=1, description='a', max=5, min=1), IntSlider(value=1, description='b', m…