In [3]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

In [5]:
def create_population(prior_disease_prob, n, false_positive_rate=0.05):
    """Build a synthetic tested population and return its pivot of counts.

    Every person with the disease tests positive. Among people without the
    disease, a fraction `false_positive_rate` test positive and the rest test
    negative.

    Parameters
    ----------
    prior_disease_prob : float
        Fraction of the population that has the disease.
    n : int
        Total number of people in the population.
    false_positive_rate : float, optional
        Fraction of disease-free people who nevertheless test positive.
        Defaults to 0.05.

    Returns
    -------
    Table
        The result of pivoting 'Test Result' against 'Status': one row per
        status and one count column per test result.
    """
    disease = round(n * prior_disease_prob)
    # Take the complement by subtraction so the two groups always total n.
    # Rounding each share independently can drop people (n=5 with a prior of
    # 0.5 rounds to 2 + 2).
    no_disease = round(n) - disease

    false_positives = round(no_disease * false_positive_rate)
    # Same reasoning: the remainder is whatever is left, which guarantees that
    # the 'Test Result' column has exactly as many entries as 'Status'.
    true_negatives = no_disease - false_positives

    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    result = np.array(
        ['Test +'] * (disease + false_positives) + ['Test -'] * true_negatives
    )

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

## New material

**Poll questions on this example first**

### Here's a scenario (college course)

Here is the data for this example. Verify that the data matches the description in the question statements.

In [6]:
# Synthetic class roster: 100 students, 60% second-years and 40% third-years.
n = 100
second, third = round(n * 0.6), round(n * 0.4)

# One 'Year' label per student, second-years first.
year = np.repeat(['Second', 'Third'], [second, third])

# Majors follow the same ordering as `year`: half of the second-years have
# declared, and 80% of the third-years have.
major = np.repeat(
    ['Declared', 'Undeclared', 'Declared', 'Undeclared'],
    [round(second * 0.5), round(second * 0.5), round(third * 0.8), round(third * 0.2)],
)

students = Table().with_columns('Year', year, 'Major', major)

In [8]:
# Preview just the first three rows instead of the whole table.
students.show(3)

Year,Major
Second,Declared
Second,Declared
Second,Declared


In [9]:
# Cross-tabulate the roster: one row per Year, one count column per Major value.
students.pivot('Major', 'Year')

Year,Declared,Undeclared
Second,30,30
Third,32,8


Given that a student has declared a major, which is more likely: that they are a second-year or a third-year?

$\mathbb{P}(\text{Third|Declared}) = \frac{32}{62}$

In [12]:
# 32 declared third-years out of 30 + 32 = 62 declared students.
32/62

0.5161290322580645

$\mathbb{P}({\text{Second|Declared}}) = \frac{30}{62}$

In [13]:
# 30 declared second-years out of 30 + 32 = 62 declared students.
30/62

0.4838709677419355

### What do we do after we've classified a point?

Add the point to the training set!

In [14]:
# Fold the newly classified student (a declared third-year) back into the data.
# This reassigns `students`, so re-running the cell appends the row again.
students = students.with_row(make_array('Third', 'Declared'))

In [15]:
# Re-tabulate now that the extra declared third-year has been appended.
students.pivot('Major', 'Year')

Year,Declared,Undeclared
Second,30,30
Third,33,8


$\mathbb{P}(\text{Third|Declared}) = \frac{33}{63}$

**STOP**

### Here's a second scenario (Doctors and clinical tests)

Create a population of size 10,000 in which the prevalence of the disease is $\frac{1}{1000}$. The function builds the dataset and then creates the pivot table for us.

**Challenge Question**: What are the dimensions (rows and columns) of the dataset from which the pivot table was created?

In [16]:
# Prior of 1 in 1000 applied to a population of 10,000 people.
create_population(1/1000, 10000)

Status,Test +,Test -
Disease,10,0
No disease,500,9490


The probability we calculated, $\mathbb{P}(\text{Disease|Test +})$, is $\frac{10}{510}$

In [17]:
# 10 diseased people who tested positive out of 10 + 500 = 510 positive tests.
10/510

0.0196078431372549

### Changing the prior can change our classification

$$\mathbb{P}(\text{Disease|Test +}) = \frac{\mathbb{P}(\text{Disease})\,\mathbb{P}(\text{Test +|Disease})}{\mathbb{P}(\text{Disease})\,\mathbb{P}(\text{Test +|Disease}) + \mathbb{P}(\text{No Disease})\,\mathbb{P}(\text{Test +|No Disease})}$$

#### "Assume a patient is selected at random"

In [None]:
# Bayes' rule with the numbers plugged in: prior 0.001, P(Test + | Disease) = 1,
# P(Test + | No disease) = 0.05.
(0.001 * 1) / (0.001*1 + 0.999*0.05)

In [None]:
# Prior probability of disease when the patient is picked at random: 1 in 1000.
random_selection_prior = 1/1000

In [None]:
# Posterior P(Disease | Test +) by Bayes' rule: the factor 1 is P(Test + | Disease)
# and 0.05 is P(Test + | No disease).
(random_selection_prior * 1) / (random_selection_prior*1 + (1-random_selection_prior)*0.05)

In [None]:
# Tabulate a population of 10,000 people under the random-selection prior.
create_population(random_selection_prior, 10000)

In [18]:
# P(Disease | Test +) from counts: 10 diseased positives out of 10 + 500 positive tests.
10/510

0.0196078431372549

#### One doctor's prior

In [19]:
# This doctor's prior probability of disease before seeing the test: 100 in 1000.
one_doctors_prior = 100/1000

In [None]:
# Posterior P(Disease | Test +) by Bayes' rule: the factor 1 is P(Test + | Disease)
# and 0.05 is P(Test + | No disease).
(one_doctors_prior * 1) / (one_doctors_prior*1 + (1-one_doctors_prior)*0.05)

In [20]:
# Tabulate 10,000 people under this doctor's prior, using the named value
# rather than repeating the literal 100/1000.
create_population(one_doctors_prior, 10000)

Status,Test +,Test -
Disease,1000,0
No disease,450,8550


In [21]:
# 1000 diseased positives out of 1000 + 450 = 1450 positive tests.
1000/1450

0.6896551724137931

#### Another doctor's prior

In [22]:
# A second doctor's prior probability of disease before the test: 500 in 1000.
another_doctors_prior = 500/1000

In [None]:
# Posterior P(Disease | Test +) by Bayes' rule: the factor 1 is P(Test + | Disease)
# and 0.05 is P(Test + | No disease).
(another_doctors_prior * 1) / (another_doctors_prior*1 + (1-another_doctors_prior)*0.05)

In [23]:
# Tabulate 10,000 people under the second doctor's prior, using the named
# value rather than repeating the literal 0.5.
create_population(another_doctors_prior, 10000)

Status,Test +,Test -
Disease,5000,0
No disease,250,4750


In [24]:
# 5000 diseased positives out of 5000 + 250 = 5250 positive tests.
5000/5250

0.9523809523809523

In [28]:
def abs_sum(a,b,c,d,e):
    """Return the sum of the absolute values of the five arguments.

    The result is never negative and is zero only when every argument is zero.
    """
    # All five inputs contribute, and each contributes its absolute value.
    return abs(a) + abs(b) + abs(c) + abs(d) + abs(e)

In [29]:
# Search for the five arguments that minimize abs_sum, starting from the given
# initial guess (`minimize` presumably comes from the `datascience` star import).
minimize(abs_sum, start = make_array(10,-10,25,-25,5))

array([  1.83269508e-12,   1.48260591e-12,   2.96621654e-12,
         4.18967339e-12,  -1.71182514e-12])