In [5]:
import math
import pandas as pd
import numpy as np

# Background

Let us start with a concrete example. Suppose we have a jar containing 100 marbles: 25 reds and 75 blues. If we randomly select 10 marbles, what is the probability that exactly 5 will be blue?

This is simply a counting problem: We count the number of ways of selecting 5 blues (from 75 blues) and 5 reds (from 25 reds), and divide this by the total number of ways of selecting 10 marbles (from 100 marbles).

#### Formally:

In [58]:
# P(exactly 5 blues in a 10-marble draw):
#   numerator   = (ways to pick 5 of the 25 reds) * (ways to pick 5 of the 75 blues)
#   denominator = ways to pick any 10 of the 100 marbles
(math.comb(25, 5) * math.comb(75, 5)) / math.comb(100, 10)

0.05297371447965936

For now, ignore the denominator: we only want to count the number of ways of getting a specific sample from a specific jar.

In [63]:
# Numerator only: ways to pick 5 of the 25 reds together with 5 of the 75 blues.
math.comb(25, 5) * math.comb(75, 5)

916991390700

In [65]:
'''
Given jar of N marbles containing B blues and R reds,
number of ways to get sample with b blues and r reds:

= [B choose b] * [R choose r]

= [B choose b] * [(N - B) choose (s - b)]

'''


# The jar from the Background section: 100 marbles, 75 of them blue.
N, B = 100, 75   # total marbles in jar, blues in jar

# The draw being counted: 10 marbles, exactly 5 of them blue.
s, b = 10, 5     # sample size, blues in sample



def ways_to_get_sample(jar_size, jar_blues, sample_size, sample_blues):
    """Count the distinct samples that have the requested blue/red split.

    Args:
        jar_size: Total number of marbles in the jar.
        jar_blues: Number of blue marbles in the jar; the rest are red.
        sample_size: Number of marbles drawn.
        sample_blues: Number of blue marbles in the draw; the rest are red.

    Returns:
        (jar_blues choose sample_blues) * (jar_reds choose sample_reds),
        as an int.

    Raises:
        ValueError: Propagated from ``math.comb`` when any of the four
            derived counts is negative.
    """
    jar_reds = jar_size - jar_blues
    sample_reds = sample_size - sample_blues

    # The blue picks and the red picks are independent, so the counts multiply.
    blue_picks = math.comb(jar_blues, sample_blues)
    red_picks = math.comb(jar_reds, sample_reds)
    return blue_picks * red_picks

# Same count as the hand-written expression above, now via the helper.
print(ways_to_get_sample(jar_size=N, jar_blues=B, sample_size=s, sample_blues=b))


916991390700


Now think in the opposite direction: the jar size N is known, but its composition (B blues, R reds) is not. We draw a random sample and observe b blues and r reds.

Concrete example: Suppose we get 5 blues, 5 reds.

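Some questions we can ask:
- How confident are we that the sample came from a jar with one particular number of blues B?
- How confident are we that B lies within a given interval?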

In [70]:
'''
Given random sample of 5 blues, 5 reds from N = 100,
our confidence that it came from the jar with B = 50:

= (Ways to get sample-10 with 5 blues from jar-50) / (Ways to get sample-10 with 5 blues from ALL jars)

This ratio treats every candidate jar as equally likely before the
sample is drawn.

Note the numerator is just our "ways_to_get_sample" function above.
So, now we just need to define the denominator function --
which won't have a "jar_blues" argument,
since we're summing over all its possible values.

'''
# Total marbles in jar.
N = 100

# Blues in the jar under test (the hypothesis whose confidence we compute).
B = 50

# Sample size.
s = 10

# Sample blues
b = 5



def all_ways_to_get_sample(jar_size, sample_size, sample_blues):
    """Total the ways to get the sample across every possible jar composition.

    Sums ``ways_to_get_sample`` over jar_blues = 0, 1, ..., jar_size, so the
    candidate jars follow ``jar_size`` instead of a hard-coded 1..100 range.

    Args:
        jar_size: Total number of marbles in the jar.
        sample_size: Number of marbles drawn.
        sample_blues: Number of blue marbles in the draw.

    Returns:
        The summed count as an int (0 when ``jar_size`` is negative, because
        there are then no candidate jars).

    Note:
        For 0 <= sample_blues <= sample_size this total equals
        ``math.comb(jar_size + 1, sample_size + 1)``, which is a handy
        cross-check.
    """
    # Include the all-red jar (0 blues) and stop at the all-blue jar, so that
    # jar_size - jar_blues never goes negative inside math.comb.
    return sum(
        ways_to_get_sample(jar_size, jar_blues, sample_size, sample_blues)
        for jar_blues in range(jar_size + 1)
    )


def confidence_came_from_jar_B(jar_size, jar_blues, sample_size, sample_blues):
    """Share of all ways to get the sample that belong to one specific jar.

    Args:
        jar_size: Total number of marbles in the jar.
        jar_blues: Number of blues in the jar being tested.
        sample_size: Number of marbles drawn.
        sample_blues: Number of blue marbles in the draw.

    Returns:
        ways(this jar) / ways(all jars), as a float.
    """
    from_this_jar = ways_to_get_sample(jar_size, jar_blues, sample_size, sample_blues)
    from_any_jar = all_ways_to_get_sample(jar_size, sample_size, sample_blues)
    return from_this_jar / from_any_jar

# Confidence that the observed sample came from the jar with exactly B blues.
print(confidence_came_from_jar_B(jar_size=N, jar_blues=B, sample_size=s, sample_blues=b))
                           


0.028244247608721643


In [82]:
'''
Given random sample b, r from N = 100,
our confidence it came from jar where L <= B <= U:

= (Ways to get sample-40 with 20 blues from jars in interval) / (Ways to get sample-40 with 20 blues from ALL jars)

'''

# Total marbles in jar.
N = 100

# Lower bound blues in jar (inclusive)
L = 41

# Upper bound blues in jar (inclusive)
U = 59

# Sample size.
s = 40

# Sample blues
b = 20


def interval_ways_to_get_sample(jar_size, jar_blues_min, jar_blues_max, sample_size, sample_blues):
    """Total the ways to get the sample over jars whose blue count is in a range.

    Args:
        jar_size: Total number of marbles in the jar.
        jar_blues_min: Smallest jar blue count to include.
        jar_blues_max: Largest jar blue count to include (inclusive).
        sample_size: Number of marbles drawn.
        sample_blues: Number of blue marbles in the draw.

    Returns:
        Sum of ``ways_to_get_sample`` over jar_blues_min..jar_blues_max as an
        int; 0 when the range is empty.
    """
    # +1 because the upper bound is part of the interval.
    candidate_blues = range(jar_blues_min, jar_blues_max + 1)
    return sum(
        ways_to_get_sample(jar_size, blues, sample_size, sample_blues)
        for blues in candidate_blues
    )
        


def confidence_came_from_jar_interval(jar_size, jar_blues_min, jar_blues_max, sample_size, sample_blues):
    """Share of all ways to get the sample that belong to jars in an interval.

    Args:
        jar_size: Total number of marbles in the jar.
        jar_blues_min: Smallest jar blue count in the interval.
        jar_blues_max: Largest jar blue count in the interval (inclusive).
        sample_size: Number of marbles drawn.
        sample_blues: Number of blue marbles in the draw.

    Returns:
        ways(jars in interval) / ways(all jars), as a float.
    """
    return (
        interval_ways_to_get_sample(jar_size, jar_blues_min, jar_blues_max, sample_size, sample_blues)
        / all_ways_to_get_sample(jar_size, sample_size, sample_blues)
    )

# Confidence that the jar's blue count lies in the inclusive interval [L, U].
print(
    confidence_came_from_jar_interval(
        jar_size=N,
        jar_blues_min=L,
        jar_blues_max=U,
        sample_size=s,
        sample_blues=b,
    )
)

0.8881600208203463
