# Lecture 6
**The main material is included in the slides.**

We will go through the lecture slides first. Then we will run through the demos in reverse order.

- von Neumann accept and reject method
- Cumulative Distribution Function 
- Linear Congruential Random Number Generator
- Coin Toss and Binomial Distribution
- Poisson Distribution
- Gaussian Distribution and Central Limit Theorem

# von Neumann accept and reject method

My customized PDF is defined as
$$ f(x) = \frac{x sin(3x)}{ \int_{0}^{1} x sin(3x) dx}$$
the denominator is needed so that the PDF is normalized to 1

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def f(x):
    """Unnormalized target density x*sin(3.14*x) used by the accept/reject demo.

    Works element-wise on numpy arrays and on plain scalars.
    NOTE(review): the lecture text writes sin(3x) while this code uses 3.14
    (roughly pi) -- confirm which one is intended.
    """
    angle = 3.14*x
    return x*np.sin(angle)
# Evaluate the unnormalized target on a fine, uniformly spaced grid over [0, 1]
x=np.linspace(0,1,1000000)
y=f(x)

#Integrate over x to get the Area Under the Curve
# sum( y_i * delta_x ) over the 1000000 grid points
# delta_x = 1e-6

AUC = np.sum(y)/1000000
print('Area under the Curve is %4.3f'% AUC)
# Dividing by AUC turns the curve into a PDF with unit area
plt.plot(x,y/AUC)
plt.xlabel('random variable x')
plt.ylabel('probability / bin width = 1e-6')
# Raw string: "\i" inside a normal string literal is an invalid escape
# sequence; the text that is drawn is unchanged
plt.text(0.01,1.7,r"$ f(x) = \frac{x sin(3x)}{ \int_{0}^{1} x sin(3x) dx}$",fontsize=20)
print('The maximum value on the curve after normalized to AUC %4.3f'%np.max(y/AUC))

The h(x) is a uniform distribution between 0 and 1. Two h(x) functions are defined, one for numpy array inputs and another for scalar input

In [None]:
def hx(x):
    """Proposal density h(x) for array input: the constant 1 at every point.

    Returns an array of ones with the same shape and dtype as ``x``.
    """
    flat_density = np.ones_like(x)
    return flat_density

def hx_scalar(x):
    """Proposal density h(x) for a single scalar ``x``: always the integer 1."""
    density = 1
    return density

# at each point the hx value is 1, so that when integrate over 1e6 bins
# with bin width = 1e-6, the area under the curve is 1.

In [None]:
# Evaluate h(x) on the grid and print its sum and its maximum
y2 = hx(x)
print(y2.sum())
print(y2.max())

# Fold the constant C into the proposal:
# from here on y2 holds C*h(x)
C_factor = 2
y2 = y2*C_factor

In [None]:
# Overlay the scaled proposal C*h(x) (dashed) and the normalized target (solid black)
plt.plot(x,y2,linestyle='dashed', label="$Ch(x)$")
plt.plot(x,y/AUC,linestyle='solid',color='black',label="Target PDF")
plt.xlabel('random variable x')
plt.ylabel('probability / bin width = 1e-6')
# The legend shows the labels given to the two curves above
plt.legend()

In [None]:
# Now let's perform the sampling
# an empty list data is created
# a for loop with 1e5 iterations is used
# in each iteration, the first thing is to
# draw two random numbers, x from h(x), u from uniform
# my h(x) was conveniently chosen to be uniform
data=[]
n_grid = len(x)
for i in range(100000):
    # Pick a grid point uniformly at random. randint excludes its upper
    # bound, so the index runs over 0 ... n_grid-1 and the first grid
    # point is no longer skipped.
    r_from_hx = x[np.random.randint(0,n_grid)]
    u = np.random.random()
    # So I draw two uniform numbers
    # the one for r_from_hx is a bin index, since
    # my original x sequence is uniformly spaced between
    # 0 and 1 with 1e6 steps

    # Sanity check, commented out when not needed
    #print(u*C_factor*hx_scalar(r_from_hx), f(r_from_hx)/AUC)

    # Accept/reject condition: keep the candidate when u*C*h(x) < f(x).
    # C and h(x) are taken from C_factor and hx_scalar instead of the
    # literals 2 and 1, so the test follows any change made to them.
    if u*C_factor*hx_scalar(r_from_hx) < f(r_from_hx)/AUC:
        data.append(r_from_hx)

# Convert the accepted values to a numpy array
data=np.array(data)

In [None]:
# Normalized histogram (100 bins on [0, 1]) of the values kept by the accept/reject loop
plt.hist(data,bins=100,density = 1, range=(0,1),label="Generated Random Distribution")
# Overlay the normalized target curve for comparison
plt.plot(x,y/AUC,label="Original PDF")
plt.legend()
plt.ylabel("Fraction of entries")
plt.xlabel("Random variable x")

# Cumulative Distribution Function
$$F(a) = \int_{-\infty}^{a} f(x)dx$$

In [None]:
# Create a numpy random number generator; no seed is passed here
rng = np.random.default_rng()


In [None]:
# 10000 normally distributed numbers with mean 125 and standard deviation 2
dataG = rng.normal(125,2,size=10000)

In [None]:
# Two stacked panels: the histogram on top, its cumulative distribution below.
# Draw directly on the axes returned by plt.subplots instead of re-selecting
# them with plt.subplot.
fig, axs = plt.subplots(2,1,figsize=(8,6), gridspec_kw={'height_ratios': [1, 1]})
ax_pdf, ax_cdf = axs

bincontent, bins, others = ax_pdf.hist(dataG,density=True, bins=200, range=(115,135))
ax_pdf.set_xlabel('Normally distributed variable x')
ax_pdf.set_ylabel('Probability/Bin Width (= 0.1)')
ax_pdf.set_yscale('log')
print(bins.size)

# CDF value for each bin: running sum of the bin contents divided by their
# total (one cumulative pass instead of re-summing a slice for every bin)
y = np.cumsum(bincontent)/np.sum(bincontent)
# Plot each value at the centre of its bin
x = 0.5*(bins[:-1]+bins[1:])

ax_cdf.plot(x,y)
# Raw string: "\i" inside a normal string literal is an invalid escape sequence
ax_cdf.text(125,0.15,r"$F(a) = \int_{-\infty}^{a} f(x)dx$",fontsize=24)
ax_cdf.set_xlabel('Normally distributed variable x')
ax_cdf.set_ylabel('Cumulative Distribution Function')

# Linear Congruential Random Number Generator

In [None]:
def xnplus1(xn, a, b , c):
    """Return the next term of a linear congruential sequence.

    Computes ``a*xn + b`` and, if that is not already below ``c``, reduces
    it modulo ``c``.

    Parameters
    ----------
    xn : current term of the sequence
    a : multiplier
    b : increment
    c : modulus; must be positive

    Returns
    -------
    ``a*xn + b`` unchanged when it is smaller than ``c`` (this includes
    negative values), otherwise the remainder of ``a*xn + b`` divided by ``c``.
    For integer arguments this is the same value that subtracting ``c``
    repeatedly would give.
    """
    var = a*xn+b
    if var < c:
        return var
    # A single modulo replaces the subtract-c-until-small loop, which needed
    # about var/c iterations (tens of thousands per call for RANDU).
    return var % c

In [None]:
# RANDU parameters: multiplier 65539, increment 0, modulus 2**31
a, b, c = 65539, 0, 2**31

# Start from the seed 3 and append 9999 further terms (10000 values in total);
# each new term is computed from the last one in the list
x = [3]
for _ in range(1, 10000):
    x.append(xnplus1(x[-1], a, b, c))

# Divide every term by the modulus
xnormalized = np.array(x)/c
    

In [None]:
# Histogram of the normalized sequence, using matplotlib's default binning
plt.hist(xnormalized)

# Coin Toss on computer

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Draw 10 integers that are each either 0 or 1:
# the upper bound 2 is exclusive, so 2 is never produced
data = np.random.randint(low=0, high=2, size=10)
print(data)


In [None]:
# The lower bound may be negative: 10 integers from -10 up to (not including) 2
data = np.random.randint(low=-10, high=2, size=10)
print(data)

In [None]:
# Binary outcomes stored in a 2-D numpy array with shape (3, 10)
data = np.random.randint(0, 2, size=(3, 10))
print(data)

Repeat the cell below multiple times and see the variations in the outcome

In [None]:
# 5000 random integers, each either 0 or 1
data = np.random.randint(0, 2, size=5000)

# Because every entry is either 1 or 0,
# the sum of the array is the number of 1's
n_ones = np.sum(data)

plt.hist(data, bins=2, range=(0, 2))
plt.xticks([0, 1, 2])
plt.xlim(0, 2)
plt.xlabel('binary random number')
plt.ylabel('entries')
plt.text(0.5, 1000, 'Number of outcomes with \n value 1 = %d' % n_ones, color='white', fontsize=16)

print('Number of outcomes with value 1 = %d' % n_ones)



In [None]:
# Look at a 40-entry slice of data (starting at index n)
# and count how many of its outcomes are 1
n = 50
subset = data[n:n+40]

# Entries are only 1 or 0, so their sum is the number of 1's
subset_ones = np.sum(subset)

plt.hist(subset, bins=2, range=(0, 2))
plt.xticks([0, 1, 2])
plt.xlim(0, 2)
plt.xlabel('binary random number')
plt.ylabel('entries')
plt.text(0.5, 10, 'Number of outcomes with \n value 1 = %d' % subset_ones, color='white', fontsize=16)

print('Number of outcomes with value 1 = %d' % subset_ones)

# Binomial probability distribution 

Let's run an experiment on computer. In this experiment, the outcome is either 0 or 1. For each experiment, the probability of observing 1 is $p, (p<1)$. The experiment is repeated $N$ times. We want to see the probability distribution of observing $r$ times of outcome 1.

We derive the binomial probability distribution function in three steps. 
- there are ${N \choose r}$ orderings for observing outcome 1 $r$ times out of $N$ experiments
- for each one of the ${N \choose r}$ orderings, the probability of observing $r$ times outcome 1, out of $N$ experiments, when the probability of observing 1 for single experiment is $p$, is given by:
$$ p^{r}(1-p)^{(N-r)} $$
- finally, the binomial distribution is the product of these two numbers: 1) the number of all possible ordering and 2) the probability of observing $r$ times 1 out of $N$ experiments

$$ f(r,N) = {N \choose r} p^{r}(1-p)^{(N-r)} $$

$$        = \frac{N!}{r!(N-r)!} p^{r}(1-p)^{(N-r)}$$

# Number of ups in the experiment that consists of 40 coin tosses

Now I want to define a new experiment that consists of 40 coin tosses. The experiment has one outcome that is the number of ups. Then I repeat this experiment 1000 times, and I want to see the distribution of the outcome. 

This should be a binomial distribution with a mean of 20, and a standard deviation of $\sqrt{Np(1-p)}$ = $\sqrt{(40\times1/2\times(1-1/2))}$ = 3.16

In [None]:
# Binary random numbers in a 2-D array:
# axis 0 has 1000 entries and axis 1 has 40 entries
data = np.random.randint(0, 2, size=(1000, 40))
print(data.shape)

In [None]:
# Summing along axis 1 collapses each row into one number:
# the number of ups in that experiment
# (one experiment per entry along axis 0)
data = data.sum(axis=1)
print(data.shape)


In [None]:
# This cell plots the 40-toss experiment, so the axis label and the binomial
# reference values are derived from 40 tosses (the label used to say 100).
tosses_per_experiment = 40
p_up = 0.5
binomial_mean = tosses_per_experiment*p_up
binomial_std = np.sqrt(tosses_per_experiment*p_up*(1-p_up))

plt.hist(data,bins=tosses_per_experiment,range=(0,tosses_per_experiment))
plt.xlabel("Number of ups out of %d tosses" % tosses_per_experiment)
plt.ylabel('Entries')
print('Mean of data %4.2f' %data.mean())
print('Standard Deviation of data %4.2f' %data.std())
# Left column: values measured from the generated data
plt.text(1,110,'Observed')
plt.text(1,100, 'Mean: %4.2f'%data.mean())
plt.text(1,80, 'Std. dev.: %4.2f'%data.std())

# Right column: binomial expectation, N*p and sqrt(N*p*(1-p))
plt.text(30,110,'Binomial')
plt.text(30,100, 'Mean: %4.2f'% binomial_mean)
plt.text(30,80, 'Std. dev.: %4.2f'% binomial_std)


## New experiment (1 experiment = 100 tosses, Repeat N times)

In [None]:
# Now, in each experiment, there are 100 tosses
# Mean = 50
# variance = N_toss*0.5*0.5 = 25
# Std = sqrt(variance) = 5

N = 1000
N_toss = 100

mean = 50
variance = 25
std = 5
threshold = 51

# One row per experiment and one column per toss;
# summing a row gives the number of ups in that experiment
data=np.random.randint(0,2,(N,N_toss))
data=np.sum(data,axis=1)

# Number of experiments with at least `threshold` ups
Ntrial = np.count_nonzero(data >= threshold)

# The mean is printed once (it used to be printed twice)
print('Mean of data %4.2f' %data.mean())
print('Standard Deviation of data %4.2f' %data.std())

plt.figure(0)

bincontent, binedges, others = plt.hist(data,bins=N_toss,range=(0,N_toss), color='green')
# Place the annotations relative to the tallest bin
peak = np.max(bincontent)
plt.text(10,peak*0.8, 'Mean %4.2f'%data.mean() )
plt.text(10,peak*0.7, 'Std. dev. %4.2f'%data.std() )
plt.text(10,peak*0.6, 'Number of entries %d'%N)

# Axis range and label follow N_toss instead of repeating the literal 100
plt.xlim(0,N_toss)
plt.xlabel('Number of ups out of %d tosses' % N_toss)
plt.ylabel('Number of entries')

print('Number of trials where there were at least %d ups = %d'% (threshold, Ntrial) )


# plt.figure(1)
# plt.hist(data,bins=N_toss,range=(N_toss*0.6/2, N_toss*1.4/2))

# Poisson distribution 

In [None]:
#poisson

# np.random.default_rng(seed) creates a random number generator
# that is initialized with the given seed

rng = np.random.default_rng(seed=1)

# Five Poisson numbers (mean 50) from numpy's global random state,
# followed by five from the seeded generator
global_draw = np.random.poisson(lam=50, size=5)
print(global_draw)
seeded_draw = rng.poisson(lam=50, size=5)
print(seeded_draw)


In [None]:
# Compare the output of the lines below
shared_seed = 1243
rng2 = np.random.default_rng(shared_seed)
print(rng2.poisson(50, size=5))
rng3 = np.random.default_rng(shared_seed)
print(rng3.poisson(50, size=5))
# The two printouts are identical:
# two separate generators (rng2 and rng3) that start from the same seed
# produce the same random sequence

# Re-create rng3 with a different seed and draw again
rng3 = np.random.default_rng(1243243)
print(rng3.poisson(50, size=5))



In [None]:
# Generate Poisson random numbers with a mean of 50
# Store them into a 2D array of shape (4,5)
# (kept as the last expression of the cell so the notebook displays the array)
rng.poisson(50,(4,5))

## Poisson outcome (rerun the cell and see the variation)

In [None]:
mean = 50
threshold = 51
Nsize=10000
# expected std = sqrt(50) = 7.07
dataP=np.random.poisson(mean,size=Nsize)

# Count the outcomes at or above the threshold BEFORE printing, so the
# printed number comes from this Poisson sample and not from a value of
# Ntrial left behind by another cell.
Ntrial = np.count_nonzero(dataP >= threshold)

print('Mean of data %4.2f' %dataP.mean())
print('Std. dev. of data %4.2f' %dataP.std())

print('Number of trials where the outcome is at least as large as %d  = %d'% (threshold, Ntrial) )

plt.figure(0)
# 100 unit-width bins on (0, 100), written out here so the cell does not
# depend on N_toss from the coin-toss cell
bincontent, binedges, others = plt.hist(dataP,bins=100,range=(0,100))
plt.xlabel('Outcome')
plt.ylabel('Entries')
# Left column: values measured from the sample
plt.text(10, 0.9*np.max(bincontent),'Observed')
plt.text(10, 0.8*np.max(bincontent),'Mean %4.2f'%dataP.mean())
plt.text(10, 0.7*np.max(bincontent),'Std. Dev. %4.2f'%dataP.std())

# Right column: Poisson expectation, mean and sqrt(mean)
plt.text(65, 0.9*np.max(bincontent),'Poisson')
plt.text(65, 0.8*np.max(bincontent),'Mean %4.2f'%mean)
plt.text(65, 0.7*np.max(bincontent),'Std. Dev. %4.2f'%np.sqrt(mean))

In [None]:
#gaussian
# Gaussian sample with mean 50 and standard deviation sqrt(50).
# mean is set here so the cell does not rely on a value left by another cell.
mean = 50
dataG = np.random.normal(mean,np.sqrt(mean),size=10000)
bincontent, binedges, others = plt.hist(dataG,bins=100,range=(0,100))
plt.xlabel('Outcome')
plt.ylabel('Entries')
# Left column: values measured from the Gaussian sample plotted here
# (these used to be read from the Poisson array dataP)
plt.text(10, 0.9*np.max(bincontent),'Observed')
plt.text(10, 0.8*np.max(bincontent),'Mean %4.2f'%dataG.mean())
plt.text(10, 0.7*np.max(bincontent),'Std. Dev. %4.2f'%dataG.std())

# Right column: the parameters the sample was generated with
plt.text(65, 0.9*np.max(bincontent),'Gaussian')
plt.text(65, 0.8*np.max(bincontent),'Mean %4.2f'%mean)
plt.text(65, 0.7*np.max(bincontent),'Std. Dev. %4.2f'%np.sqrt(mean))

## Compare Gaussian and Poisson with the same mean (run with a few different means)

In [None]:
# Poisson and Gaussian samples that share one mean and one variance

mean = 400
Nbins = 40
variance = mean
sigma = np.sqrt(variance)
# Histogram window: four sigma on either side of the mean
plotrange = (mean - 4*sigma, mean + 4*sigma)
samplesize = 100000

dataP = np.random.poisson(mean, size=samplesize)
dataG = np.random.normal(mean, np.sqrt(variance), size=samplesize)

# Poisson as a solid black outline, Gaussian as a dashed red outline
bincontent, bins, others = plt.hist(
    dataP, bins=Nbins, range=plotrange,
    histtype='step', color='black', label='Poisson random numbers')
plt.hist(
    dataG, bins=Nbins, range=plotrange,
    histtype='step', linestyle='dashed', color='red', label='Gaussian random numbers')
plt.xlabel('random number value')
plt.ylabel('Entries')

# Tallest Poisson bin: sets the y range and the heights of the annotations
peak = np.max(bincontent)
plt.ylim(0.1, 1.5*peak)
plt.legend()

# One annotation column per sample: title, measured mean, measured std. dev.
for x_text, title, values in (
        (mean - 3.5*sigma, 'Poisson', dataP),
        (mean + 0.95*sigma, 'Gaussian', dataG)):
    plt.text(x_text, 0.9*peak, title)
    plt.text(x_text, 0.8*peak, 'Mean %4.2f' % values.mean())
    plt.text(x_text, 0.7*peak, 'Std. Dev. %4.2f' % values.std())

### Reminder: how to use numpy methods to get standard deviation, variance and mean

In [None]:
# The array methods below give the same numbers as
# np.std(dataP), np.var(dataP) and np.mean(dataP)

# standard deviation
dataP_std = dataP.std()

# variance
dataP_var = dataP.var()

# mean
dataP_mean = dataP.mean()

# The second and third printed numbers (variance and std squared) can be compared
print(dataP_std, dataP_var, dataP_std**2, dataP_mean)
print(dataG.std(), dataG.var(), dataG.std()**2, dataG.mean())

# Verify Central Limit Theorem

CLT claims that if a variable $X$ is the sum of **$N$** random numbers that are independently and identically drawn from a probability distribution, then $X$ follows a Gaussian distribution when $N$ is large

$$X_N = \sum_{i=1}^{N} x_i$$

where $ x_i $ is drawn from $ f(x)$ which has a mean of $\mu$ and a std. dev of $\sigma$

The distribution of $X_N/N$ should have a mean of $\mu$ and a std. dev. of $\sigma/\sqrt{N}$


The random number is drawn from an artificially constructed sample, which includes two parts. The part 1 is a set of 100,000 uniformly distributed random numbers between 0 and 1, and the part 2 is a set of 100,000 Gaussian numbers with a mean of 2, and a sigma of 0.5.

The total size of this sample (numpy array) is 200,000.

To draw a random number from this sample, I randomly select the index of an entry of the numpy array by np.random.randint(200000), which gives me an integer somewhere between 0 and 199,999 (the upper bound 200,000 is excluded)  
To build $X_N$, I draw 6 random numbers, i.e., I have $X_6$

In [None]:
# Part 1: 100,000 uniformly distributed numbers
part1 = np.random.random(size=100000)
# Part 2: 100,000 Gaussian numbers with mean 2 and sigma 0.5
part2 = np.random.normal(loc=2, scale=0.5, size=100000)
# Join the two parts end to end into one 1-D array of 200,000 entries
sample = np.concatenate((part1, part2))

In [None]:

# Normalized histogram of the combined sample: 40 bins on (0, 4)
plt.hist(sample,bins=40,range=(0,4),density=True)
plt.xlabel('x')
plt.ylabel('fraction of entries')
# Annotate with the mean and standard deviation of the whole sample
plt.text(2.0,0.5, "Mean %4.2f"% np.mean(sample))
plt.text(2.0,0.45, "Std. Dev. %4.2f"% np.std(sample))


In [None]:
# Constructing X_6 = (1/6)*sum{ six randomly selected x}
n_terms = 6
x=[]
for i in range(3000):
    # Six random positions in `sample`, drawn as one index array. This keeps
    # the loop from overwriting the globals a, b, c and the function f used
    # by other cells, and follows the actual length of `sample`.
    picks = np.random.randint(len(sample), size=n_terms)
    sum_over_n = np.sum(sample[picks])/n_terms
    x.append(sum_over_n)

In [None]:
# Histogram of the 3000 values of X_6: 30 bins on (0, 3)
bincontent,binedges,others=plt.hist(x,bins=30,range=(0,3))
plt.xlabel('$X_6 = (x_1 + x_2 + x_3 + x_4 + x_5 + x_6)$/6')
plt.ylabel('Entries')

# Measured mean and standard deviation of X_6
plt.text(1.9, np.max(bincontent)*0.9, "Mean %4.2f"% np.mean(x))
plt.text(1.9, np.max(bincontent)*0.8, "Std. Dev. %4.2f"% np.std(x))

# Reference values from the parent sample: its mean, and its std. dev. / sqrt(6)
plt.text(1.9, np.max(bincontent)*0.7, "Mean of $x_i$ %4.2f"% np.mean(sample))
# Raw string: "\s" inside a normal string literal is an invalid escape sequence
plt.text(1.9, np.max(bincontent)*0.6, r"Std. Dev. of $x_i$ / $\sqrt{6}$ = %4.2f"% np.std(sample/np.sqrt(6)))


# Scripts to generate plots used in lectures

In [None]:
# Poisson sample with a small mean, drawn as a normalized red step histogram
mean=3.2
data=np.random.poisson(mean,size=10000)
# 12 unit-width bins on (0, 12)
plt.hist(data,bins=12,range=(0,12),density=True,histtype='step',color='red')
plt.xlim(0,12)
plt.ylim(0,0.3)
plt.ylabel('Probability',fontsize=24)
plt.xlabel('Outcome',fontsize=24)
#plt.text(6,0.25,'Mean = 3.2',fontsize=24)

In [None]:
# Poisson sample with a large mean, drawn as a normalized red step histogram
mean=500
data=np.random.poisson(mean,size=10000)
# 200 unit-width bins on (400, 600)
plt.hist(data,bins=200,range=(400,600),density=True,histtype='step',color='red')
#plt.xlim(0,12)
plt.ylim(0.001,0.03)
plt.ylabel('Probability',fontsize=24)
plt.xlabel('Outcome',fontsize=24)
# Switch the y axis to a logarithmic scale
plt.yscale('log')
plt.text(500,0.02175,'Mean = 500',fontsize=24)

In [None]:
def factorial(n):
    """Return n! for a non-negative integer ``n``.

    Delegates to ``math.factorial``, which returns an exact Python integer,
    so the result does not overflow when ``n`` is a numpy integer.

    Raises
    ------
    ValueError
        If ``n`` is negative (the hand-written loop used to return ``n``
        itself in that case).
    """
    import math

    return math.factorial(n)
    

# Cauchy distribution

In [None]:
# Two normal samples of 1000 numbers each, both with mean 0:
# a has standard deviation 5, b has standard deviation 30
a = np.random.normal(loc=0, scale=5, size=1000)
b = np.random.normal(loc=0, scale=30, size=1000)

In [None]:
# Histogram of the element-wise ratio a/b: 50 bins restricted to (-1, 1)
plt.hist(a/b,bins=50,range=(-1,1))