# Read data using numpy

In [9]:
import csv 
header = []
data = []

with open('data/final.csv', 'r') as datafile:
    csv_read = csv.reader(datafile)
    #csv_read = csv.reader(datafile, delimiter=' ')  # Specify space as delimite)
    header = next(csv_read)

    data = [row for row in csv_read]
    # Parse the data and convert each entry to a float
    #data = [[float(value) for value in row] for row in csv_read]
head = data[:5]
print(head)
data_array = np.array(data)
num_rows, num_columns = data_array.shape
print(num_rows)
print(num_columns)

# Dependence & Independence

In [10]:
import random
def calculate_probabilities():
    # Probabilities of the individual events
    P_A = 1 / 6  # Probability of rolling a 6
    P_B = 1 / 2  # Probability of flipping "Heads"

    # Combined probability of both events happening (independent)
    P_A_and_B = P_A * P_B

    print(f"P(A): Probability of rolling a 6 = {P_A}")
    print(f"P(B): Probability of flipping 'Heads' = {P_B}")
    print(f"P(A ∩ B): Combined probability (independence assumption) = {P_A_and_B}")

    # Simulate to verify
    print("\nSimulation to verify independence:")
    total_trials = 100000
    count_A = 0
    count_B = 0
    count_A_and_B = 0

    for _ in range(total_trials):
        die_roll = random.randint(1, 6)
        coin_flip = random.choice(["Heads", "Tails"])

        if die_roll == 6:
            count_A += 1
        if coin_flip == "Heads":
            count_B += 1
        if die_roll == 6 and coin_flip == "Heads":
            count_A_and_B += 1

    # Estimated probabilities from simulation
    simulated_P_A = count_A / total_trials
    simulated_P_B = count_B / total_trials
    simulated_P_A_and_B = count_A_and_B / total_trials

    print(f"Simulated P(A): {simulated_P_A}")
    print(f"Simulated P(B): {simulated_P_B}")
    print(f"Simulated P(A ∩ B): {simulated_P_A_and_B}")
    print(f"Check: P(A) * P(B) ≈ {simulated_P_A * simulated_P_B}")

if __name__ == "__main__":
    calculate_probabilities()


P(A): Probability of rolling a 6 = 0.16666666666666666
P(B): Probability of flipping 'Heads' = 0.5
P(A ∩ B): Combined probability (independence assumption) = 0.08333333333333333

Simulation to verify independence:
Simulated P(A): 0.1686
Simulated P(B): 0.49761
Simulated P(A ∩ B): 0.08427
Check: P(A) * P(B) ≈ 0.083897046


# Binomial distribution

In [11]:
# Hint the PMF of N is p_N(k) where p_N is
from scipy.special import binom as binomial
p = 6/10 # probability 
p_N = lambda k: binomial(10,k)*((1-p)**(10-k))*((p)**k)

# Empirical mean 

In [12]:
from random import randint
def X():
    """Produces a single random number from DeMoivre(1/3,1/3,1/3)"""
    return randint(0,2)

def empirical_mean(n=1):
    """Produces the empirical mean of n experiments of the X above"""
    Z = [X() for i in range(n)]
    return sum(Z)/n

# Chebychev's Inequality
Consider an i.i.d. sequence of random variables $X_1,\ldots,X_n$ each being Bernoulli($1/2$). Then the concept of concentration is telling us that

$$
    P\left ( \left | \frac{1}{n} \sum_{i=1}^n X_i - \mathbb{E}(X_i) \right | > \epsilon \right )
$$

gets smaller as $n$ gets larger. For instance, using Chebychevs inequality we get

$$
    P\left ( \left | \frac{1}{n} \sum_{i=1}^n X_i - \mathbb{E}(X_i) \right | > \epsilon \right ) \leq \frac{\mathbb{V}\left( \frac{1}{n} \sum_{i=1}^n X_i \right )}{\epsilon^2} = \frac{\mathbb{V}\left( X_0 \right )}{\epsilon^2 n}
$$


In [13]:
def chebyshev_bound(k, n, mean=0, std_dev=1):
    """
    Computes the Chebyshev bound for a given k, n, mean, and standard deviation.
    
    Parameters:
    - k: Threshold for deviation from the mean
    - n: Number of samples
    - mean: The mean of the distribution (default is 0)
    - std_dev: The standard deviation of the distribution (default is 1)
    
    Returns:
    - Bound: The Chebyshev bound for the probability that the deviation exceeds k.
    """
    # Chebyshev's inequality: P(|X - mean| >= k * std_dev) <= 1 / k^2
    bound = 1 / (k**2 * n)
    return bound

# Example usage:
k = 2        # Threshold (number of standard deviations)
n = 1000     # Number of samples
mean = 0     # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution

bound = chebyshev_bound(k, n, mean, std_dev)
print(f"Chebyshev bound for P(|X - {mean}| >= {k} * {std_dev}): {bound:.5f}")


Chebyshev bound for P(|X - 0| >= 2 * 1): 0.00025


# Hoeffding's Bound
$$
P\left ( \left | \frac{1}{n} \sum_{i=1}^n X_i - \mathbb{E}(X_i) \right | > \epsilon \right ) \leq \frac{\mathbb{V}\left( \frac{1}{n} \sum_{i=1}^n X_i \right )}{\epsilon^2} = \frac{\mathbb{V}\left( X_0 \right )}{\epsilon^2 n}
$$


In [14]:
import numpy as np

def hoeffding_bound(epsilon, n, a, b):
    """
    Computes Hoeffding's bound for the probability that the sample mean 
    deviates from the expected value by more than epsilon.
    
    Parameters:
    - epsilon: The deviation threshold.
    - n: The number of samples.
    - a: The lower bound of the random variable.
    - b: The upper bound of the random variable.
    
    Returns:
    - The Hoeffding bound on the probability.
    """
    # Hoeffding's inequality formula: 2 * exp(-2 * n * epsilon^2 / (b - a)^2)
    bound = 2 * np.exp(-2 * n * epsilon**2 / (b - a)**2)
    return bound

# Example usage:
epsilon = 0.1      # Threshold for deviation
n = 1000           # Number of samples
a = 0              # Lower bound of the random variable
b = 1              # Upper bound of the random variable

bound = hoeffding_bound(epsilon, n, a, b)
print(f"Hoeffding's bound for P(|X_bar - E(X)| > {epsilon}): {bound:.5f}")


Hoeffding's bound for P(|X_bar - E(X)| > 0.1): 0.00000


We can use concentration as a measure of confidence in the following way. Consider $X_1,\ldots, X_n$ being i.i.d. sequence of Bernoulli($p$) for some unknown $p$. From the concept of concentration, we would expect that if we have many observations ($n$ large) we could use the empirical mean of the observations as a guess, but note that there is some variability as we saw in the above simulations. So what do we do? We use the concentration inequality to get information how far we can deviate from $p$ in the following way

$$
    P(\bar X_n - \mathbb{E}(\bar X_n) \geq \epsilon) \leq e^{-2n\epsilon^2}
$$

Since $\mathbb{E}(\bar X_n) = p$, rearrange and get

$$
    P(p \leq \bar X_n - \epsilon) \leq e^{-2n\epsilon^2}
$$

The complementary event thus satisfies

$$
    P(\bar X_n - \epsilon < p) \geq 1-e^{-2n\epsilon^2}
$$

We can do the same for the other side (see lecture notes) and we get

$$
    P(\bar X_n - \epsilon < p < \bar X_n + \epsilon) \geq 1-2 e^{-2n\epsilon^2}.
$$

If you where now asked to estimate $p$ using $n$ observations and give an interval where you with at least 95% confidence can say contains $p$, then you need to choose $\epsilon > 0$ such that

$$
    1-2 e^{-2n\epsilon^2} \geq 0.95.
$$

Smaller $\epsilon$ gives smaller intervals, so lets choose to have the smallest possible $\epsilon$ while still obaying the inequality above, i.e. we choose $\epsilon$ to solve

$$
    1-2 e^{-2n\epsilon^2} = 0.95.
$$

Rearranging we and taking log and then square root we obtain

$$
    \epsilon = \sqrt{-\frac{1}{2n}\ln\left(\frac{1-0.95}{2}\right)}
$$

In [15]:
import numpy as np

def compute_epsilon(n, confidence=0.95):
    """
    To estimate p with 95.0% confidence, choose epsilon = 0.02899

    Computes the value of epsilon for a given number of samples n and confidence level.
    
    Parameters:
    - n: The number of samples.
    - confidence: The desired confidence level (default is 0.95).
    
    Returns:
    - epsilon: The value of epsilon such that the interval [X_bar - epsilon, X_bar + epsilon] contains p with the given confidence.
    """
    # Solve for epsilon based on the desired confidence level
    epsilon = np.sqrt(-np.log((1 - confidence) / 2) / (2 * n))
    return epsilon
    #epsilon = np.sqrt(-1/(2*n)*np.log((1-0.95)/2))

# Example usage:
n = 1000  # Number of observations
confidence = 0.95  # Desired confidence level

epsilon = compute_epsilon(n, confidence)
print(f"To estimate p with {confidence*100}% confidence, choose epsilon = {epsilon:.5f}")


To estimate p with 95.0% confidence, choose epsilon = 0.04295


# Bennett's Inequality

In [16]:
import numpy as np

def bennetts_inequality(sum_of_means, n, a, b, epsilon):
    """
    Computes Bennett's inequality for the probability that the sum of independent random variables
    deviates from its expected value by more than epsilon.
    
    Parameters:
    - sum_of_means: The sum of the expected values of the individual random variables.
    - n: The number of variables.
    - a: The lower bound of the random variables.
    - b: The upper bound of the random variables.
    - epsilon: The deviation threshold.
    
    Returns:
    - The probability bound from Bennett's inequality.
    """
    # Compute the Bennett's inequality bound
    variance_sum = n * (b - a)**2
    bound = np.exp(- (n * epsilon**2) / (2 * variance_sum))
    
    return bound

# Example 
sum_of_means = 0  # The sum of the expected values of the random variables (E[X_i] for each i)
n = 1000          # Number of independent random variables
a = 0             # Lower bound of the random variables
b = 1             # Upper bound of the random variables
epsilon = 0.1     # Threshold deviation

probability_bound = bennetts_inequality(sum_of_means, n, a, b, epsilon)
print(f"Bennett's inequality bound for deviation > {epsilon}: {probability_bound:.5f}")


Bennett's inequality bound for deviation > 0.1: 0.99501


In [21]:
def bennett_epsilon(n,b,sigma,alpha):
    import scipy.optimize as so
    h = lambda u: (1+u)*np.log(1+u)-u
    f = lambda epsilon: np.exp(-n*sigma**2/b**2*h(b*epsilon/sigma**2))-alpha/2
    ans = so.fsolve(f,0.002)
    epsilon = np.abs(ans[0])
    print("Numerical error", f(epsilon))
    return epsilon


# Splitting data 

In [None]:
n_plus_m = len(X)
n = n_plus_m//2
m = n_plus_m-n
X_train = X[0:n,:]
Y_train = Y[0:n]
X_test = X[n:n+m,:]
Y_test = Y[n:n+m]