In [7]:
import numpy as np
from scipy import stats 

# Statistical Distributions

Here we'll review the most important statistical distributions for hypothesis
testing.

## SciPy
To perform hypothesis testing during coding interviews, we would ideally be able 
to import the **stats** module from the **scipy** library to calculate the PDFs and CDFs of
the distributions. 

## Key Distributions and Use Cases

### Normal distribution (stats.norm)
* Used for z-tests or when sample means are normally distributed (Central Limit Theorem)
- `stats.norm.cdf(x, loc=mean, scale=std_dev)`

### t (stats.t)
* Used for one-sample, two-sample, and paired t-tests. The t-distribution is used
when the sample size is small (n<30) or if the population variance is unknown.
- `stats.t.cdf(x, df)` where
    * $df$: Degrees of freedom.

### Chi-Squared (stats.chi2)
* Used to test whether observed data follows an expected distribution (goodness of fit) or to assess independence between categorical variables (e.g., contingency tables).
- `stats.chi2.cdf(x,df)`


### Binomial (stats.binom)
* Useful when modeling **binary events** with fixed probabilities over a number of independent trials.
- `stats.binom.cdf(k,n,p)`  where
    * $n$: Number of trials
    * $p$: probability of success in each trial
    * $k$: Number of observed successes

### Poisson Distribution (stats.poisson)
* Model counts of events occurring over a fixed period or space, such as the 
number of website visits in an hour.
- `stats.poisson.cdf(k,mu)` where
    * $k$:Observed count
    * $\mu$: Expected count (mean of the distribution)


# One-Sample T-Test
The one-sample t-test tests whether the mean of a sample differs significantly 
from a known or hypothesized population mean. It assumes the data is normally 
distributed.

* $H_0$ is that the sample mean $\bar{x}$ is equal to the population mean $\mu$.
* $H_1$:   $\bar{x} \neq \mu$

Test statistic:

$$ t = \frac{\bar{x}-\mu}{s/\sqrt{n} }$$

In [160]:
from scipy import stats
class TTest:
    """
    One-sample t-test with an analytical or a Monte Carlo two-tailed p-value.

    The results of the most recent test are kept on the instance as
    ``t_stat`` and ``p_value``.
    """

    # Upper bound on the number of normal draws held in memory at once by the
    # Monte Carlo simulation; larger runs are processed in several batches.
    _MAX_DRAWS_PER_BATCH = 2_000_000

    def __init__(self):
        self.t_stat = None   # t-statistic of the most recent test
        self.p_value = None  # two-tailed p-value of the most recent test
    
    def one_sample_t_test(self, sample, mu, method = 'analytical', n_simulations = 10**5) -> tuple:
        """
        Perform a one-sample t-test to check if the sample mean is significantly
        different from the population mean

        Parameters:
            sample: array-like. 
                The sample data (at least two observations).
            mu: float.
                The population mean under the null hypothesis.
            method: string.
                Either 'analytical' (Student-t tail probability) or
                'monte_carlo' (simulation, see ``monte_carlo_t_test``).
            n_simulations: int.
                Number of simulated samples; only used when
                method == 'monte_carlo'.

        Returns:
            t-stat: float.
                The estimated t-statistic
            p-value: float.
                The estimated two-tailed p-value for the t-test. This is the
                probability of obtaining test results *at least as extreme*
                as the observed in the data

        Raises:
            ValueError: if ``method`` is not recognised or the sample has
                fewer than two observations.
        """
        if method not in ['analytical', 'monte_carlo']:
            raise ValueError("method must be one of ['analytical', 'monte_carlo']")

        # Guarantee that sample is a numpy array
        X = np.asarray(sample, dtype=float)

        # Sample size n and degrees of freedom df
        n = len(X)
        if n < 2:
            # With n < 2 the unbiased standard deviation (ddof=1) is undefined.
            raise ValueError("sample must contain at least two observations.")
        df = n - 1

        # Compute sample statistics
        X_bar = np.mean(X)          # Sample mean
        X_std = np.std(X, ddof=1)   # Unbiased sample standard deviation

        # t-statistic
        self.t_stat = (X_bar - mu) / (X_std / (n ** 0.5))

        # Two-tailed p-value
        if method == 'analytical':
            # sf() is the survival function 1 - cdf, evaluated without the
            # cancellation that turns `1 - cdf(...)` into exactly 0.0 for
            # large |t|.
            self.p_value = 2 * stats.t.sf(abs(self.t_stat), df)
        else:
            self.p_value = self.monte_carlo_t_test(mu, X_std, n, n_simulations = n_simulations)

        return self.t_stat, self.p_value

    def monte_carlo_t_test(self, mu, X_std, n, n_simulations = 10**5):
        """
        Monte Carlo simulation for one-sample t-test p-value estimation.

        Draws ``n_simulations`` samples of size ``n`` from N(mu, X_std^2),
        computes the t-statistic of each, and returns the fraction whose
        absolute value is at least ``abs(self.t_stat)``. ``self.t_stat`` must
        already be set (``one_sample_t_test`` sets it before calling this).

        Parameters:
            mu: float.
                The population mean.
            X_std: float.
                The sample standard deviation.
            n: int.
                The sample size.
            n_simulations: int.
                The number of Monte Carlo simulations.

        Returns:
            p-value: float.
                The estimated p-value based on the Monte Carlo simulation.

        Raises:
            ValueError: if ``n_simulations`` is smaller than 1.
        """
        if n_simulations < 1:
            raise ValueError("n_simulations must be a positive integer.")

        observed = abs(self.t_stat)
        # Simulate in batches so memory use stays bounded for large n.
        batch_size = max(1, self._MAX_DRAWS_PER_BATCH // max(n, 1))

        counter = 0
        remaining = n_simulations
        while remaining > 0:
            rows = min(batch_size, remaining)

            # One simulated sample per row
            random_samples = np.random.normal(mu, X_std, size=(rows, n))
            sample_bars = random_samples.mean(axis=1)
            sample_stds = random_samples.std(axis=1, ddof=1)
            t_samples = (sample_bars - mu) / (sample_stds / (n ** 0.5))

            counter += int(np.count_nonzero(np.abs(t_samples) >= observed))
            remaining -= rows

        # Return estimated p-value
        return counter / n_simulations

In [162]:
# --- Demo: one-sample t-test, Monte Carlo p-value vs. scipy ---

# Population mean assumed under the null hypothesis
mu_population = 5

# Reproducible synthetic sample: 30 draws from N(5, 2^2)
np.random.seed(42)
sample_data = np.random.normal(loc=5, scale=2, size=30)

# Hand-written implementation, p-value estimated by simulation
ttest = TTest()
my_t_stat, my_p_value = ttest.one_sample_t_test(
    sample_data,
    mu_population,
    method='monte_carlo',
)

# Reference values from scipy
scipy_t_stat, scipy_p_value = stats.ttest_1samp(sample_data, popmean=mu_population)

# Side-by-side report
print(f"My Implementation - t-statistic: {my_t_stat:.5f}, p-value: {my_p_value:.5f}")
print(f"Scipy Implementation - t-statistic: {scipy_t_stat:.5f}, p-value: {scipy_p_value:.5f}")

My Implementation - t-statistic: -1.14502, p-value: 0.26160
Scipy Implementation - t-statistic: -1.14502, p-value: 0.26156


In [None]:
# Small hand-made sample tested against a hypothesised mean of 3.0
# (default method, i.e. the analytical p-value).
ttest = TTest()
sample = [2.5, 3.0, 2.8, 3.5, 3.1, 2.9, 3.0]
t_stat, p_val = ttest.one_sample_t_test(sample=sample, mu=3.0)

print("T-Statistic:", t_stat)
print("P-Value:", p_val)

# Two-Sample T-Test (t-test for independent samples)
The two-sample t-test compares the means of two independent samples to determine 
if they come from populations with the same mean. It assumes that both samples 
are normally distributed. 

The hypotheses are:

* **Null Hypothesis**        $H_0$: $\bar{X_1} = \bar{X_2}$
* **Alternative Hypothesis** $H_1$: $\bar{X_1} \neq \bar{X_2}$

There are **two main versions**:

### 1. Pooled Variance
This version assumes that both groups come from **populations with the same variance**.

Test statistic:

$$ t = \frac{\bar{X_1}-\bar{X_2}}{ \sqrt{s_p^2 (\frac{1}{n_1} + \frac{1}{n_2} )} }$$

where the **pooled variance** $s_p^2$ is the weighted average of the variances: 

$$ s_p^2 = \frac{(n_1 -1)  s_1^2 + (n_2 -1)  s_2^2 }{n_1 +n_2 -2} $$

The weights are given by the degrees of freedom of each sample, ensuring that larger samples have more influence on the variance estimation:

Sample Variance Formula:
$$s_1^2 = \frac{1}{n_1-1} \sum_{i=1}^{n_1}  (x_{1i} -\bar{x_1})^2$$

The **degrees of freedom** for the test is $$df = n_1+n_2-2$$

#### Why Pooled Variance?


* Even though we assume that both samples are drawn from populations with the **same variance**, the **sample variances** may differ due to randomness. 

* The pooled variance $s_p^2$ provides a **more accurate estimate** of the true population variance by **combining information from both samples**. 

* The **weights based on degrees of freedom** ensure that larger samples influence the pooled variance more.

### 2. Unequal Variance (Welch's t-test)

Welch's t-test does not assume equal population variances. It is more robust when variances differ between the two samples.

Test Statistics: 

$$ t_{\text{Welch}} = \frac{\bar{X_1}-\bar{X_2}}{  \sqrt{ \frac{s_1^2}{n_1} + \frac{s_2^2}{n_2} }  }$$

with the degrees of freedom defined by the equation (*forget about it...*)

$$ df = \frac{ \left( \frac{s_1^2}{n_1} + \frac{s_2^2}{n_2} \right)^2 }{ \frac{1}{n_1-1} \left(\frac{s_1^2}{n_1}\right)^2 + \frac{1}{n_2-1} \left(\frac{s_2^2}{n_2}\right)^2 } $$

In [143]:
class TTest:
    """
    Pooled-variance two-sample t-test whose p-value is estimated by Monte Carlo.

    The results of the most recent test are kept on the instance as
    ``t_stat`` and ``p_value``.
    """

    # Upper bound on the number of normal draws held in memory at once;
    # larger simulations are processed in several batches.
    _MAX_DRAWS_PER_BATCH = 2_000_000

    def __init__(self):
        self.t_stat = None   # t-statistic of the most recent test
        self.p_value = None  # two-tailed p-value of the most recent test

    @staticmethod
    def _pooled_t_statistic(X1, X2):
        """Pooled-variance t-statistic computed along the last axis of X1 and X2."""
        n1, n2 = X1.shape[-1], X2.shape[-1]
        pooled_variance = (
            (n1 - 1) * np.var(X1, axis=-1, ddof=1)
            + (n2 - 1) * np.var(X2, axis=-1, ddof=1)
        ) / (n1 + n2 - 2)
        standard_error = np.sqrt(pooled_variance * (1 / n1 + 1 / n2))
        return (np.mean(X1, axis=-1) - np.mean(X2, axis=-1)) / standard_error

    def two_sample_t_test_monte_carlo(self, sample_1: list, sample_2: list, n_simulations: int = 10**5) -> tuple:
        """
        Perform a two-sample t-test to check if the means of two independent
        samples differ significantly, estimating the p-value by simulation.
        Assumption: the samples are drawn from populations with the same variance.

        Under the null hypothesis both samples are simulated from one normal
        distribution (size-weighted mean of the two sample means, pooled
        standard deviation). The pooled t-statistic is recomputed from every
        simulated pair, standard error included, so the simulated statistics
        follow the t-distribution rather than a normal one.

        Parameters:
            sample_1: array-like. 
                The first sample data (at least two observations).
            sample_2: array-like. 
                The second sample data (at least two observations).
            n_simulations: int.
                The number of simulated sample pairs.

        Returns:
            t-stat: float.
                The estimated t-statistic
            p-value: float.
                The estimated two-tailed p-value for the t-test. This is the
                probability of obtaining test results *at least as extreme*
                as the observed in the data.

        Raises:
            ValueError: if either sample has fewer than two observations or
                ``n_simulations`` is smaller than 1.
        """
        # Guarantee that the samples are numpy arrays
        X1 = np.asarray(sample_1, dtype=float)
        X2 = np.asarray(sample_2, dtype=float)

        # Sample sizes and degrees of freedom df
        n1, n2 = len(X1), len(X2)
        if n1 < 2 or n2 < 2:
            raise ValueError("Each sample must contain at least two observations.")
        if n_simulations < 1:
            raise ValueError("n_simulations must be a positive integer.")
        df = n1 + n2 - 2

        # Observed t-statistic
        self.t_stat = self._pooled_t_statistic(X1, X2)

        # Parameters of the common distribution assumed under the null hypothesis
        pooled_variance = (
            (n1 - 1) * np.var(X1, ddof=1) + (n2 - 1) * np.var(X2, ddof=1)
        ) / df
        pooled_std = pooled_variance ** 0.5
        weighted_average = (n1 * np.mean(X1) + n2 * np.mean(X2)) / (n1 + n2)

        # Monte Carlo, in batches so memory use stays bounded
        observed = abs(self.t_stat)
        batch_size = max(1, self._MAX_DRAWS_PER_BATCH // (n1 + n2))
        count = 0
        remaining = n_simulations
        while remaining > 0:
            rows = min(batch_size, remaining)

            # One simulated pair of samples per row
            sample_X1 = np.random.normal(weighted_average, pooled_std, size=(rows, n1))
            sample_X2 = np.random.normal(weighted_average, pooled_std, size=(rows, n2))
            sample_tests = self._pooled_t_statistic(sample_X1, sample_X2)

            # "At least as extreme" includes equality
            count += int(np.count_nonzero(np.abs(sample_tests) >= observed))
            remaining -= rows

        # Two-tailed p-value estimate
        self.p_value = count / n_simulations

        return self.t_stat, self.p_value

In [148]:
# Example data
# Two small independent samples to compare
sample1 = [68, 78, 74, 72, 77]
sample2 = [68, 65, 70, 67, 69]

# Hand-written pooled-variance test with a simulated p-value
ttest = TTest()
t_stat, p_val = ttest.two_sample_t_test_monte_carlo(
    sample_1=sample1,
    sample_2=sample2,
)
print(f"T-Statistic: {t_stat}, P-Value: {p_val}")

# Reference: scipy's independent two-sample t-test
scipy_t, scipy_p = stats.ttest_ind(sample1, sample2)
print(f"Scipy t-statistic: {scipy_t:.5f}, p-value: {scipy_p:.5f}")

T-Statistic: 3.715152686079336, P-Value: 0.00016
Scipy t-statistic: 3.71515, p-value: 0.00023


In [10]:
from scipy import stats 
class TTest:
    """
    Pooled-variance two-sample t-test with an analytical two-tailed p-value.

    The results of the most recent test are kept on the instance as
    ``t_stat`` and ``p_value``.
    """

    def __init__(self):
        self.t_stat = None   # t-statistic of the most recent test
        self.p_value = None  # two-tailed p-value of the most recent test

    def two_sample_t_test(self, sample_1: list, sample_2: list) -> tuple:
        """
        Perform a two-sample t-test to check if the means of two independent
        samples differ significantly.
        Assumption: the samples are drawn from populations with the same variance.

        Parameters:
            sample_1: array-like. 
                The first sample data (at least two observations).
            sample_2: array-like. 
                The second sample data (at least two observations).

        Returns:
            t-stat: float.
                The estimated t-statistic
            p-value: float.
                The estimated two-tailed p-value for the t-test. This is the
                probability of obtaining test results *at least as extreme*
                as the observed in the data.

        Raises:
            ValueError: if either sample has fewer than two observations.
        """
        # Guarantee that the samples are numpy arrays
        X1 = np.asarray(sample_1, dtype=float)
        X2 = np.asarray(sample_2, dtype=float)

        # Sample sizes and degrees of freedom df
        n1, n2 = len(X1), len(X2)
        if n1 < 2 or n2 < 2:
            # The unbiased sample variance needs at least two observations.
            raise ValueError("Each sample must contain at least two observations.")
        df = n1 + n2 - 2

        # Compute sample statistics
        X1_bar = np.mean(X1)           # Sample mean
        X2_bar = np.mean(X2)           # Sample mean
        X1_var = np.var(X1, ddof=1)    # Unbiased sample variance
        X2_var = np.var(X2, ddof=1)    # Unbiased sample variance

        # Variances weighted by their degrees of freedom
        pooled_variance = ((n1 - 1) * X1_var + (n2 - 1) * X2_var) / df

        # t-statistic
        denominator = (pooled_variance * (1 / n1 + 1 / n2)) ** 0.5
        self.t_stat = (X1_bar - X2_bar) / denominator

        # Two-tailed p-value. sf() is 1 - cdf evaluated without the
        # cancellation that turns `1 - cdf(...)` into exactly 0.0 for large |t|.
        self.p_value = 2 * stats.t.sf(abs(self.t_stat), df)

        return self.t_stat, self.p_value

In [None]:
# Example data
# Two small independent samples to compare
sample1 = [75, 78, 74, 72, 77]
sample2 = [68, 65, 70, 67, 69]

# Pooled-variance two-sample t-test
ttest = TTest()
t_stat, p_val = ttest.two_sample_t_test(
    sample_1=sample1,
    sample_2=sample2,
)
print(f"T-Statistic: {t_stat}, P-Value: {p_val}")

# Anova (Analysis of Variance)
See docs: https://docs.google.com/document/d/13b2W1HfUgijqNlsQ_3fQgJ6pIyJTJ5UdjuOSy2BXNW8/edit?usp=sharing

ANOVA is used to determine if there are any statistically significant differences
between the means of three or more groups. It compares the 
**variance between groups** with the **variance within groups** to check if at 
least one group's mean is different.

* $H_0$: All group means are equal: $\bar{x_1} = \bar{x_2} = ...= \bar{x_k}$
* $H_1$: At least one group mean is different.


The test statistic is the F-Statistic $$ \frac{SSB/(k-1)}{SSW/(n-k)}   $$

where $k$ is the number of groups, $n$ is the total number of observations 
across  all groups.  

$SSB$ is the **Between-groups sum of squares**,  
            $$ SSB = \sum_{i=1:k} n_i (\bar{x_i} -\bar{x})^2  $$ 

where
 * $n_i$ is the number of observations in group $i$ 
 * $\bar{x_i}$: mean of group $i$ 
 * $\bar{x}$: overall mean across all groups

 and  SSW is the **Within groups sum of squares**,
        $$ SSW = \sum_{i=1:k} \sum_{j \text{in group i}} (x_{ij} - \bar{x_i})^2   $$
    
where $x_{ij}$ is observation $j$ in group $i$. 


In [12]:
import numpy as np
from scipy import stats

# The implementation comes from chatGPT. It is currently too hard for me. I suggest you skip it.
class ANOVA:
    """
    One-way analysis of variance (F-test for equality of group means).

    The results of the most recent test are kept on the instance as
    ``f_statistic`` and ``p_value``.
    """

    def __init__(self):
        self.f_statistic = None  # F-statistic of the most recent test
        self.p_value = None      # right-tail p-value of the most recent test

    def one_way_anova(self, *groups):
        """
        Perform one-way ANOVA to test whether the means of multiple groups are equal.

        Parameters:
        *groups : array-like
            Each argument represents a group of sample data (one-dimensional,
            non-empty).

        Returns:
        f_statistic : float
            The computed F-statistic.
        p_value : float
            The p-value for the ANOVA test.

        Raises:
        ValueError
            If fewer than two groups are given, a group is empty, or there
            are not more observations than groups (the within-group degrees
            of freedom would not be positive).
        """
        samples = [np.asarray(group, dtype=float) for group in groups]

        # Number of groups and total number of observations
        k = len(samples)
        if k < 2:
            raise ValueError("One-way ANOVA needs at least two groups.")
        if any(len(sample) == 0 for sample in samples):
            raise ValueError("Every group must contain at least one observation.")
        n_total = sum(len(sample) for sample in samples)
        if n_total <= k:
            raise ValueError("The total number of observations must exceed the number of groups.")

        # Grand mean, and each group mean computed once
        grand_mean = np.mean(np.concatenate(samples))
        group_means = [np.mean(sample) for sample in samples]

        # Between-group sum of squares (SSB)
        ssb = sum(
            len(sample) * (mean - grand_mean) ** 2
            for sample, mean in zip(samples, group_means)
        )

        # Within-group sum of squares (SSW)
        ssw = sum(
            np.sum((sample - mean) ** 2)
            for sample, mean in zip(samples, group_means)
        )

        # Degrees of freedom
        df_between = k - 1
        df_within = n_total - k

        # Mean squares
        ms_between = ssb / df_between
        ms_within = ssw / df_within

        # F-statistic
        self.f_statistic = ms_between / ms_within

        # Right-tail p-value from the F-distribution. sf() is 1 - cdf without
        # the cancellation that turns `1 - cdf(...)` into 0.0 for large F.
        self.p_value = stats.f.sf(self.f_statistic, df_between, df_within)

        return self.f_statistic, self.p_value

# Chi-Squared Test 

The Chi-squared tests are a family of statistical tests used for categorical data
analysis. They evaluate the **association** or **independence** between categorical
variables, or how well the observed data fits an expected distribution.

The Chi-Squared tests are **non-parametric**, meaning that they don't assume the data follows a specific distribution.

The **Goodness of Fit** test checks how well an observed distribution matches an 
expected one, while the **Independence Test** evaluates whether two categorical 
variables are independent. 

There are 3 main types of Chi-squared tests, but since 2 are very similar, I'm 
gonna treat it as a subcase.



#### Test Statistic

All Chi-squared test statistics are given by:

$$  \chi^2 = \sum \frac{(O_i-E_i)^2}{E_i} $$

where 
* $O_i$ is the Observed frequency (from sample)
* $E_i$ is the Expected frequency 



### 1. Goodness of Fit

Determines if a sample matches a specific distribution. It is used when there is 
**one categorical variable** and we want to see if the observed data fits a known
distribution. For example, we can test if a die is fair.

#### Hypotheses:

* $H_0$: The observed frequencies follow the specified distribution
* $H_1$: The observed frequencies **do not** follow the specified distribution

Example:

* $H_0$: The die is fair (each outcome has equal probability).
* $H_1$: The die is biased (at least one outcome has a different probability).


### Python Implementation

In [13]:
class ChiSquaredTests:
    """Chi-squared tests for categorical data (analytical p-values)."""

    def chi_squared_goodness_of_fit(self, observed, expected):
        """
        Perform a chi-squared goodness of fit test.
        -----------
        Parameters
        -----------
        observed: array-like.
            The observed frequencies.
        expected: array-like,
            The expected frequencies (same length as ``observed``).
        ----------
        Returns
        ----------
        chi2_stat: float
            The computed chi-squared statistic.
        p-value: float
            The estimated p-value for the test. This is the probability of 
            obtaining test results *at least as extreme* as the observed in
            the data (right tail of the chi-squared distribution with
            ``len(observed) - 1`` degrees of freedom).
        ----------
        Raises
        ----------
        ValueError
            If the arrays differ in length or contain negative frequencies.
        """

        # Guarantees that data is a numpy array
        observed = np.array(observed)
        expected = np.array(expected)

        if len(observed) != len(expected):
            raise ValueError("Observed and expected arrays must have the same length.")
        if np.any(observed < 0) or np.any(expected < 0):
            raise ValueError("Observed and expected frequencies must be non-negative.")

        # Avoid division by zero by replacing zero expected counts with a small epsilon
        epsilon = 1e-10
        expected = np.where(expected == 0, epsilon, expected)

        # Compute test statistic and degrees of freedom (df)
        chi2_stat = np.sum((observed - expected) ** 2 / expected)
        df = len(observed) - 1

        # Right-tail probability. sf() is 1 - cdf without the cancellation
        # that turns `1 - cdf(...)` into 0.0 for large statistics.
        p_value = stats.chi2.sf(chi2_stat, df)

        return chi2_stat, p_value
        

### Example usage

In [None]:
# Face counts recorded over 100 rolls of a die
observed = [16, 18, 16, 14, 12, 24]

# Counts a fair die would produce on average: 100 rolls spread evenly over 6 faces
expected = [100 / 6] * 6

# Run the goodness-of-fit test
chi_test = ChiSquaredTests()
chi2_stat, p_value = chi_test.chi_squared_goodness_of_fit(
    observed=observed,
    expected=expected,
)

# Report statistic and p-value rounded to three decimals
print(f"Chi-Squared Statistic: {chi2_stat.round(3)}")
print(f"P-Value: {p_value.round(3)}")


### Interpretation

Since the p-value is high, we do not reject the null hypothesis of a fair die.

# Note on the Chi-Squared Distribution

Below are the PDF (left) and CDF (right) for the Chi-Squared distribution with
different degrees of freedom. 

<img src="../images/chi_squared_pdf.png" alt="left" width="450"/>

<img src="../images/chi_squared_cdf.png" alt="right" width="450"/>

Since the $\chi^2$ statistic is always non-negative, the **distribution is inherently one-sided**, focusing only on positive deviations from the expected values. 

Thus, the p-value for the Chi-Squared test is calculated using **only the right tail** of the distribution:
$$ \text{p-value} = 1 - CDF(\chi^2, df) $$

### Comparison with the T-Test

In contrast, the **t-statistic** can be **positive or negative**, requiring a two-tailed test. Thus, the p-value calculation for a two-tailed t-test is:

$$ \text{p-value} = 2 \cdot (1 - CDF(|t|, df)) $$

Since $1 - CDF(|t|, df)$ gives the right-tail probability of observing a value
greater than $|t|$, we need to multiply it by two to account for the left tail.

# Monte Carlo for p-value Approximation
Using Monte Carlo simulation is a practical alternative to compute the p-value 
if statistical libraries like scipy are restricted. This method is intuitive, demonstrates strong statistical reasoning, and aligns well with the 
problem-solving mindset that Google interviews often look for.

## How it works

1. Compute the **test statistic** for the observed data.

2. Generate **simulated data** using a known distribution.

3. Count how many simulated statistics are *as extreme* (greater or equal) to the observed test statistic.

4. Estimate the **p-value** as the proportion of such extreme results.

In [95]:
class ChiSquaredTests:
    """
    Chi-squared goodness-of-fit test whose p-value is estimated by Monte Carlo.

    ``expected`` and ``observed_chi2_stat`` hold the inputs of the most recent
    test and are read by ``monte_carlo_chi2``.
    """

    def __init__(self):
        self.expected = None            # expected frequencies (numpy array)
        self.observed_chi2_stat = None  # chi-squared statistic of the observed data

    def _chi2_statistic(self, frequencies):
        """Chi-squared statistic of ``frequencies`` against ``self.expected`` (last axis)."""
        return np.sum((frequencies - self.expected) ** 2 / self.expected, axis=-1)

    def chi_squared_goodness_of_fit(self, observed, expected, n_simulations = 10000):
        """
        Perform a chi-squared goodness of fit test using Monte Carlo Simulation.
        -----------
        Parameters
        -----------
        observed: array-like.
            The observed frequencies.
        expected: array-like,
            The expected frequencies (strictly positive, same length as
            ``observed``).
        n_simulations: int
            The number of simulated frequency tables.
        ----------
        Returns
        ----------
        chi2_stat: float
            The computed chi-squared statistic.
        p-value: float
            The estimated p-value for the test. This is the probability of 
            obtaining test results *at least as extreme* as the observed in
            the data.
        ----------
        Raises
        ----------
        ValueError
            If the arrays differ in shape, an expected frequency is not
            positive, or ``n_simulations`` is smaller than 1.
        """

        # Guarantees that data is a numpy array
        observed = np.asarray(observed, dtype=float)
        self.expected = np.asarray(expected, dtype=float)

        if observed.shape != self.expected.shape:
            raise ValueError("Observed and expected arrays must have the same length.")
        if np.any(self.expected <= 0):
            # Each term of the statistic divides by its expected frequency.
            raise ValueError("Expected frequencies must be strictly positive.")

        # Compute test statistic for the observed data
        self.observed_chi2_stat = self._chi2_statistic(observed)

        p_value = self.monte_carlo_chi2(n_simulations = n_simulations)

        return self.observed_chi2_stat, p_value


    def monte_carlo_chi2(self, n_simulations = 10000):
        """
        Estimate the p-value of ``self.observed_chi2_stat`` by simulation.

        Draws ``n_simulations`` frequency tables from a multinomial
        distribution whose cell probabilities are the normalised expected
        frequencies and whose total is the rounded sum of the expected
        frequencies. Returns the fraction of tables whose chi-squared
        statistic is at least the observed one. All tables are generated in a
        single array of shape (n_simulations, number of categories).

        Parameters
        -----------
        n_simulations: int
            The number of simulated frequency tables.

        Returns
        ----------
        p-value: float
            The estimated p-value.
        """
        if n_simulations < 1:
            raise ValueError("n_simulations must be a positive integer.")

        total = np.sum(self.expected)
        probs = self.expected / total   # normalized probabilities
        # Round instead of truncating: a float total such as 91.99999999999999
        # must give 92 draws, not 91.
        n_samples = int(np.rint(total))

        # One simulated frequency table per row
        simulated_freq = np.random.multinomial(n_samples, probs, size=n_simulations)

        # Chi-squared statistic of every simulated table
        simulated_chi2_stats = self._chi2_statistic(simulated_freq)
        more_extreme_count = int(np.count_nonzero(simulated_chi2_stats >= self.observed_chi2_stat))

        p_value = more_extreme_count / n_simulations

        return p_value
        

In [None]:
import numpy as np
from scipy import stats

# Example usage
observed = [8, 18, 16, 14, 12, 24]
expected = [100 / 6] * 6

# Adjust expected frequencies to match the sum of observed frequencies
expected = np.array(expected) * (np.sum(observed) / np.sum(expected))

# Instantiate and run Monte Carlo chi-squared test
chi_test = ChiSquaredTests()
chi2_stat, monte_carlo_p_value = chi_test.chi_squared_goodness_of_fit(observed, expected)

print(f"Monte Carlo p-value: {monte_carlo_p_value:.5f}")

# Compute chi-squared test using scipy.stats
scipy_chi2_stat, scipy_p_value = stats.chisquare(f_obs=observed, f_exp=expected)

print(f"Scipy chi-squared statistic: {scipy_chi2_stat:.3f}")
print(f"Scipy p-value: {scipy_p_value:.5f}")

# Compare results
print("\nComparison:")
print(f"Monte Carlo Chi-squared Statistic: {chi2_stat:.3f}")
print(f"Monte Carlo p-value: {monte_carlo_p_value:.5f}")
print(f"Scipy Chi-squared Statistic: {scipy_chi2_stat:.3f}")
print(f"Scipy p-value: {scipy_p_value:.5f}")

In [None]:
# Display the current value of `expected` (the cell's last expression is shown as its output).
expected

In [None]:
# Draw 10 items from `observations`, using `frequencies` as the selection probabilities.
# NOTE(review): neither `observations` nor `frequencies` is defined in any visible cell,
# so this raises NameError on a fresh kernel — define them here or delete the cell.
np.random.choice(observations,size= 10, p=frequencies)

In [None]:
# Simulate 100 rolls of a fair six-sided die, recording the face index of each roll.
expected = [100 / 6] * 6   # expected count per face over 100 rolls
n_samples = 100
labels = np.arange(len(expected))

# np.random.choice requires probabilities that sum to 1, so convert the
# expected counts (which sum to 100) into proportions first.
expected_probs = np.asarray(expected) / np.sum(expected)
simulated_outcomes = np.random.choice(labels, size=n_samples, p=expected_probs)


In [165]:
# Evaluates 75 + 1.96 * 2 (displayed as the cell output).
# NOTE(review): presumably the upper bound of a 95% interval (centre 75, standard
# error 2, z = 1.96) — confirm the intent and consider naming these constants.
75+(1.96*2)


78.92