In [1]:
import random
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math

In [26]:
# In this file, we'll explore some of the ways that p-values can be misinterpreted.
# Let's do linear regressions for random inputs and random outputs: every column below
# is uniform noise, so any "significant" slope is a false positive.

covariates = 15   # number of random predictors tested against the response
samples = 100     # observations per simulated data set
tests = 1000      # number of simulated data sets
alpha = .05       # significance threshold applied to each slope's p-value

# Dedicated, seeded generator so that Restart & Run All reproduces the same counts
# without touching the global `random` state.
rng = random.Random(0)

# reject_counts[k] = number of data sets in which exactly k of the covariates
# had a slope p-value <= alpha.
reject_counts = {k: 0 for k in range(covariates + 1)}

for _ in range(tests):
    # samples x (covariates + 1) matrix of uniform [0, 1) draws; the last column is the response.
    X = np.array([[rng.random() for _col in range(covariates + 1)] for _row in range(samples)])
    y = X[:, covariates]
    count = 0
    for j in range(covariates):
        # Simple regression of y on covariate j alone, plus an intercept column.
        X_constants = sm.add_constant(X[:, j])
        model_sm = sm.OLS(y, X_constants).fit()
        p_values = model_sm.pvalues
        # add_constant prepends the constant, so index 0 is the intercept and index 1 the slope.
        if p_values[1] <= alpha:
            count += 1

    reject_counts[count] += 1

print(reject_counts)

{0: 465, 1: 369, 2: 135, 3: 26, 4: 4, 5: 1, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}


The number associated with 0 is the number of tests in which none of the regressions found a significant correlation, which is the correct conclusion, since all of the data are random. For 15 covariates, 100 samples, and 1000 tests, this should happen only about 463 times (because $0.95^{15} \approx 0.463$). So more than half of the time, we will find at least one false correlation, even though we know everything is random!

In [27]:
# Computes the theoretical distribution of the number of false rejections per data set,
# assuming that the alpha error is precisely the chance of false correlation for each
# covariate and that the tests are independent: the count is then Binomial(covariates, alpha).
# reject_counts, covariates, and tests are defined in the previous cell.

alpha = .05  # per-test rejection probability; keep equal to the threshold used in the previous cell

probabilities = {}

for i in range(covariates + 1):
    # Binomial coefficient C(covariates, i); the integer division is exact.
    n_choose_i = math.factorial(covariates) // (math.factorial(i) * math.factorial(covariates - i))
    prob = n_choose_i * (1 - alpha)**(covariates - i) * alpha**i
    probabilities[i] = round(prob, 3)

print(probabilities)

# Let's compare these probabilities to the frequencies we got experimentally.
# The frequencies go into a new dict so that reject_counts keeps its raw counts and
# re-running this cell does not divide by `tests` a second time.

reject_frequencies = {i: round(reject_counts[i] / tests, 3) for i in range(covariates + 1)}

print(reject_frequencies)

{0: 0.463, 1: 0.366, 2: 0.135, 3: 0.031, 4: 0.005, 5: 0.001, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0}
{0: 0.465, 1: 0.369, 2: 0.135, 3: 0.026, 4: 0.004, 5: 0.001, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0}


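As the number of tests is increased, the observed rejection frequencies (the second dictionary printed above) will approach the theoretical probabilities (the first dictionary), which assume that the 15 tests are independent and that each has exactly a 5% chance of a false rejection.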