## Confidence Intervals
> Do Wikipedia biographies contain more images than general articles?

In [3]:
import requests
import math
import time

# --- Experiment configuration ---------------------------------------------
N = 20  # number of articles to collect per group
CONF_LEVEL = 0.95  # confidence level; must be one of the keys of Z below
WIKI_API_URL = "https://en.wikipedia.org/api/rest_v1/page/random/summary"

# Two-sided standard-normal critical values (z*), keyed by confidence level.
Z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}

# Critical value for the configured level; raises KeyError if CONF_LEVEL is
# not one of the tabulated levels.
z_star = Z[CONF_LEVEL]

# Request headers sent with every API call (identifies this client).
HEADERS = {"User-Agent": "StudentResearchProject/1.0 (contact@example.com)"}

def get_random_page(timeout=10):
    """Fetch the summary of a random Wikipedia page.

    Sends a GET request to ``WIKI_API_URL`` using ``HEADERS``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole sampling loop.

    Returns:
        The decoded JSON body on success, or ``None`` when the request
        fails, the status code is not 200, or the body is not valid JSON.
        A diagnostic message is printed in each failure case.
    """
    try:
        r = requests.get(WIKI_API_URL, headers=HEADERS, timeout=timeout)
    except Exception as e:
        # Deliberately broad: fetching is best-effort and the caller simply
        # retries, so no network-level failure should abort the run.
        print("Request failed:", e)
        return None

    if r.status_code != 200:
        print(f"HTTP error: {r.status_code}")
        print("Response text:", r.text)
        return None

    try:
        return r.json()
    except ValueError:
        # JSON decode errors raised by requests are ValueError subclasses.
        print("JSON decode failed. Raw response:")
        print(r.text)
        return None


def collect_sample(target, require_bio, max_attempts=None):
    """Sample random pages of one kind and count how many have an image.

    Pages are fetched with ``get_random_page()`` until ``target`` pages of
    the requested kind have been collected. A page is treated as a biography
    when its ``description`` contains one of a small set of occupation
    keywords (a rough substring heuristic), and as having an image when its
    ``thumbnail`` field is present and not ``None``.

    Args:
        target: Number of matching pages to collect.
        require_bio: If truthy, keep only pages classified as biographies;
            otherwise keep only pages not classified as biographies.
        max_attempts: Optional cap on the number of fetch attempts. ``None``
            (the default) keeps trying until ``target`` is reached. When the
            cap is hit, fewer than ``target`` pages are returned.

    Returns:
        A ``(successes, failures)`` tuple: the number of collected pages
        with and without an image.
    """
    bio_keywords = (
        "writer",
        "politician",
        "singer",
        "actor",
        "athlete",
        "footballer",
        "person",
    )
    want_bio = bool(require_bio)

    successes = 0
    failures = 0
    attempts = 0

    while successes + failures < target:
        if max_attempts is not None and attempts >= max_attempts:
            print(f"Stopped after {attempts} attempts without reaching {target} pages")
            break
        attempts += 1

        data = get_random_page()
        if data is None:
            # Back off a little longer after a failed request.
            time.sleep(0.3)
            continue

        # Throttle after EVERY fetched page, not only accepted ones, so that
        # a long run of rejected pages does not hammer the API.
        time.sleep(0.1)

        desc = (data.get("description", "") or "").lower()
        is_bio = any(keyword in desc for keyword in bio_keywords)
        if is_bio != want_bio:
            continue

        if data.get("thumbnail") is not None:
            successes += 1
        else:
            failures += 1

    return successes, failures


def compute_ci(successes, total, z=None):
    """Compute a normal-approximation (Wald) interval for a proportion.

    Args:
        successes: Number of observations counted as successes.
        total: Sample size; must be positive.
        z: Critical value to use. Defaults to the module-level ``z_star``.

    Returns:
        A tuple ``(p_hat, SE, ME, lower, upper)`` where ``p_hat`` is the
        sample proportion, ``SE`` its standard error, ``ME`` the margin of
        error and ``lower``/``upper`` the interval bounds, clamped to
        ``[0, 1]`` because a proportion cannot lie outside that range.

    Raises:
        ValueError: If ``total`` is not positive or ``successes`` is not
            between 0 and ``total``.
    """
    if total <= 0:
        raise ValueError(f"total must be positive, got {total}")
    if not 0 <= successes <= total:
        raise ValueError(
            f"successes must be between 0 and total ({total}), got {successes}"
        )
    if z is None:
        z = z_star

    p_hat = successes / total
    # When p_hat is exactly 0 or 1 the standard error is 0 and the interval
    # collapses to a single point; the approximation is not reliable there.
    SE = math.sqrt(p_hat * (1 - p_hat) / total)
    ME = z * SE
    return p_hat, SE, ME, max(0.0, p_hat - ME), min(1.0, p_hat + ME)


# Draw the two samples: biographies first, then general articles.
bio_success, bio_fail = collect_sample(N, require_bio=True)
gen_success, gen_fail = collect_sample(N, require_bio=False)

# Interval statistics for each group: (p_hat, SE, ME, lower, upper).
bio_stats = compute_ci(bio_success, N)
gen_stats = compute_ci(gen_success, N)

# Print the same report layout for both groups.
for heading, with_img, without_img, stats in (
    ("BIOGRAPHIES", bio_success, bio_fail, bio_stats),
    ("GENERAL ARTICLES", gen_success, gen_fail, gen_stats),
):
    p_hat, std_err, margin, lower, upper = stats
    print(f"\n{heading}")
    print(f"Sample size: {N}")
    print(f"With images: {with_img}")
    print(f"Without images: {without_img}")
    print(f"p̂: {p_hat:.4f}")
    print(f"SE: {std_err:.4f}")
    print(f"ME: {margin:.4f}")
    print(f"{int(CONF_LEVEL*100)}% CI: ({lower:.4f}, {upper:.4f})")

# Conditions for the normal-approximation interval.
print("\nCondition checks:")
print("Random: using Wikipedia API random endpoint")
print("Independence: sample << population")
for tag, n_success, n_fail in (
    ("Bio", bio_success, bio_fail),
    ("Gen", gen_success, gen_fail),
):
    print(f"{tag} normality: successes >= 10? {n_success >= 10}, failures >= 10? {n_fail >= 10}")


BIOGRAPHIES
Sample size: 20
With images: 20
Without images: 0
p̂: 1.0000
SE: 0.0000
ME: 0.0000
95% CI: (1.0000, 1.0000)

GENERAL ARTICLES
Sample size: 20
With images: 20
Without images: 0
p̂: 1.0000
SE: 0.0000
ME: 0.0000
95% CI: (1.0000, 1.0000)

Condition checks:
Random: using Wikipedia API random endpoint
Independence: sample << population
Bio normality: successes >= 10? True, failures >= 10? False
Gen normality: successes >= 10? True, failures >= 10? False


## Conclusion
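> Because I was being rate limited, I had to keep the sample size at 20 per group. In these samples, all 20 biographies and all 20 general articles contained images (p̂ = 1.00 for both). CI stands for confidence interval: at the 95% confidence level, both intervals come out as (1.0000, 1.0000), with a standard error and margin of error of 0. The normality condition is not met for either group, because each has 20 successes (at least 10) but 0 failures (fewer than 10). The intervals are therefore not reliable, and these data cannot show whether biographies contain more images than general articles.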