# 📊 Day 6: Statistics & Regression Notebook

**Topics Covered:**
- Confidence Intervals
- Sampling Distributions
- Regression Basics

In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

## Confidence Interval Comparison: Smokers vs Non-Smokers

In [None]:
# Simulated data
# Seed NumPy's global RNG first so both groups come out identical on every run.
np.random.seed(42)
# Arguments are (mean, standard deviation, number of observations).
smokers = np.random.normal(160, 10, 30)
non_smokers = np.random.normal(155, 12, 30)

# CI function
def confidence_interval(data, confidence=0.95):
    """Return a two-sided Student-t confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional sample with at least two numeric observations.
    confidence : float, default 0.95
        Confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds, computed as ``mean -/+ t_crit * SEM``.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1, or if ``data`` is
        not one-dimensional or holds fewer than two observations.
    """
    # Out-of-range levels would otherwise yield NaN or an inverted interval.
    if not 0 < confidence < 1:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    sample = np.asarray(data, dtype=float)
    # Multi-dimensional input would mix a global mean with a per-column SEM.
    if sample.ndim != 1:
        raise ValueError(
            f"data must be one-dimensional, got {sample.ndim} dimension(s)"
        )
    # The standard error and the t quantile are undefined with fewer than
    # two observations (zero degrees of freedom).
    if sample.size < 2:
        raise ValueError(
            f"data must contain at least 2 observations, got {sample.size}"
        )

    mean = np.mean(sample)
    sem = stats.sem(sample)
    # Two-sided critical value: upper-tail quantile with n - 1 degrees of freedom.
    margin = sem * stats.t.ppf((1 + confidence) / 2, sample.size - 1)
    return (mean - margin, mean + margin)

# Interval for each group at confidence_interval's default level (0.95).
ci_smokers = confidence_interval(smokers)
ci_non_smokers = confidence_interval(non_smokers)
# Format the bounds explicitly: printing the raw tuples shows NumPy scalar
# reprs such as `np.float64(...)` on NumPy 2.x, which buries the numbers.
print(
    f"Smokers CI: ({ci_smokers[0]:.2f}, {ci_smokers[1]:.2f})\n"
    f"Non-Smokers CI: ({ci_non_smokers[0]:.2f}, {ci_non_smokers[1]:.2f})"
)

## Visualizing Sampling Distribution

In [None]:
# Central Limit Theorem demo: repeatedly draw small samples (with replacement)
# from the simulated smokers data and plot how their means are distributed.
SAMPLE_SIZE = 10     # observations per drawn sample
N_RESAMPLES = 1000   # number of sample means to collect

sample_means = [
    np.mean(np.random.choice(smokers, size=SAMPLE_SIZE))
    for _ in range(N_RESAMPLES)
]

fig, ax = plt.subplots()
sns.histplot(sample_means, kde=True, ax=ax)
# The title is built from SAMPLE_SIZE so the label cannot drift from the code.
ax.set_title(f"Sampling Distribution of Smokers (n={SAMPLE_SIZE})")
ax.set_xlabel("Sample Mean")
plt.show()

## Simple Linear Regression

In [None]:
# Toy dataset: five observations of a single feature.
# The reshape turns the feature vector into a one-column 2-D matrix for fit().
X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
y = np.array([2, 4, 5, 4, 5])

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X, y)

slope = model.coef_[0]
print(f"Coefficient: {slope}")
print(f"Intercept: {model.intercept_}")

# Observed points in blue, fitted line in red, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X, model.predict(X), color='red')
ax.set_title("Simple Linear Regression")
ax.set_xlabel("X")
ax.set_ylabel("y")
plt.show()