In [15]:
SEED = 666

from random import normalvariate
from random import uniform
from math import sqrt

def generate_dataset(n, x_min, x_max, slope, intercept, e_mean, e_std):
    """
    Generates a dataset according to the input parameters
    
    Keyword Arguments:
    n:            number of observations
    x_min, x_max: limits of the preditor's range (x_min <= x <= x_max)
    slope:        slope of the true linear model
    intercept:    intercept of the true linear model
    e_mean:       error term mean
    e_std:        error term standard deviation
    
    Output:
    X: predictor values
    Y: response values
    """
    
    X = [uniform(x_min,x_max) for _ in range(n)]
    Y = [intercept + slope * x + normalvariate(e_mean,e_std) for x in X]
    
    return X, Y
    
def mean(L):
    """Returns the mean of the elements of a list."""
    return sum(L)/len(L)

def linear_regression(X, Y):
    """
    Simple linear regression Y on X.
    
    Keyword Arguments:
    X: the predictor
    Y: the response
    
    Output:
    slope:        slope of the line of best fit
    intercept:    intercept of the line of best fit
    RSS:          residual sum of squares of the model
    slope_SE:     standard error of the slope
    intercept_SE: standard error of the intercept
    slope_CI:     95% confidence interval of the slope
    intercept_CI: 95% condidence interval of the intercept
    t_statistic:  
    """
    
    n = len(X)
    X_mean, Y_mean = mean(X), mean(Y)
    
    slope_numerator   = sum([(x - X_mean) * (y - Y_mean) for x, y in zip(X,Y)])
    slope_denominator = sum([(x - X_mean)**2 for x in X])
    
    slope     = slope_numerator / slope_denominator
    intercept = Y_mean - slope * X_mean
    
    RSS = sum([(y - (intercept + slope * x))**2 for x, y in zip(X,Y)])
    RSE = sqrt(RSS / (n - 2))
    
    
    slope_SE     = sqrt((RSE**2) * ((1/n) + (X_mean**2 / slope_denominator)))
    intercept_SE = sqrt((RSE**2) / slope_denominator)
    
    slope_CI = [slope - 2 * slope_SE, slope + 2 * slope_SE]
    intercept_CI = [intercept - 2 * intercept_SE, intercept + 2 * intercept_SE]
    
    t_statistic = slope / slope_SE # bug when slope_SE = 0
 
    return slope, intercept, RSS, slope_SE, intercept_SE, slope_CI, intercept_CI, t_statistic

In [16]:
#linear_regression([0,1,2,3,4],[1,3,5,7,9])

X, Y = generate_dataset(10,-10,10,1,0,0,0.5)

linear_regression(X,Y)

(0.9703115377642096,
 0.25703931973716077,
 0.8150120618974365,
 0.10249384261342785,
 0.016385321343671346,
 [0.7653238525373539, 1.1752992229910653],
 [0.22426867704981807, 0.28980996242450346],
 9.46702273056438)

In [17]:
help(mean)
help(linear_regression)
help(generate_dataset)

Help on function mean in module __main__:

mean(L)
    Returns the mean of the elements of a list.

Help on function linear_regression in module __main__:

linear_regression(X, Y)
    Simple linear regression Y on X.
    
    Keyword Arguments:
    X: the predictor
    Y: the response
    
    Output:
    slope:        slope of the line of best fit
    intercept:    intercept of the line of best fit
    RSS:          residual sum of squares of the model
    slope_SE:     standard error of the slope
    intercept_SE: standard error of the intercept
    slope_CI:     95% confidence interval of the slope
    intercept_CI: 95% condidence interval of the intercept
    t_statistic:

Help on function generate_dataset in module __main__:

generate_dataset(n, x_min, x_max, slope, intercept, e_mean, e_std)
    Generates a dataset according to the input parameters
    
    Keyword Arguments:
    n:            number of observations
    x_min, x_max: limits of the preditor's range (x_min <= x <= x