# Linear Regression

## $$\mathcal{L}(\beta) = \sum_{i=1}^n \epsilon_i^2 =  (y-X\beta)^T(y-X\beta)$$

In [1]:
from sklearn.datasets import make_regression, load_boston

In [2]:
cd ..

/home/jovyan/Week_9/cap


In [3]:
%run lib/imports.py

%matplotlib inline

In [6]:
# %run lib/models/Linear_Regression.py

In [3]:
from lib.models.Linear_Regression import OLS 

## Using make_regression to create a simple dataset to test the linear regression model on.

In [4]:
# Synthetic regression data: 100 samples, 100 features (10 informative),
# with a fixed seed so the notebook is reproducible.
mr_X, mr_y = make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    random_state=0,
    noise=1,
)

### Visualizing data

In [None]:
# One distribution plot per feature of the synthetic make_regression data,
# laid out on a 20 x 5 grid (100 panels for the 100 generated features).
fig = plt.figure(figsize=(10, 20))

for i in range(mr_X.shape[1]):
    fig.add_subplot(20, 5, i + 1)
    # Select column i (feature i); mr_X[i] would select sample/row i instead.
    sns.distplot(mr_X[:, i])
    plt.title(i + 1)
    plt.yscale('linear')

fig.tight_layout()

In [None]:
# Scatter each feature of the synthetic data against the target,
# one panel per feature on a 20 x 5 grid.
fig = plt.figure(figsize=(10, 20))

for i in range(mr_X.shape[1]):
    fig.add_subplot(20, 5, i + 1)
    # Column i holds feature i for every sample, so it lines up with mr_y;
    # mr_X[i] would be a single sample's feature vector.
    plt.scatter(mr_X[:, i], mr_y)
    plt.title(i + 1)

fig.tight_layout()

## Using the Boston housing dataset from Sklearn

In [None]:
boston_data = load_boston()

In [None]:
boston_data.feature_names

In [None]:
# remove when done
print(boston_data.DESCR)

### Features


| Name   | Description                                                           | Type      |
| ------ |:--------------------------------------------------------------------- | --------- |
|CRIM    | per capita crime rate by town                                         |Continuous |                   
|ZN      | proportion of residential land zoned for lots over 25,000 sq.ft.      |Continuous |
|INDUS   | proportion of non-retail business acres per town                      |Continuous |
|CHAS    | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |Categorical|
|NOX     | nitric oxides concentration (parts per 10 million)                    |Continuous |
|RM      | average number of rooms per dwelling                                  |Continuous |
|AGE     | proportion of owner-occupied units built prior to 1940                |Continuous |
|DIS     | weighted distances to five Boston employment centres                  |Continuous |
|RAD     | index of accessibility to radial highways                             |Categorical|
|TAX     | full-value property-tax rate per \$10,000                             |Continuous |
|PTRATIO | pupil-teacher ratio by town                                           |Continuous |
|B       | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town        |Continuous |
|LSTAT   | % lower status of the population                                      |Continuous |
|MEDV    | Median value of owner-occupied homes in \$1000's                      |Continuous |



#### Feature notes:

* Radial highway - An arterial highway leading to or from an urban center.

In [None]:
# Readable names for the 13 Boston feature columns, listed in the same order
# as the feature table above (CRIM ... LSTAT).
# Abbreviations and acronyms are left capitalized.
# The target (MEDV) is loaded separately from boston_data.target, so it is
# deliberately not part of this list.
columns = [
    'crime',
    'zone',
    'NRB_acres',
    'on_river',
    'NOX',
    'rooms',
    'age',
    'DIS',
    'highway',
    'tax',
    'PT_ratio',  # the comma matters: without it 'PT_ratio' 'B' fuse into one name
    'B',
    'LSTAT',
]

In [None]:
# Feature matrix as a DataFrame, labelled with the names in `columns`.
bos_X = pd.DataFrame(boston_data.data, columns=columns)
# Target values as a single-column DataFrame (no column name is supplied).
bos_y = pd.DataFrame(boston_data.target)

In [None]:
bos_X.shape, bos_y.shape

In [None]:
bos_X.head()

In [None]:
# Preview the first rows of the target, mirroring bos_X.head() above.
bos_y.head()

In [None]:
# Distribution plot for every column of bos_X, arranged on a 4 x 4 grid.
fig = plt.figure(figsize=(15, 10))

# Subplot positions are 1-based, hence start=1.
for i, col in enumerate(bos_X.columns, start=1):
    fig.add_subplot(4, 4, i)
    sns.distplot(bos_X[col])
    plt.title(col)

plt.tight_layout()

In [None]:
sns.pairplot(bos_X)

## Comparing the custom linear regression model to the one in scikit-learn

#### make_regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import numpy as np

In [None]:
mr_X_tr, mr_X_ts, mr_y_tr, mr_y_ts = train_test_split(mr_X, mr_y)

In [None]:
# from lib.models.Linear_Regression import OLS 
sci_lr = LinearRegression()
my_lr = OLS()


In [None]:
sci_lr.fit(mr_X_tr, mr_y_tr)
sci_lr.score(mr_X_ts, mr_y_ts)

In [None]:
my_lr.fit(mr_X_tr, mr_y_tr, reg=True)
my_lr.score(mr_X_ts, mr_y_ts)

In [None]:
np.eye(1)

In [None]:
class OLS():
    """Linear regression solved in closed form, with an optional ridge penalty.

    After ``fit`` the coefficients are stored in ``self.betas``: element 0 is
    the intercept and the remaining elements follow the column order of ``X``.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None, reg=False, alpha=None, deg_freedom=None):
        """Fit a line of best fit to the data using least squares.

        A column of ones is prepended to ``X`` so that the first coefficient
        acts as the intercept.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
            Feature matrix.
        y : array-like, (n_samples,)
            Target values.
        reg : bool, default False
            When truthy, add a ridge (L2) penalty ``alpha * I`` to ``X'X``.
            The penalty is applied to every coefficient, intercept included.
        alpha : float, optional
            Ridge penalty strength. Only used when ``reg`` is truthy;
            defaults to 1.0 when omitted.
        deg_freedom : int, optional
            Size of the identity matrix used for the penalty. Must equal
            ``n_features + 1``; defaults to that value when omitted.

        Returns
        -------
        numpy.ndarray
            The fitted coefficients (also stored in ``self.betas``).
        """
        import numpy as np

        X_ = np.column_stack((np.ones(X.shape[0]), X))

        if reg:
            # Fill in usable defaults so ``fit(X, y, reg=True)`` works instead
            # of failing on ``None * array`` / ``np.eye(None)``.
            if alpha is None:
                alpha = 1.0
            if deg_freedom is None:
                deg_freedom = X_.shape[1]

            penalized_gram = X_.T.dot(X_) + alpha * np.eye(deg_freedom)
            # Solving the linear system is more stable than forming the
            # explicit inverse and multiplying.
            self.betas = np.linalg.solve(penalized_gram, X_.T.dot(y))
        else:
            # lstsq yields the same solution as the normal equations for a
            # full-rank design, and the minimum-norm solution when X'X is
            # singular (e.g. more features than samples) instead of failing.
            self.betas = np.linalg.lstsq(X_, y, rcond=None)[0]

        return self.betas

    # use this score function
    def score(self, X=None, y=None):
        """Return the coefficient of determination R^2 on ``X``, ``y``.

        Predictions are ``betas[0] + X.dot(betas[1:])``; the score is
        ``1 - RSS / TSS``. ``fit`` must have been called first so that
        ``self.betas`` exists.
        """
        y_pred = self.betas[0] + X.dot(self.betas[1:])

        RSS = ((y_pred - y) ** 2).sum()
        TSS = ((y - y.mean()) ** 2).sum()

        R2 = 1 - (RSS / TSS)

        return R2

my_lr = OLS()

In [None]:
from lib.models.Linear_Regression import OLS 

In [None]:
# First figure: one distribution plot per feature of the synthetic data,
# on a 20 x 5 grid (100 panels for the 100 generated features).
fig = plt.figure(figsize=(10, 20))

for i in range(mr_X.shape[1]):
    fig.add_subplot(20, 5, i + 1)
    # Column i is feature i; mr_X[i] would select sample/row i instead.
    sns.distplot(mr_X[:, i])
    plt.title(i + 1)
    plt.yscale('linear')

fig.tight_layout()

# Second figure: each feature scattered against the target.
fig = plt.figure(figsize=(10, 20))

for i in range(mr_X.shape[1]):
    fig.add_subplot(20, 5, i + 1)
    plt.scatter(mr_X[:, i], mr_y)
    plt.title(i + 1)

fig.tight_layout()

In [None]:
def fit_ridge(self, X=None, y=None, alpha=1):
    '''
    Fits a linear regression model with the ridge (L2) penalty.

    A column of ones is prepended to X so the first coefficient is the
    intercept. The coefficients solve (X'X + alpha * I) betas = X'y and are
    stored on ``self.betas``. The penalty is applied to every coefficient,
    intercept included.

    Parameters:

    X: feature matrix with a ``shape`` attribute, (n_samples, n_features)
    y: target values, (n_samples,)
    alpha: ridge penalty strength (default 1)

    Returns:

    self, with ``self.betas`` set.
    '''
    ones = np.ones(X.shape[0])
    X_ = np.column_stack((ones, X))

    # The identity must match X'X in size; np.eye(1) would broadcast and add
    # alpha to every entry of X'X rather than only to its diagonal.
    penalty = alpha * np.eye(X_.shape[1])

    # Store the result on the instance; solving the system avoids forming an
    # explicit inverse.
    self.betas = np.linalg.solve(X_.T.dot(X_) + penalty, X_.T.dot(y))

    return self