# Models from Scratch: Linear Regression 
## *Implementation*
***

In [19]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [20]:
class LinearRegression():
    """Ordinary least-squares linear regression on a fixed data set.

    The model is y ~ X . b with no implicit intercept: include a column of
    ones in X if an intercept term is wanted.
    """

    def __init__(self, X, y):
        """ Initialize a Linear Regression object

            Arguments:
            X -- Design matrix (Pandas dataframe, numpy array, 2D list)
            y -- Vector of response variable (Pandas series, numpy array, list)

            Raises:
            ValueError -- if X and y differ in length or X is not two-dimensional
        """
        if len(X) != len(y):
            raise ValueError("Dimensional error: {} != {}".format(len(X), len(y)))

        self.X = np.asarray(X)
        self.y = np.asarray(y)
        if self.X.ndim != 2:
            raise ValueError(
                "X must be two-dimensional, got {} dimension(s)".format(self.X.ndim))

        # Take both sizes from the converted array: indexing the raw argument
        # with X[0] is a column lookup (not a row) when X is a DataFrame.
        self.n, self.p = self.X.shape  # Observations, predictors
        self.coefficients = None  # Coefficients to be estimated

    def _require_fit(self):
        """Raise RuntimeError if the coefficients have not been estimated yet."""
        if self.coefficients is None:
            raise RuntimeError("Model is not fitted yet: call fit() first")

    def RSS(self, b):
        """Residual sum of squares as a function of the beta vector (used for estimation)"""
        residuals = self.y - np.dot(self.X, b)
        return np.linalg.norm(residuals)**2

    def TSS(self):
        """Returns the total sum of squares for this model"""
        return np.sum((self.y - np.mean(self.y))**2)

    def MSE(self):
        """Returns the mean squared error (RSS / n) of the fitted model"""
        self._require_fit()
        return self.RSS(self.coefficients) / self.n

    def r_squared(self):
        """Returns the r squared value (1 - RSS / TSS) for the fitted model"""
        self._require_fit()
        return 1 - self.RSS(self.coefficients) / self.TSS()

    def fit(self, method='matrix'):
        """ Fits the data to the linear regression model by estimating the coefficients.

            Keyword Arguments:
            method -- 'matrix' (default) solves the normal equations
                      (X^T X) b = X^T y directly; 'minimize' minimizes RSS
                      numerically with scipy.optimize.minimize, starting
                      from the zero vector

            Raises:
            ValueError -- if method is neither 'matrix' nor 'minimize'
            numpy.linalg.LinAlgError -- if X^T X is singular ('matrix' only)
        """
        if method == 'matrix':
            X_trans = np.transpose(self.X)
            gram = np.dot(X_trans, self.X)
            moment = np.dot(X_trans, self.y)

            # Solve the linear system instead of forming the explicit inverse:
            # same solution, but cheaper and numerically more stable.
            self.coefficients = np.linalg.solve(gram, moment)

        elif method == 'minimize':
            # minimize RSS with respect to beta
            guess = np.zeros((self.p, ))
            result = minimize(self.RSS, guess)
            self.coefficients = result.x
        else:
            raise ValueError("Invalid method argument: {}".format(method))

    def predict(self, x):
        """ Predicts y given an observation vector x

            Arguments:
            x -- One observation (sequence of p values) or a 2D array-like
                 with one observation per row

            Returns:
            The predicted value for a single observation, or an array of
            predictions (one per row) for 2D input

            Raises:
            RuntimeError -- if the model has not been fitted
            ValueError -- if x does not hold one value per coefficient
        """
        self._require_fit()
        x = np.asarray(x)
        n_coef = len(self.coefficients)
        # Reject mismatched input explicitly: silently dropping the surplus
        # terms would return a wrong prediction without any warning.
        if x.ndim == 0 or x.shape[-1] != n_coef:
            raise ValueError(
                "Expected {} values per observation, got input of shape {}".format(
                    n_coef, x.shape))
        return np.dot(x, self.coefficients)

## *Application*
***

In [21]:
# Load the Credit data set; the first CSV column becomes the row index
df = pd.read_csv('./data/Credit.csv', index_col=0)
# Replace Income with its natural logarithm (the response modelled below is log-income)
df['Income'] = np.log(df['Income'])
# Preview the first rows
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
1,2.700757,3606,283,2,34,11,Male,No,Yes,Caucasian,333
2,4.663675,6645,483,3,82,15,Female,Yes,Yes,Asian,903
3,4.650077,7075,514,4,71,11,Male,No,No,Asian,580
4,5.003436,9504,681,3,36,11,Female,No,No,Asian,964
5,4.023242,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [22]:
# Keep only the quantitative columns; these are the predictors for income.
# The copy makes df2 independent of df, so later edits to df2 leave df untouched.
df2 = df.loc[:, [
    'Limit',
    'Rating',
    'Cards',
    'Age',
    'Education',
]].copy()

In [23]:
# The model has no built-in intercept, so prepend a constant column of 1s
ones = np.ones(len(df2), dtype=int)
df2.insert(loc=0, column='Ones', value=ones)

In [24]:
# Design matrix X: one row per observation, one column per predictor
# (the leading column is the constant 'Ones' term)
X = np.asarray(df2)

# Response vector y: the log-transformed income
y = np.asarray(df.loc[:, 'Income'])

In [25]:
# Build the model on the full data set and estimate the coefficients
# with the closed-form (normal equations) solver
model = LinearRegression(X, y)
model.fit(method='matrix')

# Goodness of fit on the training data
print('R^2:', model.r_squared())
print('RMSE:', np.sqrt(model.MSE()))

R^2: 0.543042632219
RMSE: 0.46682799021


In [26]:
# Predict from the actual first row of the design matrix rather than a
# hand-copied feature vector, so the comparison stays correct if the
# chosen predictors or the data change.
print("Predicted value for the first training observation: {}".format(model.predict(X[0])))
print("Actual value for the first training observation: {}".format(y[0]))

Predicted value for the first training observation: 3.2850271887009606
Actual value for the first training observation: 2.700757003608068
