# Step 1: Import libraries and load in data

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline

In [12]:
# Relative path to the Ames housing data; the file is tab-separated (read with sep='\t' in the next cell).
datafile = "data/Ames_Housing_Data.tsv"

In [173]:
# Load the tab-delimited housing file into a DataFrame.
df = pd.read_csv(datafile, delimiter="\t")

In [14]:
# Size of the raw data as (rows, columns).
df.shape

(2930, 82)

In [15]:
# Count columns per dtype: 43 object (categorical) columns and
# 39 numeric columns (28 int64 + 11 float64).
df.dtypes.value_counts()

object     43
int64      28
float64    11
dtype: int64

In [16]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [39]:
# Add a constant column of ones so the intercept can be estimated as an ordinary
# coefficient, then keep only the model columns (four predictors + the target).
# `assign` returns a new frame instead of writing into the existing one, so
# re-running this cell on the already-subset frame cannot trigger pandas'
# SettingWithCopyWarning.
model_columns = ['y-intercept', 'Lot Area', 'Overall Qual', '1st Flr SF', '2nd Flr SF', 'SalePrice']
df = df.assign(**{'y-intercept': 1})[model_columns]

In [18]:
# Preview the reduced frame.
# NOTE(review): the saved output below has no 'y-intercept' column and carries an
# earlier execution count (In [18]) than the cell that adds it (In [39]) —
# it looks stale; re-run to refresh.
df.head()

Unnamed: 0,Lot Area,Overall Qual,1st Flr SF,2nd Flr SF,SalePrice
0,31770,6,1656,0,215000
1,11622,5,896,0,105000
2,14267,6,1329,0,172000
3,11160,7,2110,0,244000
4,13830,5,928,701,189900


Here we can see that we are using 4 features (`Lot Area`, `Overall Qual`, `1st Flr SF`, `2nd Flr SF`), plus a constant `y-intercept` column, to predict the target y, `SalePrice`.

# Using Moore-Penrose Pseudoinversion

In [40]:
# Design matrix: the constant column followed by the four predictors.
design_columns = ['y-intercept', 'Lot Area', 'Overall Qual', '1st Flr SF', '2nd Flr SF']
x = np.array(df[design_columns])

# Target vector.
y = np.array(df['SalePrice'])

In [76]:
# Imported here because the notebook's scikit-learn import cell appears further
# down; without this line the cell raises NameError on a fresh Restart & Run All.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=6
)

In [87]:
# Singular value decomposition of the training design matrix.
# full_matrices=True is NumPy's default and yields the square U that the
# later padding step relies on; d holds the singular values as a 1-D array.
U, d, VT = np.linalg.svd(X_train, full_matrices=True)

In [138]:
# U is square, with one row/column per training sample.
U.shape

(2197, 2197)

In [88]:
# Place the singular values on the main diagonal of a square matrix.
D = np.diag(d, k=0)

In [98]:
# Inspect the diagonal matrix of singular values.
D

array([[6.04842974e+05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 3.43238370e+04, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.11830046e+04, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.90659234e+01,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.02164036e+01]])

In [89]:
# Invert the diagonal singular-value matrix entry by entry instead of running a
# general matrix inversion. Singular values at round-off level are mapped to 0
# (the pseudoinverse convention) rather than raising LinAlgError or producing
# huge reciprocals when the design matrix is rank-deficient.
singular_values = np.diag(D)
cutoff = np.finfo(float).eps * max(D.shape) * singular_values.max(initial=0.0)
reciprocals = np.zeros(singular_values.shape)
significant = singular_values > cutoff
reciprocals[significant] = 1.0 / singular_values[significant]
D_inver = np.diag(reciprocals)

In [137]:
# The inverted block is square, one row/column per singular value.
D_inver.shape

(5, 5)

In [163]:
# Pad the inverted singular-value block with zero columns so that D+ has one
# column per row of U and can multiply U.T. The sizes are read from the arrays
# instead of being hard-coded, so the cell keeps working if the train/test
# split or the number of features changes.
n_rows = D_inver.shape[0]
n_pad = U.shape[0] - n_rows
Dplus = np.concatenate((D_inver, np.zeros((n_rows, n_pad))), axis=1)

In [164]:
# D+ should have as many columns as U has rows.
Dplus.shape

(5, 2197)

In [165]:
# Intermediate product D+ · Uᵀ (the right-hand part of the pseudoinverse).
Dplus @ U.T

array([[ 2.20640650e-08,  2.37201840e-08,  2.07039055e-08, ...,
         3.24442988e-08,  3.18322005e-08,  2.57258363e-08],
       [-4.88939198e-07, -1.57763147e-07, -5.31377341e-07, ...,
        -3.33060513e-07,  1.05131969e-07, -6.94226315e-07],
       [ 6.74738588e-07, -1.18227222e-06,  6.76606231e-07, ...,
         7.61036984e-07, -1.14446960e-06,  8.61073851e-07],
       [ 1.04135292e-04, -2.33156823e-04, -3.00545234e-04, ...,
         4.19760229e-04, -3.17600525e-04,  2.50701648e-05],
       [-1.86320984e-03, -1.06103725e-03,  5.79470876e-04, ...,
        -2.82571625e-03, -1.01500219e-03,  9.82789311e-04]])

In [166]:
# Pseudoinverse assembled from the SVD factors: A+ = V · D+ · Uᵀ.
# The parentheses keep the same multiplication order as the nested np.dot calls.
Aplus = VT.T @ (Dplus @ U.T)

In [171]:
# Inspect the hand-built pseudoinverse of X_train.
Aplus

array([[ 1.83457237e-03,  1.08214821e-03, -5.36254227e-04, ...,
         2.74878467e-03,  1.04729558e-03, -9.77918858e-04],
       [-2.43789729e-08,  7.11315915e-09, -1.33738411e-08, ...,
        -9.86668462e-09,  3.79631763e-08, -2.44326667e-08],
       [-3.41670974e-04,  9.54818717e-05,  3.72215144e-04, ...,
        -7.77849546e-04,  1.85119198e-04,  1.00884703e-04],
       [ 9.13149561e-07, -1.27512040e-06, -6.38336120e-07, ...,
         2.11909699e-06, -1.90055415e-06,  1.08164101e-06],
       [-3.15820601e-07,  6.28420099e-07, -1.24785993e-06, ...,
         3.05410162e-07,  3.33792733e-07, -6.17845573e-07]])

In [172]:
# Least-squares weights from the hand-built pseudoinverse: w = A+ · y.
Aplus @ y_train

array([-1.23768619e+05,  6.02692703e-01,  3.11270486e+04,  8.11514125e+01,
        4.46045201e+01])

In [168]:
# Same weights using NumPy's built-in pseudoinverse, as a cross-check.
w = np.linalg.pinv(X_train) @ y_train

In [169]:
# Weights from np.linalg.pinv; compare with the hand-built result above.
w

array([-1.23768619e+05,  6.02692703e-01,  3.11270486e+04,  8.11514125e+01,
        4.46045201e+01])

In [68]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import KFold, cross_val_predict, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

In [83]:
# Fit scikit-learn's ordinary least squares on the same training data.
# fit_intercept=True is the library default: the model estimates its own
# intercept, so the constant 'y-intercept' column in X_train ends up with a
# zero coefficient.
LR = LinearRegression(fit_intercept=True)
LR.fit(X=X_train, y=y_train)

LinearRegression()

In [84]:
# Slopes match the pseudoinverse weights; the first entry (constant column) is 0.
# NOTE(review): the intercept is presumably held in LR.intercept_ instead — confirm it matches w[0].
LR.coef_

array([0.00000000e+00, 6.02692703e-01, 3.11270486e+04, 8.11514125e+01,
       4.46045201e+01])