## Least Squares

In [1]:
import pandas as pd
import numpy as np

### loading the data

In [2]:
# Prostate data file hosted alongside the ElemStatLearn datasets.
# The file is tab-separated and its first column is used as the row index.
data = pd.read_csv(
    'https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data',
    sep='\t',
    index_col=0,
)

### separating into training and test sets

In [3]:
# The 'train' column flags each row: 'T' rows form the training split and
# 'F' rows form the test split.
in_train = data['train'] == 'T'
in_test = data['train'] == 'F'
train = data.loc[in_train]
test = data.loc[in_test]

# 'lpsa' is the regression target and 'train' is only the split flag, so
# neither of them is kept among the features.
dropped = ['lpsa', 'train']
X_train, y_train = train.drop(columns=dropped), train['lpsa']
X_test, y_test = test.drop(columns=dropped), test['lpsa']

### Normalizing the data

In [4]:
# Standardize every feature column using statistics computed on the TRAINING
# split only; the test split is transformed with those same statistics.
#
# The reductions are called on the DataFrame with an explicit axis because
# np.mean(DataFrame) forwards axis=None to DataFrame.mean, which pandas >= 2.0
# interprets as "reduce over both axes" and returns a single scalar instead of
# one mean per column.
mean = X_train.mean(axis=0)
# ddof=0 (population standard deviation) matches np.std's default.
std = X_train.std(axis=0, ddof=0)

# Rebind rather than mutate in place; Series-to-DataFrame arithmetic aligns
# the per-column statistics with the matching column labels.
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

### Into numpy arrays

In [6]:
# Prepend a constant column of ones so that the first fitted coefficient acts
# as the intercept.  DataFrame.insert raises ValueError when the column is
# already present, so the guard keeps this cell safe to run more than once.
for frame in (X_train, X_test):
    if 'bias' not in frame.columns:
        frame.insert(0, 'bias', 1)

# NumPy copies of the data for the hand-written linear algebra.
X = np.array(X_train)
y = np.array(y_train).reshape(-1, 1)             # column vector, shape (n, 1)
test_features = np.array(X_test)
test_target = np.array(y_test).reshape(-1, 1)    # column vector, shape (m, 1)
Xt = X.T

### Linear Regression with sklearn
to use as a comparison

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Reference fit used to cross-check the hand-computed coefficients.
# fit_intercept=False: the design matrix already carries an explicit 'bias'
# column, so sklearn must not add an intercept of its own.
lr = LinearRegression(fit_intercept=False)
lr.fit(X_train, y_train)

# Reference predictions on both splits.
train_predict = lr.predict(X_train)
test_predict = lr.predict(X_test)

### Finding the coefficients
$\hat{\beta}=(X^{T}X)^{-1}X^{T}y$

In [8]:
# Normal equations: (X^T X) beta = X^T y.
gram = Xt @ X        # X^T X, shape (p, p)
moment = Xt @ y      # X^T y, shape (p, 1)

# Solve the linear system directly instead of forming (X^T X)^{-1} and
# multiplying: the solution is the same, but this avoids the explicit inverse,
# which is more expensive and numerically less stable.
coeff = np.linalg.solve(gram, moment)   # column vector, shape (p, 1)

### Making predictions based off the coefficients
$\hat{y}=X\hat{\beta}$

In [9]:
# Predicted test-set responses: test design matrix times the coefficient vector.
y_hat = test_features @ coeff

### Mean Squared Error
$MSE=\dfrac{1}{n}\sum\limits_{i=1}^{n}(y_{i}-\hat{y}_{i})^{2}$

In [10]:
def _mse(actual, predicted):
    """Return the sum of squared differences divided by the row count of `actual`.

    Both arguments are expected to be arrays of the same shape.
    """
    return (1 / actual.shape[0]) * np.sum((actual - predicted) ** 2)


# Predictions from the hand-computed coefficients on both splits.
predict_train = np.dot(X, coeff)
predict_test = np.dot(test_features, coeff)

MSE_train = _mse(y, predict_train)
MSE_test = _mse(test_target, predict_test)

# sklearn's documented argument order is mean_squared_error(y_true, y_pred).
# The metric is symmetric, so the printed values do not depend on the order,
# but the call now follows the documented signature.
print('MSE on the training set:', MSE_train)
print('MSE on train with sklearn:', mean_squared_error(y_train, train_predict))
print('MSE on the test set:', MSE_test)
print('MSE on test with sklearn:', mean_squared_error(y_test, test_predict))

MSE on the training set: 0.43919976805833433
MSE on train with sklearn: 0.43919976805833433
MSE on the test set: 0.5212740055076007
MSE on test with sklearn: 0.5212740055076


In [11]:
# Side-by-side table with one row per design-matrix column, comparing the
# coefficients from the normal equations with the ones fitted by sklearn.
show = pd.DataFrame({
    'variables': X_train.columns,
    'coefficients': coeff.ravel(),   # flatten the (p, 1) column vector to 1-D
    'sklearn coef': lr.coef_,
})
show

Unnamed: 0,variables,coefficients,sklearn coef
0,bias,2.452345,2.452345
1,lcavol,0.711041,0.711041
2,lweight,0.29045,0.29045
3,age,-0.141482,-0.141482
4,lbph,0.21042,0.21042
5,svi,0.3073,0.3073
6,lcp,-0.286841,-0.286841
7,gleason,-0.020757,-0.020757
8,pgg45,0.275268,0.275268
