# Homework 1
## 2.1.a Linear Regression Model

**Import pandas and numpy for data manipulation**

Here we load the training and test sets of the `house` data set (features `X` and targets `y`) and print their shapes to see how much data we have.

In [10]:
import pandas as pd
import numpy as np

def read_dataset(feature_file, label_file):
    '''Load a feature CSV and a label CSV and return both as NumPy arrays.

    Each file is parsed with ``pd.read_csv`` (default options) and the
    resulting DataFrame is reduced to its underlying ``.values`` array.

    Returns:
        (X, y): the feature array followed by the label array.
    '''
    # Both files get the same treatment: parse, then keep only the raw ndarray.
    X, y = (pd.read_csv(path).values for path in (feature_file, label_file))
    return X, y

# Where the CSV files live and which data set to use. Every file is named
# <folder><file_id>_<X|y>_<train|test>.csv.
folder = ''  # set to e.g. 'hw_data/linear_regression/' if the data sits in a subfolder
file_id = 'house'

# Training split: features and labels.
xtr_name = ''.join((folder, file_id, '_X_train.csv'))
ytr_name = ''.join((folder, file_id, '_y_train.csv'))
X_train, y_train = read_dataset(xtr_name, ytr_name)

# Test split: features and labels.
xte_name = ''.join((folder, file_id, '_X_test.csv'))
yte_name = ''.join((folder, file_id, '_y_test.csv'))
X_test, y_test = read_dataset(xte_name, yte_name)

# Report the dimensions of each loaded array.
for _caption, _array in (
    ('X_train shape: ', X_train),
    ('y_train shape: ', y_train),
    ('X_test shape: ', X_test),
    ('y_test shape: ', y_test),
):
    print(_caption, _array.shape)

X_train shape:  (379, 12)
y_train shape:  (379, 1)
X_test shape:  (127, 12)
y_test shape:  (127, 1)


**Normalize the features and train the model**



In [11]:
def normalize_features(X_train, X_test):
    '''Standardize the train and test features with one shared scaler.

    A ``StandardScaler`` is fitted on ``X_train`` only; the same fitted
    scaler is then applied to ``X_test`` so that both splits are scaled
    with the training-set statistics.

    Returns:
        (X_train_norm, X_test_norm): the transformed training and test arrays.
    '''
    from sklearn.preprocessing import StandardScaler

    standardizer = StandardScaler()
    # fit_transform = fit on X_train, then transform X_train, in one call.
    train_scaled = standardizer.fit_transform(X_train)
    # The test split is only transformed, never used for fitting.
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled

# Scale both splits; the scaler inside normalize_features is fitted on X_train.
X_train_norm, X_test_norm = normalize_features(X_train, X_test)

def LR_model(X_train, y_train, X_test, y_test):
    '''Fit a scikit-learn ``LinearRegression`` and predict on the test features.

    Args:
        X_train, y_train: data the model is fitted on.
        X_test: features the fitted model predicts for.
        y_test: accepted but not used by this function.

    Returns:
        (y_pred, coeffs, intercept): predictions for ``X_test`` followed by
        the fitted model's ``coef_`` and ``intercept_`` attributes.
    '''
    from sklearn.linear_model import LinearRegression

    # fit() returns the estimator itself, so construction and fitting chain.
    fitted = LinearRegression().fit(X_train, y_train)
    return fitted.predict(X_test), fitted.coef_, fitted.intercept_

# Fit the plain linear regression on the normalized features.
lr_y_pred, lr_coeffs, lr_intercept = LR_model(
    X_train_norm, y_train, X_test_norm, y_test
)

# Show the learned coefficients, then the intercept, one per line.
print(lr_coeffs, lr_intercept, sep='\n')

def RMSE(y_pred, y_test):
    '''Root-mean-square error between predictions and targets.

    Args:
        y_pred: array-like of predicted values.
        y_test: array-like of true values; must have the same shape as
            ``y_pred``.

    Returns:
        The square root of the mean squared difference, with the mean taken
        over axis 0: a scalar for 1-D input, or an array holding one value
        per column for 2-D input (e.g. shape ``(1,)`` for ``(n, 1)`` input).

    Raises:
        ValueError: if the two inputs differ in shape or are empty.
    '''
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    if y_pred.shape != y_test.shape:
        # Without this check NumPy would silently broadcast: (n,) against
        # (n, 1) yields an (n, n) matrix and a meaningless "error".
        raise ValueError(
            'y_pred and y_test must have the same shape, got %s and %s'
            % (y_pred.shape, y_test.shape)
        )
    if y_pred.size == 0:
        raise ValueError('RMSE is undefined for empty input')
    diff = y_pred - y_test
    # np.mean over axis 0 replaces builtin sum(...)/n, which iterated over
    # the array in Python; the result shape is unchanged.
    return np.sqrt(np.mean(diff * diff, axis=0))

# Test-set error of the plain linear regression, shown with three decimals.
print('RMSE LR: %.3f' % (RMSE(lr_y_pred, y_test),))

[[-0.94680434  1.07013479 -0.07673632  0.65256803 -1.91225543  2.50105885
  -0.09761049 -3.06137125  1.84837216 -1.94588004 -2.09785905 -3.86696361]]
[22.60870712]
RMSE LR: 5.524


## LR Predictor:
$p_c(x) = c^Tx = 22.609 - 0.9468x_1 + 1.070 x_2 - 0.0767 x_3 + 0.653 x_4 - 1.912 x_5 +  2.501 x_6 - 0.0976 x_7 - 3.061 x_8 + 1.848 x_9 - 1.946 x_{10} - 2.0979 x_{11} - 3.867 x_{12}$


In [12]:
def Ridge_LR_model(X_train, y_train, X_test, y_test, alpha, random_state=0):
    '''Fit a ridge (Tikhonov-regularized) regression and predict on the test features.

    Ridge regression penalizes large coefficients to reduce overfitting.

    Args:
        X_train, y_train: data the model is fitted on.
        X_test: features the fitted model predicts for.
        y_test: accepted but not used by this function.
        alpha: regularization strength forwarded to ``Ridge``.
        random_state: seed forwarded to ``Ridge``. The 'saga' solver is
            stochastic, so a fixed seed makes repeated runs give the same
            coefficients. Pass ``None`` for the previous unseeded behaviour.

    Returns:
        (y_pred, coeffs, intercept): predictions for ``X_test`` followed by
        the fitted model's ``coef_`` and ``intercept_`` attributes.
    '''
    from sklearn.linear_model import Ridge  # Tikhonov regularization

    # Seed the stochastic solver so that results are reproducible.
    myRidge = Ridge(alpha=alpha, solver='saga', random_state=random_state)
    myRidge.fit(X_train, y_train)
    y_pred = myRidge.predict(X_test)
    coeffs = myRidge.coef_
    intercept = myRidge.intercept_
    return y_pred, coeffs, intercept

# Sweep the regularization strength from very strong to almost none and
# report the fitted parameters and the test RMSE for each value.
for alpha in [1e6, 1e4, 10, 1, 1e-1, 1e-3, 1e-5]:
    ridge_y_pred, ridge_coeffs, ridge_intercept = Ridge_LR_model(
        X_train_norm, y_train, X_test_norm, y_test, alpha
    )
    print('Alpha: ', alpha)
    print('coeffs: ', ridge_coeffs)
    print('inter: ', ridge_intercept)
    print('RMSE Ridge: %.3f' % (RMSE(ridge_y_pred, y_test),))
    print()  # blank line between settings

Alpha:  1000000.0
coeffs:  [[-0.00127328  0.00137247 -0.0018363   0.00054036 -0.00156882  0.00243561
  -0.00142592  0.00094756 -0.00143139 -0.0017578  -0.00193653 -0.00262877]]
inter:  [22.60870712]
RMSE Ridge: 9.036

Alpha:  10000.0
coeffs:  [[-0.11560199  0.11120815 -0.14944465  0.05162853 -0.12510401  0.21856883
  -0.11354514  0.06665489 -0.11438094 -0.14359153 -0.17015473 -0.22912673]]
inter:  [22.60870712]
RMSE Ridge: 8.431

Alpha:  10
coeffs:  [[-0.88112184  0.91104557 -0.28128664  0.68036997 -1.60706267  2.5958823
  -0.16099884 -2.72188247  1.26137323 -1.4115571  -2.03525838 -3.68849267]]
inter:  [22.60870712]
RMSE Ridge: 5.584

Alpha:  1
coeffs:  [[-0.93646105  1.04666649 -0.11689654  0.65793216 -1.87202795  2.51539236
  -0.1080716  -3.02323198  1.743768   -1.84151738 -2.08942741 -3.8461857 ]]
inter:  [22.60870712]
RMSE Ridge: 5.532

Alpha:  0.1
coeffs:  [[-0.94368071  1.06413321 -0.0927244   0.65451306 -1.90448831  2.50456347
  -0.10081502 -3.05743072  1.8111704  -1.90483003 -

## Varying Alpha

When alpha is too large, the penalty shrinks the coefficients toward zero and the model becomes less accurate (test RMSE of 9.036 at alpha = 1e6). For alpha of about 10 or less, however, the ridge model performs approximately the same as the ordinary linear regression. In fact, it does slightly worse, with an RMSE of 5.526 versus 5.524 for the non-ridge regression. The data appear to be well behaved, so the ridge penalty has little influence on the final result.

## Ridge Predictor:

$\alpha = 1.0$

$p_c(x) = c^Tx = 22.609 - 0.936 x_1 + 1.047 x_2 - 0.117 x_3 + 0.658 x_4 - 1.872 x_5 + 2.515 x_6 - 0.108 x_7 - 3.023 x_8 + 1.744 x_9 - 1.842 x_{10} - 2.089 x_{11} - 3.846 x_{12}$

## LR Predictor:
$p_c(x) = c^Tx = 22.609 - 0.9468x_1 + 1.070 x_2 - 0.0767 x_3 + 0.653 x_4 - 1.912 x_5 +  2.501 x_6 - 0.0976 x_7 - 3.061 x_8 + 1.848 x_9 - 1.946 x_{10} - 2.0979 x_{11} - 3.867 x_{12}$