# Test Linear Regression with ADAM

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

In [2]:
import sys
sys.path.insert(1, 'Modules/')
from Evaluation_Metric import Metric_regression
from Linear_Regression_ADAM import ADAM, ADAM_learning_rate_decay, ADAM_learning_rate_decay_full_train

## 1. Loading data set
The Diabetes dataset is loaded as a toy dataset. 
The different models are compared based on the diabetes dataset. 

Additional information are referred to: https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset (accessed 18th July 2022)


In [3]:
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

## 2. Standardization of the dataset

In [4]:
X = X - X.mean(axis=0)
X = X / X.var(axis=0)**0.5

## 3. Splitting data in train and test
The models are trained based on the training data. The performance evaluation is performed on the test data. 
Therefore, the data is splitted in train (90 %) and test (10 %).

In [5]:
n_train = int(y.shape[0] * 0.9)
X_train = X[:n_train,:]
y_train = y[:n_train]
X_test =  X[n_train:,:]
y_test = y[n_train:]

## 4. Fitting models to train
First, the reference Linear Regression with analytical solution is performed using the modul **Scikit-learn**.

In [6]:
print('1. Fit direct Linear Regression (Sklearn)')
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_sklearn = np.dot(X_test, lin_reg.coef_) + lin_reg.intercept_

1. Fit direct Linear Regression (Sklearn)


### Setting the maximum of iteration
4 models based on Stochastic Gradient Descent (SGD) with ADAM are fitted.
The maximum amount of Epoch is set for all SGD models.

In [7]:
max_epoch = 1E4
model_list = []

### SGD with ADAM

In [8]:
Gradient_descent_1 = ADAM(max_epoch=max_epoch, eta=.1, batch_size=1)
print('2. {} on train:'.format(Gradient_descent_1.name))
Gradient_descent_1(X_train, y_train) 
model_list.append(Gradient_descent_1)

2. SGD with Adam on train:
Epoch: 1000 | MSE_train: 3005.56
Epoch: 2000 | MSE_train: 3020.51
Epoch: 3000 | MSE_train: 3015.60
Epoch: 4000 | MSE_train: 3007.81
Epoch: 5000 | MSE_train: 3008.93
Epoch: 6000 | MSE_train: 3008.53
Epoch: 7000 | MSE_train: 3009.11
Epoch: 8000 | MSE_train: 3008.04
Epoch: 9000 | MSE_train: 3011.54
Epoch: 10000 | MSE_train: 3020.46
##################################################


### SGD with ADAM and learning rate decay

In [9]:
Gradient_descent_2 = ADAM_learning_rate_decay(max_epoch=max_epoch, eta=.1, batch_size=1, patience=1E2)
print('3. {} on train:'.format(Gradient_descent_2.name))
Gradient_descent_2(X_train, y_train)
model_list.append(Gradient_descent_2)

3. SGD with ADAM and learning rate decay on train:
New learning rate: 0.032
New learning rate: 0.01
New learning rate: 0.0032
New learning rate: 0.001
New learning rate: 0.00032
New learning rate: 0.0001
New learning rate: 3.2e-05
New learning rate: 1e-05
##################################################


### SGD with ADAM and learning rate decay (evaluation on validation set)

In [10]:
Gradient_descent_2_2 = ADAM_learning_rate_decay(max_epoch=max_epoch, eta=.1, batch_size=1, patience=1E2)#
print('4. {} on subtrain:'.format(Gradient_descent_2_2.name))
split = int(X_train.shape[0]*0.8)
Gradient_descent_2_2(X_train[:split], y_train[:split],
                     X_train[split:], y_train[split:]
                     )
model_list.append(Gradient_descent_2_2)

4. SGD with ADAM and learning rate decay on subtrain:
New learning rate: 0.032
New learning rate: 0.01
New learning rate: 0.0032
New learning rate: 0.001
New learning rate: 0.00032
New learning rate: 0.0001
New learning rate: 3.2e-05
New learning rate: 1e-05
##################################################


### SGD with ADAM and learning rate decay (full training)

In [11]:
Gradient_descent_3 = ADAM_learning_rate_decay_full_train(max_epoch=max_epoch, eta=.1, batch_size=1, patience=1E2)
print('6. {} on subtrain and train, respectively:'.format(Gradient_descent_3.name))
Gradient_descent_3(X_train, y_train)
model_list.append(Gradient_descent_3)


6. SGD with ADAM and learning rate decay and full training on subtrain and train, respectively:
New learning rate: 0.032
New learning rate: 0.01
New learning rate: 0.0032
New learning rate: 0.001
New learning rate: 0.00032
New learning rate: 0.0001
New learning rate: 3.2e-05
New learning rate: 1e-05
Start full training!
New learning rate: 0.032
New learning rate: 0.01
New learning rate: 0.0032
New learning rate: 0.001
New learning rate: 0.00032
New learning rate: 0.0001
New learning rate: 3.2e-05
New learning rate: 1e-05
##################################################


## 5. Evaluation of the models
### Comparing the weights and bias

In [12]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
compare_weights = pd.DataFrame()
compare_weights['Sklearn'] = np.append(lin_reg.intercept_ , lin_reg.coef_ )

for SGD_model in model_list:
    column_name = SGD_model.name
    if column_name == 'SGD with Adam':
        w = SGD_model.theta
        w_0 = SGD_model.theta_0
    
    else:
        w = SGD_model.best_theta
        w_0 = SGD_model.best_theta_0
    compare_weights[column_name] = np.append(w_0, w)
pd.set_option('display.precision', 1)
print(compare_weights.T)   

                                                        0        1     2  \
Sklearn                                             152.4 -2.1e-02 -11.3   
SGD with Adam                                       151.8  8.6e-01 -12.2   
SGD with ADAM and learning rate decay               152.0  9.4e-03 -12.6   
SGD with ADAM and learning rate decay and full ...  152.0 -5.7e-02 -12.5   

                                                       3     4     5     6  \
Sklearn                                             24.7  14.5 -35.8  20.4   
SGD with Adam                                       23.6  15.3 -61.0  40.4   
SGD with ADAM and learning rate decay               24.6  13.2 -43.9  25.4   
SGD with ADAM and learning rate decay and full ...  24.5  13.1 -47.5  28.4   

                                                       7     8     9   10  
Sklearn                                              4.7  10.2  32.8  4.6  
SGD with Adam                                       16.5  13.2  44.1  3.2  


### Model Performance
The Mean Squared Error (MSE) on the test data set is compared.
In addition, the increase in MSE is referenced to the analytical solution.

In [13]:
MSE_compare = pd.Series(dtype=float)
ARD_compare = pd.Series(dtype=float) 
MSE_sklearn = Metric_regression().fun_MSE(y_test, y_test_sklearn)
MSE_compare['Sklearn'] = MSE_sklearn
ARD_compare['Sklearn'] = 0
for SGD_model in model_list:
    column_name = SGD_model.name
    MSE_test = SGD_model.MSE(X_test, y_test)
    MSE_compare[column_name] = MSE_test
    ARD_compare[column_name] = -(MSE_test-MSE_sklearn)/MSE_sklearn*100

MSE_DF = MSE_compare.to_frame(name='MSE')
ARD_DF = ARD_compare.to_frame(name='Deviation / %')
performance_DF = pd.concat([ARD_DF, MSE_DF], axis=1)
print(performance_DF)

                                                    Deviation / %     MSE
Sklearn                                                       0.0  1735.9
SGD with Adam                                                 4.2  1662.6
SGD with ADAM and learning rate decay                         5.0  1648.8
SGD with ADAM and learning rate decay and full ...            5.0  1648.9
