In [6]:
##imports from libraries
import pandas as pd
import numpy as np
import time
from sklearn import linear_model

## For “Individual household electric power consumption” dataset

In [20]:
## Load data and preprocessing

## Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
data = pd.read_csv(url, sep=";", low_memory=False, 
                 parse_dates={"timestamp":["Date", "Time"]}, 
                 infer_datetime_format=True, index_col="timestamp")

# Preprocess the data
data = data.dropna()
data = data.astype(float)

# Split the data into training and testing sets
n = data.shape[0]
train_ratio = 0.8
train_index = int(n * train_ratio)
X_train = data[["Global_reactive_power","Voltage","Global_intensity"]][:train_index].to_numpy()
y_train = data[["Global_active_power"]][:train_index].to_numpy().ravel()
X_test = data[["Global_reactive_power","Voltage","Global_intensity"]][train_index:].to_numpy()
y_test = data[["Global_active_power"]][train_index:].to_numpy().ravel()

In [96]:
## Closed form solution and optimal linear regressor

# Define lambda here:
lambd = 0.1 # change the value

# Calculate the closed-form solution here:
def linear_ridge_regression(X, y, lambd):
    n, m = X.shape
    w = np.linalg.inv(X.T @ X + lambd * np.identity(m)) @ X.T @ y
    return w

reg = linear_model.Ridge(alpha=lambd)
start = time.time()
# Find the optimal linear regressor here:
w1 = linear_ridge_regression(X_train, y_train, lambd)
reg.fit(X_train, y_train)
w2 = reg.coef_

end = time.time()

# Evaluate the performance on the test set
y_pred1 = X_test @ w1
y_pred2 = reg.predict(X_test)
mse1 = np.mean((y_test - y_pred1) ** 2)
mse2 = np.mean((y_test - y_pred2) ** 2)
print("Mean Squared Error:", mse1)
print("Mean Squared Error:", mse2)
print("Consumed Time in seconds:", end-start)
print("Optimal Regressor:", w1)
print("Optimal Regressor:", w2)

Mean Squared Error: 0.0019609842446522015
Mean Squared Error: 0.0018373035514526533
Consumed Time in seconds: 0.11444854736328125
Optimal Regressor: [-1.89999301e-01  3.73708342e-05  2.38842527e-01]
Optimal Regressor: [-0.18772943  0.0043301   0.24012966]


## For “Greenhouse gas observing network” dataset

In [1]:
!pip install wget

from numpy.random import multivariate_normal
from scipy.linalg import toeplitz
from numpy.random import randn

import requests
import csv
import wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=ee40d287d2424ae988a1aa937b79f9f8c4f626c62ad11dc42af8bb3b96349715
  Stored in directory: /home/lonaparte/.cache/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [5]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00328/ghg_data.zip'
file_name = 'datasetGHG'
wget.download(url, file_name)
!unzip datasetGHG

Archive:  datasetGHG
   creating: ghg_data/
  inflating: ghg_data/ghg.gid.site0463.dat  
  inflating: ghg_data/ghg.gid.site1889.dat  
  inflating: ghg_data/ghg.gid.site1001.dat  
  inflating: ghg_data/ghg.gid.site1296.dat  
  inflating: ghg_data/ghg.gid.site2567.dat  
  inflating: ghg_data/ghg.gid.site2864.dat  
  inflating: ghg_data/ghg.gid.site0029.dat  
  inflating: ghg_data/ghg.gid.site2484.dat  
  inflating: ghg_data/ghg.gid.site2667.dat  
  inflating: ghg_data/ghg.gid.site0983.dat  
  inflating: ghg_data/ghg.gid.site1558.dat  
  inflating: ghg_data/ghg.gid.site2016.dat  
  inflating: ghg_data/ghg.gid.site2899.dat  
  inflating: ghg_data/ghg.gid.site0818.dat  
  inflating: ghg_data/ghg.gid.site1463.dat  
  inflating: ghg_data/ghg.gid.site1176.dat  
  inflating: ghg_data/ghg.gid.site2625.dat  
  inflating: ghg_data/ghg.gid.site1864.dat  
  inflating: ghg_data/ghg.gid.site2815.dat  
  inflating: ghg_data/ghg.gid.site0866.dat  
  inflating: ghg_data/ghg.gid.site0373.dat  
  inflating

In [95]:
# read .dat to feature and label vectors
'''    
    X : `numpy.array`, shape=(n_samples, n_features)
        The features matrix
    y : `numpy.array`, shape=(n_samples, n_outputs)
        The labels matrix
'''
N = 2921 # number of data 
d = int(5232/16*15) # number of features
dy = int(5232/16) # number of outputs
count = ["%04d" % x for x in range(1, N+1)]
X = np.zeros((N, d))
y = np.zeros((N, dy))
for n in range(1, N+1):
    datContent = [i.strip().split() for i in open("./ghg_data/ghg.gid.site{}.dat".format(count[n-1])).readlines()]
    X[n-1,:] = np.array(datContent[:15]).astype(float).reshape((-1))
    y[n-1,:] = np.array(datContent[15]).astype(float)
print(X.shape)
print(y.shape)

(2921, 4905)
(2921, 327)


In [99]:
n, m = X.shape

train_ratio = 0.8
train_index = int(n * train_ratio)
X_train = X[:train_index]
y_train = y[:train_index]
X_test = X[train_index:]
y_test = y[train_index:]

In [100]:
# Define lambda here:
lambd = 0.1 # change the value

# Calculate the closed-form solution here:
def linear_ridge_regression(X, y, lambd):
    n, m = X.shape
    w = np.linalg.inv(X.T @ X + lambd * np.identity(m)) @ X.T @ y
    return w

reg = linear_model.Ridge(alpha=lambd)
start = time.time()
# Find the optimal linear regressor here:
w1 = linear_ridge_regression(X_train, y_train, lambd)
reg.fit(X_train, y_train)
w2 = reg.coef_

end = time.time()

# Evaluate the performance on the test set
y_pred1 = X_test @ w1
y_pred2 = reg.predict(X_test)
mse1 = np.mean((y_test - y_pred1) ** 2)
mse2 = np.mean((y_test - y_pred2) ** 2)
print("Mean Squared Error:", mse1)
print("Mean Squared Error:", mse2)
print("Consumed Time in seconds:", end-start)
print("Optimal Regressor:", w1)
print("Optimal Regressor:", w2)

Mean Squared Error: 78572.0359338953
Mean Squared Error: 81864.41058246764
Consumed Time in seconds: 4.498806953430176
Optimal Regressor: [[-3.24018186e-01 -3.97365479e-01 -3.78132321e-01 ... -9.07487728e-01
  -1.17797380e+00 -1.70880596e+00]
 [ 1.44567859e-02  1.37421658e-03 -1.49265314e-02 ... -2.68823371e-03
  -1.81485946e-02 -6.01930078e-02]
 [ 2.74443543e-01  6.82913669e-02  1.27600506e-02 ...  9.90902386e-02
  -3.21361258e-01  5.81243360e-02]
 ...
 [-2.41603206e+00 -8.89302257e-01  3.00106112e+00 ...  2.98104468e+00
  -2.31228225e+00  5.26471602e+00]
 [-2.56668418e+00 -2.32272827e+00  1.01233624e+00 ...  3.06039288e+00
   3.90717768e+00 -5.20977751e-02]
 [-2.34721142e-01  5.16714852e-01  1.62958768e-04 ...  2.84103145e-03
  -3.59185615e+00 -1.26026994e+00]]
Optimal Regressor: [[ 3.94074100e-04  1.76765029e-02  2.65691527e-01 ... -2.26227174e+00
  -2.63778702e+00 -3.98327296e-01]
 [ 3.54619939e-04  6.35239274e-03  9.22494187e-02 ... -1.11892145e+00
  -2.21654729e+00  7.61037887e-0