Connected to env (Python 3.11.3)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 17 17:21:04 2023

@author: jparedes
"""
import os
import time
import pandas as pd
import geopy.distance
from sklearn.model_selection import train_test_split
from abc import ABC,abstractmethod
import numpy as np
import sys
import warnings
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame

import sensor_placement as sp


""" Obtain signal sparsity and reconstruct signal at different temporal regimes"""

# perturbate measurements
def add_noise_signal(X:pd.DataFrame,seed:int=92,var:float=1.)->pd.DataFrame:
    """
    Add noise to measurements dataset. The noise ~N(0,var).
    The noise is the same for all sensors during all the time.

    Args:
        X (pd.DataFrame): dataset with measurements
        seed (int): random number generator seed
        var (float): noise variance

    Returns:
        pd.DataFrame: _description_
    """
    rng = np.random.default_rng(seed=seed)
    noise = rng.normal(loc=0.0,scale=var,size=X.shape)
    X_noisy = X + noise
    #X_noisy[X_noisy<0] = 0.
    return X_noisy

# ROI classes
class roi_generator(ABC):
    @abstractmethod
    def generate_rois(self,**kwargs):
        raise NotImplementedError
    
class RandomRoi(roi_generator):
    """ Regions of Interest randomly generated from rng seed"""
    def generate_rois(self,**kwargs)->dict:
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions = kwargs['n_regions']
        rng = np.random.default_rng(seed=seed)    
        indices = np.arange(0,n,1)
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions)}
        indices_split = np.array_split(indices_perm,n_regions)
        for i in np.arange(n_regions):
            roi_idx[i] = indices_split[i]
        return roi_idx
    
class SubSplitRandomRoi(roi_generator):
    """
    Regions of Interest randomly generated. 
    The indices are randomly generated and then some of them are splitted into new sub regions.
    """
    def generate_rois(self,**kwargs):
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions_original = kwargs['n_regions_original']
        rois_split = kwargs['rois_split']
        n_regions_subsplit = kwargs['n_regions_subsplit']
        seed_subsplit = kwargs['seed_subsplit']
        rng = np.random.default_rng(seed=seed)
        indices = np.arange(0,n,1)
        # first split. Original ROIs
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions_original)}
        indices_split = np.array_split(indices_perm,n_regions_original)
        for i in np.arange(n_regions_original):
            roi_idx[i] = indices_split[i]
        # second split. Maintain some ROIs and split others
        new_roi_idx = {}
        rng_subsplit = np.random.default_rng(seed=seed_subsplit)
        for i in roi_idx:
            if i in rois_split:
                indices_roi = roi_idx[i]
                indices_roi_perm = rng_subsplit.permutation(indices_roi)
                indices_roi_split = np.array_split(indices_roi_perm,n_regions_subsplit)
                new_dict = {}
                for j in np.arange(n_regions_subsplit):
                    new_dict[float(f'{i}.{j+1}')] = indices_roi_split[j]
                new_roi_idx.update(new_dict)
            else:
                new_roi_idx[i] = roi_idx[i]
            
        return new_roi_idx
            
    
class VarianceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        coordinate_error_variance_fullymonitored = kwargs['coordinate_error_variance_fullymonitored']
        variance_thresholds = kwargs['variance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {variance_thresholds}')
        if type(variance_thresholds) is not list:
            variance_thresholds = [variance_thresholds]
        if len(variance_thresholds) != n_regions:
            raise ValueError(f'Number of variance thresholds: {variance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in variance_thresholds}
        for i in range(len(variance_thresholds[:-1])):
            print(f'Variance threshold between {variance_thresholds[i]} and {variance_thresholds[i+1]}')
            stations = [j for j in coordinate_error_variance_fullymonitored[np.logical_and(coordinate_error_variance_fullymonitored>=variance_thresholds[i],coordinate_error_variance_fullymonitored<variance_thresholds[i+1])]]
            print(f'{len(stations)} stations')
            idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
            roi_idx[variance_thresholds[i]] = idx_stations
        stations = [j for j in coordinate_error_variance_fullymonitored[coordinate_error_variance_fullymonitored>=variance_thresholds[-1]]]
        print(f'{len(stations)} stations with a distance larger than {variance_thresholds[-1]}')
        idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
        roi_idx[variance_thresholds[-1]] = idx_stations
        return roi_idx
    
class DistanceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        """
        Generates Regions of Interest (ROIs) based on distance from certain station

        Args:        
            distances (pd.Series): distance of each location from origin station
            distance_thresholds (list): thresholds for each ROI
            n_regions (int): number of ROIs

        Raises:
            ValueError: Check if number of specified distance thresholds matches number of ROIs

        Returns:
            dict: Indices of each ROI. Key specifies the distance threshold
        """
        distances = kwargs['distances']
        distance_thresholds = kwargs['distance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {distance_thresholds}')
        if type(distance_thresholds) is not list:
            distance_thresholds = [distance_thresholds]
        if len(distance_thresholds) != n_regions:
            raise ValueError(f'Number of distance thresholds: {distance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in distance_thresholds}
        #distance_thresholds = np.insert(distance_thresholds,0,0)
        for i in range(len(distance_thresholds[:-1])):
            print(f'Distance threshold between {distance_thresholds[i]} and {distance_thresholds[i+1]}')
            stations = [j for j in distances[np.logical_and(distances>=distance_thresholds[i],distances<distance_thresholds[i+1])].index]
            print(f'Stations ({len(stations)}): {stations}')
            idx_stations = np.where(np.isin(distances.index,stations))[0]
            roi_idx[distance_thresholds[i]] = idx_stations
        stations = [j for j in distances[distances>=distance_thresholds[-1]].index]
        print(f'Stations with a distance larger than {distance_thresholds[-1]} ({len(stations)}): {stations}')
        idx_stations = np.where(np.isin(distances.index,stations))[0]
        roi_idx[distance_thresholds[-1]] = idx_stations
        
        return roi_idx


class ROI():
    """
    Region of interest (ROI) class. Select a generator from different roigenerator classes.
    Use as:
        roi = ROI(generator())
        roi.deine_ROIs(**kwargs)
    """
    def __init__(self,generator):
        self._generator = generator
    def define_rois(self,**kwargs)->dict:
        self.roi_idx = self._generator.generate_rois(**kwargs)

# file writer classes
class FileWriter(ABC):
    @abstractmethod
    def save(self,**kwargs):
        raise NotImplementedError

class WriteRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['random_seed']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class WriteSplitRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class SaveLocations():
    def __init__(self,writer):
        self._writer = writer
    def save_locations(self,results_path,locations,**kwargs):
        self._writer.save(results_path,locations,**kwargs)

# file reader class
class FileReader(ABC):
    @abstractmethod
    def load(self,**kwargs):
        raise NotImplementedError

class ReadRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['signal_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
class ReadSplitRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']

        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        random_seed = kwargs['random_seed']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadSplitRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        try:
            with open(fname,'rb') as f:
                locations_monitored = np.sort(pickle.load(f))
            print(f'Loaded file {fname}')
        except:
            warnings.warn(f'No file {fname}')
            return 
        return locations_monitored
    
class ReadLocations():
    def __init__(self,reader):
        self._reader = reader
    def load_locations(self,file_path,**kwargs):
        locations_monitored = self._reader.load(file_path,**kwargs)
        return locations_monitored


# signal reconstruction functions
def singular_value_hard_threshold(snapshots_matrix:np.ndarray,sing_vals:np.array)->float:
    """
    Compute singular value hard threshold from Gavish-Donoho approximation

    Args:
        snapshots_matrix (np.ndarray): snapshots matrix used for computing SVD
        sing_vals (np.array): corresponding array of singular values

    Returns:
        float: cut-off index
    """
    beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]
    c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
    omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
    sing_val_threshold = omega*np.median(sing_vals)
    sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]
    return sparsity_gd

def signal_reconstruction_svd(U:np.ndarray,snapshots_matrix_train:np.ndarray,snapshots_matrix_centered:np.ndarray,X_dataset:pd.DataFrame,s_range:np.ndarray) -> pd.DataFrame:
    """
    Decompose signal keeping s-first singular vectors using training set data
    and reconstruct validation set.

    Args:
        U (numpy array): left singular vectors matrix
        snapshots_matrix_train (numpy array): snaphots matrix of training set data. Used for computing average snapshot
        snapshots_matrix_centered (numpy array): (centered) snapshots matrix to be projected
        X_dataset (pandas dataframe): dataset with n_rows measurements and n_cols locations. (Uncentered) snapshots_matrix.T
        s_range (numpy array): list of sparsity values to test

    Returns:
        rmse_sparsity: dataframe containing reconstruction errors at different times for each sparsity threshold in the range
    """
    print(f'Determining signal sparsity by decomposing training set and reconstructing validation set.\nRange of sparsity levels: {s_range}')
    mse_sparsity = pd.DataFrame()
    for s in s_range:
        # projection
        Psi = U[:,:s]
        snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]
        X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
        X_pred_svd.columns = X_dataset.columns
        X_pred_svd.index = X_dataset.index
        
        #RMSE across different signal measurements
        # estimated covariance
        error = X_dataset - X_pred_svd
        mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)
        mse_sparsity = pd.concat((mse_sparsity,mse),axis=1)
        error_variance = error.var(axis=0,ddof=0) # estiamted coordinate error variance

    return mse_sparsity

def signal_reconstruction_regression(Psi:np.ndarray,locations_measured:np.ndarray,X_test:pd.DataFrame,X_test_measurements:pd.DataFrame=[],snapshots_matrix_train:np.ndarray=[],snapshots_matrix_test_centered:np.ndarray=[],projected_signal:bool=False,sample_covariance:bool=True)->pd.DataFrame:
    """
    Signal reconstyruction from reduced basis measurement.
    The basis Psi and the measurements are sampled at indices in locations_measured.
    Compute reconstruction error


    Args:
        Psi (np.ndarray): low-rank basis
        locations_measured (np.ndarray): indices of locations measured
        X_test (pd.DataFrame): testing dataset which is measured and used for error estimation
        X_test_measurements (pd.DataFrame): testing dataset measurements projected onto subspace spanned by Psi
        snapshots_matrix_train (np.ndarray): training set snapshots matrix used for computing average
        snapshots_matrix_val_centered (np.ndarray): testing set centered snapshots matrix used for signal reconstruction
        

    Returns:
        rmse (pd.DataFrame): mean reconstruction error between validation data set and reconstructed data
        error_max (pd.DataFrame): max reconstruction error when comparing validation data with reconstructed data
    """
    # basis measurement
    n_sensors_reconstruction = len(locations_measured)
    C = np.identity(Psi.shape[0])[locations_measured]
    Psi_measured = C@Psi
    # regression
    if projected_signal:
        beta_hat = np.linalg.pinv(Psi_measured)@X_test_measurements.iloc[:,locations_measured].T
        snapshots_matrix_predicted = Psi@beta_hat
    else:
        beta_hat = np.linalg.pinv(Psi_measured)@snapshots_matrix_test_centered[locations_measured,:]
        snapshots_matrix_predicted_centered = Psi@beta_hat
        snapshots_matrix_predicted = snapshots_matrix_predicted_centered + snapshots_matrix_train.mean(axis=1)[:,None]
    # compute prediction
    X_pred = pd.DataFrame(snapshots_matrix_predicted.T)
    X_pred.columns = X_test.columns
    X_pred.index = X_test.index
    # compute error metrics
    error = X_test - X_pred
    rmse = pd.DataFrame(np.sqrt(((error)**2).mean(axis=1)),columns=[n_sensors_reconstruction],index=X_test.index)
    error_variance = error.var(axis=0,ddof=0)
    """
    error_max = pd.DataFrame(np.abs(error).max(axis=1),columns=[n_sensors_reconstruction],index=X_test.index)
    error_var = np.zeros(shape = error.shape)
    for i in range(error.shape[0]):
        error_var[i,:] = np.diag(error.iloc[i,:].to_numpy()[:,None]@error.iloc[i,:].to_numpy()[:,None].T)
    error_var = pd.DataFrame(error_var,index=X_test.index,columns=X_test.columns)
    """
    return rmse, error_variance

def hourly_signal_reconstruction(Psi:np.ndarray,X_train:pd.DataFrame,X_val:pd.DataFrame,signal_sparsity:int=1,locations_measured:np.ndarray=[])->dict:
    """
    Compute reconstruction error at different times using low-rank basis
    Args:
        Psi (np.ndarray): monitored low-rank basis
        X_train (pd.DataFrame): training set measurements 
        X_val (pd.DataFrame): validation set measurements
        signal_sparsity (int): sparsity threshold
        locations_measured (np.ndarray): indices of monitored locations

    Returns:
        dict: rmse for multiple measurements at different times
    """
    hours_range = np.sort(X_train.index.hour.unique())
    rmse_time = {el:[] for el in hours_range}
    for h in hours_range:
        # get measurements at certain hour and rearrange as snapshots matrix
        X_train_hour = X_train.loc[X_train.index.hour == h]
        X_val_hour = X_val.loc[X_val.index.hour==h]
        snapshots_matrix_train_hour = X_train_hour.to_numpy().T
        snapshots_matrix_train_hour_centered = snapshots_matrix_train_hour - snapshots_matrix_train_hour.mean(axis=1)[:,None]
        snapshots_matrix_val_hour = X_val_hour.to_numpy().T
        snapshots_matrix_val_hour_centered = snapshots_matrix_val_hour - snapshots_matrix_val_hour.mean(axis=1)[:,None]
        if len(locations_measured) != 0:
            rmse_hour = signal_reconstruction_regression(Psi,locations_measured,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour)
        else:# not using sensor placement procedure. Use simple svd reconstruction
            rmse_hour = signal_reconstruction_svd(Psi,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour,[signal_sparsity])
        rmse_time[h] = rmse_hour
    return rmse_time

def networkPlanning_iterative(sensor_placement:sp.SensorPlacement,N:int,Psi:np.ndarray,deployed_network_variance_threshold:float,epsilon:float,h_prev:np.ndarray,weights:np.ndarray,n_it:int,locations_monitored:list=[],locations_unmonitored:list=[])->list:
    """
    IRL1 network planning algorithm
    Args:
        sensor_placement (sp.SensorPlacement): sensor placement object containing network information
        N (int): total number of network locations
        deployed_network_variance_threshold (float): error variance threshold for network design
        epsilon (float): IRL1 weights update constant
        h_prev (np.ndarray): network locations initialization
        weights (np.ndarray): IRL1 weights initialization
        n_it (int): IRL1 max iterations
        locations_monitored (list, optional): initialization of set of monitored lcoations. Defaults to [].
        locations_unmonitored (list, optional): initialization of set of unmonitored locaitons. Defaults to [].

    Returns:
        locations (list): indices of monitored and unmonitored locations [S,Sc]
    """
    # iterative method
    it = 0
    time_init = time.time()
    new_monitored = []
    new_unmonitored = []
    while len(locations_monitored) + len(locations_unmonitored) != N:
        # solve sensor placement with constraints
        
        sensor_placement.initialize_problem(Psi,rho=deployed_network_variance_threshold,
                                            w=weights,locations_monitored=locations_monitored,locations_unmonitored=locations_unmonitored)
        sensor_placement.solve()
        print(f'Problem status: {sensor_placement.problem.status}')
        if sensor_placement.problem.status == 'optimal':
            # update sets with new monitored locations
            new_monitored = [i[0] for i in np.argwhere(sensor_placement.h.value >= 1-epsilon) if i[0] not in locations_monitored]
            new_unmonitored = [i[0] for i in np.argwhere(sensor_placement.h.value <= epsilon) if i[0] not in locations_unmonitored]

            locations_monitored += new_monitored
            locations_unmonitored += new_unmonitored
            # check convergence
            if np.linalg.norm(sensor_placement.h.value - h_prev)<=epsilon or it==n_it:
                locations_monitored += [[i for i in np.argsort(sensor_placement.h.value)[::-1] if i not in locations_monitored][0]]
                it = 0
            h_prev = sensor_placement.h.value
            weights_old = weights.copy()
            weights = 1/(h_prev + epsilon)
            it +=1
        else:
            # solver fails at iteration
            #locations_monitored = locations_monitored[:-len(new_monitored)]
            if len(new_unmonitored) != 0:
                locations_unmonitored = locations_unmonitored[:-len(new_unmonitored)]
                weights = weights_old
            it+=1

        print(f'{len(locations_monitored)} Locations monitored: {locations_monitored}\n{len(locations_unmonitored)} Locations unmonitored: {locations_unmonitored}\n')
    time_end = time.time()
    locations = [locations_monitored,locations_unmonitored]
    print(f'IRL1 algorithm finished in {time_end-time_init:.2f}s.')
    return locations

# dataset
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km

        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

#%%
# figures
class Figures():
    def __init__(self,save_path,figx=2.5,figy=2.5,fs_title=10,fs_label=10,fs_ticks=10,fs_legend=10,marker_size=3,dpi=300,use_grid=False,show_plots=False):
        self.figx = figx
        self.figy = figy
        self.fs_title = fs_title
        self.fs_label = fs_label
        self.fs_ticks = fs_ticks
        self.fs_legend = fs_legend
        self.marker_size = marker_size
        self.dpi = dpi
        self.save_path = save_path
        if show_plots:
            self.backend = 'Qt5Agg'
        else:
            self.backend = 'Agg'
        
        print('Setting mpl rcparams')
        
        font = {'weight':'normal',
                'size':str(self.fs_label),
                }
        
        lines = {'markersize':self.marker_size}
        
        fig = {'figsize':[self.figx,self.figy],
               'dpi':self.dpi
               }
        
        ticks={'labelsize':self.fs_ticks
            }
        axes={'labelsize':self.fs_ticks,
              'grid':False,
              'titlesize':self.fs_title
            }
        if use_grid:
            grid = {'alpha':0.5}
            mpl.rc('grid',**grid)
        
        mathtext={'default':'regular'}
        legend = {'fontsize':self.fs_legend}
        
        mpl.rc('font',**font)
        mpl.rc('figure',**fig)
        mpl.rc('xtick',**ticks)
        mpl.rc('ytick',**ticks)
        mpl.rc('axes',**axes)
        mpl.rc('legend',**legend)
        mpl.rc('mathtext',**mathtext)
        mpl.rc('lines',**lines)        
        mpl.use(self.backend)

    def curve_timeseries_singlestation(self,X:pd.DataFrame,station_name:str,date_init:str='2020-01-20',date_end:str='2021-10-27'):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx,[station_name]]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.set_xlabel('date')
        ax.set_ylabel('Concentration ($\mu$g/$m^3$)')
        fig.tight_layout()

    def curve_timeseries_allstations(self,X:pd.DataFrame,date_init:str='2020-01-20',date_end:str='2021-10-27',save_fig=False):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.fill_between(x=data.index,y1=np.percentile(X,axis=1,q=25),y2=np.percentile(X,axis=1,q=75))
        ax.set_xlabel('date')
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+'timeseries_Allstations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    
    def curve_timeseries_dailypattern_singlestation(self,X:pd.DataFrame,station_name:str):
        X_ = X.loc[:,station_name].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()
    
    def curve_timeseries_dailypattern_multiplestations(self,X:pd.DataFrame,stations_locs:list=[0,1,2,3],save_fig:bool=False):
        stations_names = [i for i in X.columns[stations_locs]]
        colors = ['#1a5276','orange','#117864','#943126']
        X_ = X.iloc[:,stations_locs].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)

        
        fig = plt.figure()
        curves = {}
        for i in range(len(stations_locs)):
            ax = fig.add_subplot(221+i)
            curves[i] = ax.plot(data.iloc[:,i],label=stations_names[i],color=colors[i])
            ax.fill_between(x=data.index,y1=q1.iloc[:,i],y2=q3.iloc[:,i],alpha=0.5,color=colors[i])
            yrange = np.arange(0,110,10)
            ax.set_yticks(yrange)
            ax.set_yticklabels([i for i in ax.get_yticks()])    
            if (221+i)%2 == 1:
                ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
            ax.set_ylim(0,100)
            if i in [2,3]:
                ax.set_xlabel('hour')

        handles = [curves[i][0] for i in curves.keys()]
        fig.legend(handles=[i for i in handles],ncol=2,bbox_to_anchor=(0.95,1.15),framealpha=1)
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}Curve_TimeSeriesHourly_ManyStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved into {fname}')
        
    def curve_timeseries_dailypattern_allstations(self,X:pd.DataFrame):
        X_ = pd.DataFrame()
        for c in X.columns:
            X_ = pd.concat((X_,X.loc[:,c]),axis=0)
        X_ = X_.loc[:,0]
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()

    def boxplot_measurements(self,X,save_fig):
        n = X.shape[1]
        yrange = np.arange(0.0,300,50)
        xrange = np.arange(1,n+1,1)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=X,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        
        xrange = [i-1 for i in xrange if i%5==0]
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Location index')
        fig.tight_layout()
        if save_fig:
            fname = self.save_path+'boxplot_concentration_allStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    def geographical_network_visualization(self,map_path:str,df_coordinates:pd.DataFrame,locations_monitored:np.array=[],roi_idx:dict={},show_legend:bool=False,show_deployed_sensors:bool=True,save_fig:bool=False)->plt.figure:
        """
        Figure showing the geographical area where sensors are deployed along with coordinates of reference stations

        Args:
            map_path (str): path to map file
            df_coordinates (pd.DataFrame): dataframe containing coordiantes(Latitude,Longitude) of each reference station
            locations_monitored (np.array, optional): indices of monitored locations. Defaults to [].
            roi_idx (dict): dictionary indicating indices that belong to each region of interest (ROI) in case of heterogeneous design. The keys correspond to parameter used for separating ROIs.
            show_legend (bool, optional): Show legend indicating monitored and unmonitored locations. Defaults to False.
            save_fig (bool, optional): save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with map and stations 
        """
        
        if len(locations_monitored)!=0:
            df_coords_monitored = df_coordinates.iloc[locations_monitored]
            df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i not in locations_monitored]]
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
            gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)

        else:
            df_coords_monitored = df_coordinates.copy()
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
        
        spain = gpd.read_file(f'{map_path}ll_autonomicas_inspire_peninbal_etrs89.shp')
        catalonia = spain.loc[spain.NAME_BOUND.str.contains('Catalunya')]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        geo_map = catalonia.plot(ax=ax,color='#117a65')
        
        try:
            if len(roi_idx)!=0:
                markers = ['^','o','s','P','D']
                colors = ['k','#943126']
                if show_deployed_sensors:
                    print('Map showing monitored and unmonitored locations for each ROI')
                    for i,idx,m in zip(range(len(roi_idx)),roi_idx.values(),markers):
                        #locations_monitored_roi = np.array(locations_monitored)[np.isin(locations_monitored,idx)]
                        locations_monitored_roi = np.array([i for i in locations_monitored if i in idx])
                        locations_unmonitored_roi = np.array([i for i in range(df_coordinates.shape[0]) if i not in locations_monitored and i in idx])
                        print(f'locations monitored for ROI {i}: {len(locations_monitored_roi)}\nlocations unmonitored for ROI {i}: {len(locations_unmonitored_roi)}')
                        # monitored locations in ROI
                        df_coords_monitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_monitored_roi]]
                        geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=colors[1], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}$')
                        
                        # unmonitored locations in ROI
                        df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_unmonitored_roi]]
                        print(f'Shape of unmonitored dataframe coordinates: {df_coords_unmonitored.shape}')
                        geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
                        gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)
                        gdf_unmonitored.plot(ax=geo_map, marker=m, color=colors[0], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}^{{c}}$') 

                else: # show icons belonging to each ROI
                    for i,idx,m,c in zip(range(len(roi_idx)),roi_idx.values(),markers,colors):
                        
                        df_coords_idx = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in idx]]
                        geometry_idx = [Point(xy) for xy in zip(df_coords_idx['Longitude'], df_coords_idx['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_idx, geometry=geometry_idx)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=c, markersize=6,label=f'$\mathcal{{R}}_{i+1}$')
                
            else:
                gdf_monitored.plot(ax=geo_map, marker='o', color='#943126', markersize=6,label=f'Monitoring node')
                gdf_unmonitored.plot(ax=geo_map, marker='o', color='k', markersize=6,label=f'Unmonitored locations')
        except:
            warnings.warn('No unmonitored locations or unexpected error in dataframe')
        ax.set_xlim(0.0,4.0)
        ax.set_ylim(40.5,43)
        
        ax.set_ylabel('Latitude (degrees)')
        ax.set_xlabel('Longitude (degrees)')

        # set legend location
        if show_legend:
            if show_deployed_sensors:
                if len(roi_idx) == 2:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=5e-4,labelspacing=0.1,bbox_to_anchor=(0.73,0.1))
                elif len(roi_idx)==3:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=1e-6,labelspacing=0.05,bbox_to_anchor=(0.6,0.1))
            else:
                ax.legend(loc='lower right',ncol=1,framealpha=0.1,handletextpad=-0.1,columnspacing=0.5)
        ax.tick_params(axis='both', which='major')
        fig.tight_layout()
        
        # save generated figure
        if save_fig:
            if show_deployed_sensors:
                fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
            else:
                if len(roi_idx)!=0:
                    fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
                else:
                    fname = self.save_path+f'Map_PotentialLocations.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')
        return fig
        

    # Low-rank plots
    def singular_values_cumulative_energy(self,sing_vals,n,synthetic_dataset=False,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig1 = plt.figure()
        ax = fig1.add_subplot(111)
        ax.plot(xrange,cumulative_energy,color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()])
        ax.set_xlabel('$i$th singular value')
        
        #yrange = np.arange(0.5,1.05,0.05)
        yrange = np.arange(0.,1.2,0.2)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('Cumulative energy')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig1.tight_layout()
        
        fig2 = plt.figure()
        ax = fig2.add_subplot(111)
        ax.plot(xrange, sing_vals / np.max(sing_vals),color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')

        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig2.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_sparsity_cumulativeEnergy_N{n}.png'
            fig1.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')

            fname = self.save_path+f'Curve_sparsity_singularValues_N{n}.png'
            fig2.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')
    
    def singular_values_cumulative_energy_sameFigure(self,sing_vals,n,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy in the same figure

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig = plt.figure(constrained_layout=True)
        ax = fig.add_subplot(111)

        l1 = ax.plot(xrange, sing_vals / np.max(sing_vals),color='#ba4a00',marker='o',label='Normalized singular values')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')
        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')

        ax2 = ax.twinx()
        l2 = ax2.plot(xrange,cumulative_energy,color='#1f618d',marker='o',label='Cumulative energy')
        ax2.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax2.set_xticklabels([int(i+1) for i in ax2.get_xticks()])
        
        yrange = np.arange(0.,1.2,0.2)
        ax2.set_yticks(yrange)
        ax2.set_yticklabels([np.round(i,2) for i in ax2.get_yticks()])
        #ax2.set_ylabel('Cumulative energy')
        ax2.set_ylim(0,1)
        
        lines = l1+l2
        labels = [l.get_label() for l in lines]
        #ax.legend(lines,labels,loc='center',ncol=1,framealpha=1.,bbox_to_anchor=(0.5,1.15),handlelength=0.5,handletextpad=0.1)
        #fig.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_singVals_cumulativeEnergy_N{n}.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at: {fname}')


    def boxplot_validation_rmse_svd(self,rmse_sparsity,n,max_sparsity_show=10,synthetic_dataset=False,save_fig=False) -> plt.figure:
        yrange = np.arange(0.0,35,5)
        xrange = rmse_sparsity.columns[:max_sparsity_show]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=rmse_sparsity.iloc[:,:max_sparsity_show],notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        if synthetic_dataset:
            ax.set_yscale('log')
            ax.set_ylim(1e-2,1e1)
        else:
            ax.set_ylim(0,30)
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        xrange = np.array([i-1 for i in xrange if i%5==0])
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Sparsity level')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+f'boxplot_RMSE_SVDreconstruction_validationSet_Smin{xrange.min()}_Smax{xrange.max()}_N{n}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved in {fname}')
    
        return fig
    
    def boxplot_rmse_comparison(self,rmse_method1:pd.DataFrame,rmse_method2:pd.DataFrame,maxerror:bool=False,save_fig:bool=False)->plt.figure:
        """
        Boxplot comparing validation set RMSE using 2 different numbers of deployed senors.
        E.g: compare fully monitored vs reduced

        Args:
            rmse_method1 (pd.DataFrame): rmse for certain number of sensors
            rmse_method2 (pd.DataFrame): rmse for different number of sensors (for example fully monitored)
            maxerror (bool, optional): dataframes contain maximum reconstruction error instead of RMSE. Defaults to False.
            save_fig (bool, optional): Save generqated figure. Defaults to False.

        Returns:
            plt.figure: Figure
        """
        n_sensors_1 = rmse_method1.columns[0]
        n_sensors_2 = rmse_method2.columns[0]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp1 = ax.boxplot(x=rmse_method1,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[n_sensors_1],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        bp2 = ax.boxplot(x=rmse_method2,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[1],widths=0.5,labels=[n_sensors_2],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        bp1['boxes'][0].set_facecolor('lightgreen')
        bp2['boxes'][0].set_facecolor('#1a5276')
        
        if maxerror:
            yrange = np.arange(0.,55.,5)
            ax.set_ylim(0,50)
        else:
            yrange = np.arange(0.,22.,2)
            ax.set_ylim(0,20)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        if maxerror:
            ax.set_ylabel('Max error ($\mu$g/$m^3$)')        
        else:
            ax.set_ylabel('RMSE ($\mu$g/$m^3$)')        
        ax.set_xlabel('Number of deployed sensors')
        fig.tight_layout()

        if save_fig:
            if maxerror:
                fname = f'{self.save_path}Maxerrorcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            else:
                fname = f'{self.save_path}RMSEcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def boxplot_errorratio(self,df_error1:pd.DataFrame,df_error2:pd.DataFrame,save_fig:bool=False)->plt.figure:
        n_sensors1 = df_error1.columns[0]
        n_sensors2 = df_error2.columns[0]
        df_ratio = df_error1.to_numpy() / df_error2.to_numpy()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=df_ratio,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[f'{n_sensors1} sensors vs {n_sensors2} senors'],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        
        bp['boxes'][0].set_facecolor('#1a5276')
        
        yrange = np.arange(0.,3.5,0.5)
        ax.set_ylim(0,3)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        ax.set_ylabel('Reconstruction errors ratio')        
        ax.set_xlabel('')
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}ErrorRatio_NsensorsTotal_N1{n_sensors1}_N2{n_sensors2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def hist_worsterror(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_fullymonitored,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='#1a5276',label='Fully monitored network')
        ax.vlines(x=errormax_fullymonitored.mean(),ymin=0.0,ymax=1.0,colors='#1a5276',linestyles='--')
        ax.hist(x=errormax_reconstruction,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='orange',label=f'Reconstruction with {n_sensors} sensors',alpha=0.5)
        ax.vlines(x=errormax_reconstruction.mean(),ymin=0.0,ymax=1.0,colors='orange',linestyles='--')
        ax.set_xlabel('Maximum reconstruction error')
        ax.set_ylabel('Probability density')
        ax.legend(loc='upper left',ncol=1,framealpha=0.5)
        ax.set_xlim(0,5)
        ax.set_ylim(0,1)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_error_fullymonitored_vs_reconstruction_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')

    def hist_errorratio(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_reconstruction.to_numpy()/errormax_fullymonitored.to_numpy(),bins=np.arange(0,3.1,0.1),density=True,cumulative=False,color='#1a5276')
        ax.set_xlabel('Maximum error ratio')
        ax.set_ylabel('Probability density')
        ax.set_xlim(0,3)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_errorRatio_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')
    
    def curve_errorvariance_comparison(self,errorvar_fullymonitored:list,errorvar_reconstruction:list,variance_threshold_ratio:float,worst_coordinate_variance_fullymonitored:float,n:int,n_sensors:int,errorvar_reconstruction_Dopt:list=[],roi_idx:dict={},n_sensors_Dopt:int=0,method:str='random_based',random_seed:int=0,save_fig:bool=False) -> plt.figure:
        """
        Show error variance over a testing set at each network location. 
        The error variance is obtained after reconstructing the signal from p measurements.
        The p measurement locations are obtained from network design algorithm or D-optimality criteria.
        It also shows the threshold line which the network design algorithm used.
        Another algorithm can be shown for comparison.

        Args:
            errorvar_fullymonitored (list): error variance at each network location obtained with a fully monitored network. This corresponds to the lowest error variance possible.
            errorvar_reconstruction (list): error variance at each network locations obtained with a network with a reduced number of deployed sensors.
            variance_threshold_ratio (float): variance threshold ratio used for design algorithm. It is a multiple of the worst_coordinate_variance_fullymonitored.
            worst_coordinate_variance_fullymonitored (float): fully-monitored network worst coordinate error variance
            n (int): total number of network points
            n_sensors (int): number of deployed sensors
            errorvar_reconstruction_Dopt (list): error variance at each network location obtained by D-optimality (or other) criteria. Defaults to [].
            roi_idx (dict): dictionary containing indices of locations that belong to each ROI. The keys indicate the threshold used to separate the network.
            save_fig (bool, optional): Save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with error variance curves
        """
        if type(variance_threshold_ratio) is float:
            variance_threshold = variance_threshold_ratio*worst_coordinate_variance_fullymonitored
        
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(errorvar_fullymonitored,color='#1d8348',label='Fully monitored network')
            if len(errorvar_reconstruction_Dopt) !=0:
                ax.plot(errorvar_reconstruction_Dopt,color='orange',label=f'Joshi-Boyd solution',alpha=0.8)
            ax.plot(errorvar_reconstruction,color='#1a5276',label=f'Network design solution')
            ax.hlines(y=variance_threshold,xmin=0,xmax=n+1,color='k',linestyles='--',label=rf'Design threshold $\rho$={variance_threshold_ratio:.2f}$\rho_n$')
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(0,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,1.75,0.25)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,1.5)
            ax.set_ylabel('Error variance')
            ax.legend(loc='center',ncol=2,framealpha=0.5,bbox_to_anchor=(0.5,1.1))
            fig.tight_layout()
            if save_fig:
                fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio:.2f}_Nsensors{n_sensors}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


        else: # heterogeneous thresholds over multiple ROIs
            variance_threshold = [t*w for t,w in zip(variance_threshold_ratio,worst_coordinate_variance_fullymonitored)]
            # sort coordinate error variance such that the ROIs are shown in order
            coordinate_error_variance_fully_monitored_sorted = np.concatenate([errorvar_fullymonitored[i] for i in roi_idx.values()])
            coordinate_error_variance_design_sorted = np.concatenate([errorvar_reconstruction[i] for i in roi_idx.values()])

            fig = plt.figure(constrained_layout=True)
            ax = fig.add_subplot(111)
            # coordinate error variance at each location
            ax.plot(coordinate_error_variance_fully_monitored_sorted,color='#943126',label='Fully monitored case')
            # horizontal lines showing threshold design
            n_roi = np.concatenate([[0],[len(i) for i in roi_idx.values()]])
            n_roi_cumsum = np.cumsum(n_roi)
            for v,l in zip(variance_threshold,range(len(n_roi_cumsum))):
                if l==0:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l]-1,xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--',label='Design threshold')
                else:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l],xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--')
            
            # Joshi Boyd and IRNet results
            if len(errorvar_reconstruction_Dopt) !=0:
                coordinate_error_variance_Dopt_sorted = np.concatenate([errorvar_reconstruction_Dopt[i] for i in roi_idx.values()])
                ax.plot(coordinate_error_variance_Dopt_sorted,color='orange',label=f'JB {n_sensors_Dopt} sensors',alpha=0.8)
            ax.plot(coordinate_error_variance_design_sorted,color='#1a5276',label=f'IRWNet {n_sensors} sensors')
            
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(-0.5,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,3.5,0.5)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,3.0+0.1)
            ax.set_ylabel('Per-coordinate error variance')
            ax.legend(loc='center',ncol=2,framealpha=1,
                      handlelength=0.5,handletextpad=0.1,columnspacing=0.2,
                      bbox_to_anchor=(0.5,0.88))
            #fig.tight_layout()
            if save_fig:
                #fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_NsensorsROIDopt_{n_sensors_roi}.png'
                if method == 'random_based':
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_randomSeed{random_seed}.png'
                else:
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


    def curve_rmse_hourly(self,rmse_time,month=0,save_fig=False):
        hours = [i for i in rmse_time.keys()]
        median = [rmse_time[i].median().to_numpy()[0] for i in hours]
        q1,q3 = [rmse_time[i].quantile(q=0.25).to_numpy()[0] for i in hours], [rmse_time[i].quantile(q=0.75).to_numpy()[0] for i in hours]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(median,color='#1a5276')
        ax.fill_between(x=hours,y1=q1,y2=q3,color='#1a5276',alpha=0.5)
        ax.set_xticks(hours[::4])
        ax.set_xticklabels([i for i in ax.get_xticks()])
        ax.set_xlabel('Hour')
        yrange = np.arange(0,12.,2.)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        ax.set_ylim(yrange[0],yrange[-1])
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}deploy_sensors_hourly_month{month}.png'
            fig.savefig(fname,dpi=300,format='png')
        return fig

In [None]:
abs_path = os.path.dirname(os.path.realpath(__file__))
files_path = os.path.abspath(os.path.join(abs_path,os.pardir)) + '/files/catalonia/'
results_path = os.path.abspath(os.path.join(abs_path,os.pardir)) + '/test/'

In [None]:
pollutant = 'O3'
start_date = '2011-01-01'
end_date = '2022-12-31'
N=48
dataset = Dataset(pollutant,N,start_date,end_date,files_path)

In [None]:
dataset.N

48

In [None]:
dataset.files_path

'C:\\Users\\jp_lp\\Documents\\Scripts\\github\\IRNet/files/catalonia/'

In [None]:
files_path

'C:\\Users\\jp_lp\\Documents\\Scripts\\github\\IRNet/files/catalonia/'

In [None]:
dataset.pollutant

'O3'

In [None]:
dataset.start_date

'2011-01-01'

In [None]:
dataset.end_date

'2022-12-31'

In [None]:
dataset.load_dataset()

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv


In [None]:
dataset.check_dataset()

Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juneda          0.0
O3_Lleida          0.0
O3_Ponts           0.0
O3_Montsec         0.0
O3_Sort            0.0
O3_Alcover         0.0
O3

In [None]:
dataset.ds

Unnamed: 0,O3_Badalona,O3_Eixample,O3_Gracia,O3_Ciutadella,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Berga,O3_Gava,O3_Granollers,...,O3_Sort,O3_Alcover,O3_Amposta,O3_La-Senla,O3_Constanti,O3_Gandesa,O3_Els-Guiamets,O3_Reus,O3_Tarragona,O3_Vilaseca
2020-09-24 13:00:00,68.0,57.0,58.0,68.0,80.0,72.0,83.0,65.0,88.0,78.0,...,77.0,72.0,74.0,79.0,70.0,59.0,65.0,71.0,65.0,73.0
2020-09-24 14:00:00,82.0,72.0,79.0,86.0,95.0,99.0,101.0,62.0,104.0,82.0,...,77.0,74.0,67.0,69.0,70.0,49.0,60.0,69.0,62.0,69.0
2020-09-24 15:00:00,84.0,81.0,83.0,87.0,95.0,95.0,101.0,61.0,92.0,74.0,...,78.0,71.0,54.0,58.0,63.0,48.0,53.0,62.0,63.0,66.0
2020-09-24 16:00:00,66.0,62.0,65.0,72.0,80.0,73.0,85.0,60.0,72.0,67.0,...,79.0,62.0,45.0,58.0,60.0,55.0,49.0,65.0,64.0,64.0
2020-09-24 17:00:00,47.0,50.0,53.0,56.0,63.0,59.0,68.0,58.0,60.0,54.0,...,79.0,56.0,50.0,57.0,59.0,56.0,57.0,60.0,60.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,5.0,25.0,29.0,30.0,50.0,25.0,79.0,35.0,23.0,4.0,...,4.0,47.0,14.0,47.0,23.0,57.0,51.0,35.0,4.0,17.0
2022-12-31 20:00:00,3.0,20.0,18.0,31.0,13.0,9.0,82.0,34.0,22.0,4.0,...,3.0,52.0,6.0,47.0,28.0,62.0,42.0,11.0,1.0,3.0
2022-12-31 21:00:00,1.0,11.0,8.0,28.0,6.0,1.0,82.0,35.0,21.0,5.0,...,4.0,52.0,5.0,49.0,6.0,48.0,47.0,13.0,1.0,8.0
2022-12-31 22:00:00,22.0,2.0,2.0,9.0,5.0,2.0,77.0,36.0,21.0,11.0,...,2.0,51.0,12.0,51.0,15.0,33.0,51.0,24.0,1.0,11.0


In [None]:
dataset.coordinates_distances

Name,Badalona,Eixample,Gracia,Ciutadella,Vall-Hebron,Palau-Reial,Fabra,Berga,Gava,Granollers,...,Sort,Alcover,Amposta,La-Senla,Constanti,Gandesa,Els-Guiamets,Reus,Tarragona,Vilaseca
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Badalona,0.0,9.586199,8.669933,7.662893,7.771388,12.024914,9.942444,79.529369,25.879498,17.665233,...,140.888116,90.40999,161.478753,186.488535,91.275756,156.68381,129.955628,99.11381,90.998512,98.768185
Eixample,9.586199,0.0,1.489601,2.812869,4.55663,3.237618,4.447338,83.127814,16.36775,26.181402,...,141.621474,82.38198,152.148496,177.244015,82.503692,148.24847,121.430188,90.439364,82.03295,89.856587
Gracia,8.669933,1.489601,0.0,3.155488,3.074832,3.429714,3.29767,81.700015,17.215382,24.856055,...,140.407434,82.567814,152.850795,177.897673,82.938258,148.573646,121.782376,90.838937,82.546409,90.348557
Ciutadella,7.662893,2.812869,3.155488,0.0,5.504204,6.039978,6.391098,83.910418,18.826352,25.002939,...,143.215273,85.182494,154.666756,179.804942,85.220378,151.009332,124.181512,93.172133,84.705379,92.54101
Vall-Hebron,7.771388,4.55663,3.074832,5.504204,0.0,5.091178,2.188123,78.669577,18.924,22.409576,...,137.711481,82.653611,153.990862,178.927252,83.543236,148.912738,122.188306,91.363358,83.317998,91.067963
Palau-Reial,12.024914,3.237618,3.429714,6.039978,5.091178,0.0,3.512376,81.970133,13.965518,27.499434,...,139.52668,79.220863,149.458235,174.488506,79.512156,145.171156,118.372077,87.417979,79.116904,86.91889
Fabra,9.942444,4.447338,3.29767,6.391098,2.188123,3.512376,0.0,78.873648,16.935244,24.216017,...,137.177916,80.513211,151.823282,176.749451,81.356639,146.743992,120.013185,89.179966,81.130263,88.879841
Berga,79.529369,83.127814,81.700015,83.910418,78.669577,81.970133,78.873648,0.0,89.078898,66.355313,...,68.413396,106.650141,187.315592,207.690205,117.171887,164.711902,143.421782,121.430618,120.062209,124.961448
Gava,25.879498,16.36775,17.215382,18.826352,18.924,13.965518,16.935244,89.078898,0.0,41.082823,...,141.804224,68.036812,135.841955,161.001333,66.931761,133.011644,106.11277,74.986688,66.178448,74.058333
Granollers,17.665233,26.181402,24.856055,25.002939,22.409576,27.499434,24.216017,66.355313,41.082823,0.0,...,131.224492,99.123283,174.079057,198.585314,102.135608,165.892346,139.62737,109.560118,102.513616,110.003787


In [None]:
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km

        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
            
        self.distances = dataset.coordinates_distances.loc['Ciutadella']
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
dataset.ds.columns

Index(['O3_Badalona', 'O3_Eixample', 'O3_Gracia', 'O3_Ciutadella',
       'O3_Vall-Hebron', 'O3_Palau-Reial', 'O3_Fabra', 'O3_Berga', 'O3_Gava',
       'O3_Granollers', 'O3_Igualada', 'O3_Manlleu', 'O3_Manresa', 'O3_Mataro',
       'O3_Montcada', 'O3_El-Prat', 'O3_Rubi', 'O3_Sabadell', 'O3_Sant-Adria',
       'O3_Sant-Celoni', 'O3_Sant-Cugat', 'O3_Santa-Maria', 'O3_Sant-Vicenç',
       'O3_Terrassa', 'O3_Tona', 'O3_Vic', 'O3_Viladecans', 'O3_Vilafranca',
       'O3_Vilanova', 'O3_Agullana', 'O3_Begur', 'O3_Pardines', 'O3_Santa-Pau',
       'O3_Bellver', 'O3_Juneda', 'O3_Lleida', 'O3_Ponts', 'O3_Montsec',
       'O3_Sort', 'O3_Alcover', 'O3_Amposta', 'O3_La-Senla', 'O3_Constanti',
       'O3_Gandesa', 'O3_Els-Guiamets', 'O3_Reus', 'O3_Tarragona',
       'O3_Vilaseca'],
      dtype='object')

In [None]:
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km

        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
        if station_center not in [i for i in self.ds.columns]:
            raise ValueError(f'Station used for center is not present in dataset')
        
        self.distances = dataset.coordinates_distances.loc[station_center]
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
dataset = Dataset(pollutant,N,start_date,end_date,files_path)

In [None]:
dataset.load_dataset()
dataset.check_dataset()

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
dataset.ds

Unnamed: 0,O3_Badalona,O3_Eixample,O3_Gracia,O3_Ciutadella,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Berga,O3_Gava,O3_Granollers,...,O3_Sort,O3_Alcover,O3_Amposta,O3_La-Senla,O3_Constanti,O3_Gandesa,O3_Els-Guiamets,O3_Reus,O3_Tarragona,O3_Vilaseca
2020-09-24 13:00:00,68.0,57.0,58.0,68.0,80.0,72.0,83.0,65.0,88.0,78.0,...,77.0,72.0,74.0,79.0,70.0,59.0,65.0,71.0,65.0,73.0
2020-09-24 14:00:00,82.0,72.0,79.0,86.0,95.0,99.0,101.0,62.0,104.0,82.0,...,77.0,74.0,67.0,69.0,70.0,49.0,60.0,69.0,62.0,69.0
2020-09-24 15:00:00,84.0,81.0,83.0,87.0,95.0,95.0,101.0,61.0,92.0,74.0,...,78.0,71.0,54.0,58.0,63.0,48.0,53.0,62.0,63.0,66.0
2020-09-24 16:00:00,66.0,62.0,65.0,72.0,80.0,73.0,85.0,60.0,72.0,67.0,...,79.0,62.0,45.0,58.0,60.0,55.0,49.0,65.0,64.0,64.0
2020-09-24 17:00:00,47.0,50.0,53.0,56.0,63.0,59.0,68.0,58.0,60.0,54.0,...,79.0,56.0,50.0,57.0,59.0,56.0,57.0,60.0,60.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,5.0,25.0,29.0,30.0,50.0,25.0,79.0,35.0,23.0,4.0,...,4.0,47.0,14.0,47.0,23.0,57.0,51.0,35.0,4.0,17.0
2022-12-31 20:00:00,3.0,20.0,18.0,31.0,13.0,9.0,82.0,34.0,22.0,4.0,...,3.0,52.0,6.0,47.0,28.0,62.0,42.0,11.0,1.0,3.0
2022-12-31 21:00:00,1.0,11.0,8.0,28.0,6.0,1.0,82.0,35.0,21.0,5.0,...,4.0,52.0,5.0,49.0,6.0,48.0,47.0,13.0,1.0,8.0
2022-12-31 22:00:00,22.0,2.0,2.0,9.0,5.0,2.0,77.0,36.0,21.0,11.0,...,2.0,51.0,12.0,51.0,15.0,33.0,51.0,24.0,1.0,11.0


In [None]:
dataset.sort_stations(station_center='Ciutadella')

ValueError: Station used for center is not present in dataset

In [None]:
dataset.coordinates_distances

Name,Badalona,Eixample,Gracia,Ciutadella,Vall-Hebron,Palau-Reial,Fabra,Berga,Gava,Granollers,...,Sort,Alcover,Amposta,La-Senla,Constanti,Gandesa,Els-Guiamets,Reus,Tarragona,Vilaseca
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Badalona,0.0,9.586199,8.669933,7.662893,7.771388,12.024914,9.942444,79.529369,25.879498,17.665233,...,140.888116,90.40999,161.478753,186.488535,91.275756,156.68381,129.955628,99.11381,90.998512,98.768185
Eixample,9.586199,0.0,1.489601,2.812869,4.55663,3.237618,4.447338,83.127814,16.36775,26.181402,...,141.621474,82.38198,152.148496,177.244015,82.503692,148.24847,121.430188,90.439364,82.03295,89.856587
Gracia,8.669933,1.489601,0.0,3.155488,3.074832,3.429714,3.29767,81.700015,17.215382,24.856055,...,140.407434,82.567814,152.850795,177.897673,82.938258,148.573646,121.782376,90.838937,82.546409,90.348557
Ciutadella,7.662893,2.812869,3.155488,0.0,5.504204,6.039978,6.391098,83.910418,18.826352,25.002939,...,143.215273,85.182494,154.666756,179.804942,85.220378,151.009332,124.181512,93.172133,84.705379,92.54101
Vall-Hebron,7.771388,4.55663,3.074832,5.504204,0.0,5.091178,2.188123,78.669577,18.924,22.409576,...,137.711481,82.653611,153.990862,178.927252,83.543236,148.912738,122.188306,91.363358,83.317998,91.067963
Palau-Reial,12.024914,3.237618,3.429714,6.039978,5.091178,0.0,3.512376,81.970133,13.965518,27.499434,...,139.52668,79.220863,149.458235,174.488506,79.512156,145.171156,118.372077,87.417979,79.116904,86.91889
Fabra,9.942444,4.447338,3.29767,6.391098,2.188123,3.512376,0.0,78.873648,16.935244,24.216017,...,137.177916,80.513211,151.823282,176.749451,81.356639,146.743992,120.013185,89.179966,81.130263,88.879841
Berga,79.529369,83.127814,81.700015,83.910418,78.669577,81.970133,78.873648,0.0,89.078898,66.355313,...,68.413396,106.650141,187.315592,207.690205,117.171887,164.711902,143.421782,121.430618,120.062209,124.961448
Gava,25.879498,16.36775,17.215382,18.826352,18.924,13.965518,16.935244,89.078898,0.0,41.082823,...,141.804224,68.036812,135.841955,161.001333,66.931761,133.011644,106.11277,74.986688,66.178448,74.058333
Granollers,17.665233,26.181402,24.856055,25.002939,22.409576,27.499434,24.216017,66.355313,41.082823,0.0,...,131.224492,99.123283,174.079057,198.585314,102.135608,165.892346,139.62737,109.560118,102.513616,110.003787


In [None]:
dataset.coordinates_distances.columns

Index(['Badalona', 'Eixample', 'Gracia', 'Ciutadella', 'Vall-Hebron',
       'Palau-Reial', 'Fabra', 'Berga', 'Gava', 'Granollers', 'Igualada',
       'Manlleu', 'Manresa', 'Mataro', 'Montcada', 'El-Prat', 'Rubi',
       'Sabadell', 'Sant-Adria', 'Sant-Celoni', 'Sant-Cugat', 'Santa-Maria',
       'Sant-Vicenç', 'Terrassa', 'Tona', 'Vic', 'Viladecans', 'Vilafranca',
       'Vilanova', 'Agullana', 'Begur', 'Pardines', 'Santa-Pau', 'Bellver',
       'Juneda', 'Lleida', 'Ponts', 'Montsec', 'Sort', 'Alcover', 'Amposta',
       'La-Senla', 'Constanti', 'Gandesa', 'Els-Guiamets', 'Reus', 'Tarragona',
       'Vilaseca'],
      dtype='object', name='Name')

In [None]:
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km

        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
        if station_center not in [i for i in self.coordinates_distances.columns]:
            raise ValueError(f'Station used for center is not present in dataset')

        self.distances = dataset.coordinates_distances.loc[station_center]
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
dataset.ds

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,68.0,57.0,58.0,76.0,80.0,72.0,83.0,68.0,64.0,81.0,...,82.0,66.0,65.0,78.0,58.0,78.0,77.0,59.0,74.0,79.0
2020-09-24 14:00:00,86.0,72.0,79.0,99.0,95.0,99.0,101.0,82.0,77.0,107.0,...,80.0,65.0,60.0,83.0,65.0,82.0,77.0,49.0,67.0,69.0
2020-09-24 15:00:00,87.0,81.0,83.0,97.0,95.0,95.0,101.0,84.0,72.0,92.0,...,82.0,50.0,53.0,84.0,53.0,81.0,78.0,48.0,54.0,58.0
2020-09-24 16:00:00,72.0,62.0,65.0,79.0,80.0,73.0,85.0,66.0,65.0,74.0,...,77.0,55.0,49.0,69.0,57.0,85.0,79.0,55.0,45.0,58.0
2020-09-24 17:00:00,56.0,50.0,53.0,61.0,63.0,59.0,68.0,47.0,55.0,65.0,...,83.0,60.0,57.0,73.0,57.0,88.0,79.0,56.0,50.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,30.0,25.0,29.0,18.0,50.0,25.0,79.0,5.0,26.0,47.0,...,44.0,6.0,51.0,26.0,1.0,82.0,4.0,57.0,14.0,47.0
2022-12-31 20:00:00,31.0,20.0,18.0,13.0,13.0,9.0,82.0,3.0,4.0,36.0,...,42.0,9.0,42.0,27.0,1.0,82.0,3.0,62.0,6.0,47.0
2022-12-31 21:00:00,28.0,11.0,8.0,6.0,6.0,1.0,82.0,1.0,3.0,2.0,...,14.0,11.0,47.0,29.0,1.0,81.0,4.0,48.0,5.0,49.0
2022-12-31 22:00:00,9.0,2.0,2.0,3.0,5.0,2.0,77.0,22.0,3.0,3.0,...,15.0,23.0,51.0,26.0,1.0,80.0,2.0,33.0,12.0,51.0


In [None]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
X_train, X_test = train_test_split(dataset.ds, test_size= 1 - train_ratio,shuffle=False,random_state=92)
X_val, X_test = train_test_split(X_test, test_size=test_ratio/(test_ratio + validation_ratio),shuffle=False,random_state=92) 
print(f'Dataset matrix summary:\n {train_ratio} of dataset for training set with {X_train.shape[0]} measurements from {X_train.index[0]} until {X_train.index[-1]}\n {validation_ratio} of dataset for validation set with {X_val.shape[0]} measurements from {X_val.index[0]} until {X_val.index[-1]}\n {test_ratio} of measuerements for testing set with {X_test.shape[0]} measurements from {X_test.index[0]} until {X_test.index[-1]}')

Dataset matrix summary:
 0.75 of dataset for training set with 5646 measurements from 2020-09-24 13:00:00 until 2022-05-07 12:00:00
 0.15 of dataset for validation set with 1129 measurements from 2022-05-07 13:00:00 until 2022-09-20 07:00:00
 0.1 of measuerements for testing set with 753 measurements from 2022-09-20 15:00:00 until 2022-12-31 23:00:00


In [None]:
plots = Figures(save_path=results_path,
                figx=3.5,figy=2.5,
                marker_size=1,
                fs_label=15,fs_ticks=10,fs_legend=6,fs_title=10,
                show_plots=True)

Setting mpl rcparams


In [None]:
plots.boxplot_measurements(X_train,save_fig=False)

In [None]:
plots.geographical_network_visualization(map_path=f'{files_path}ll_autonomicas_inspire_peninbal_etrs89/',df_coordinates=dataset.coordinates.reindex(dataset.distances.index),
                                            show_legend=True,show_deployed_sensors=False,save_fig=False)



<Figure size 1050x750 with 1 Axes>

In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train_centered,full_matrices=False)

In [None]:
U.shape

(48, 48)

In [None]:
Vt.shape

(48, 5646)

In [None]:
sing_vals.shape

(48,)

In [None]:
print('\nDetermine signal sparsity from SVD decomposition.\nUse singular values ratios, cumulative energy, or reconstruction error for validation set.')
s_range = np.arange(1,sing_vals.shape[0]+1,1)
rmse_sparsity_train = signal_reconstruction_svd(U,snapshots_matrix_train,snapshots_matrix_train_centered,X_train,s_range)
rmse_sparsity_val = signal_reconstruction_svd(U,snapshots_matrix_train,snapshots_matrix_val_centered,X_val,s_range)
"""O3 Envea device: sigma=0.1ppb=1*1.96 ug/m3"""
rmse_threshold = (1*1.96)**2
signal_sparsity = np.argwhere(rmse_sparsity_val.median(axis=0).to_numpy()<=rmse_threshold)[0][0] + 1
print(f'Reconstruction error is lower than specified threshold {rmse_threshold} in validation set at sparsity of {signal_sparsity}.\nTraining set error of {rmse_sparsity_train.median(axis=0)[signal_sparsity]:.2f}\nValidation set error of {rmse_sparsity_val.median(axis=0)[signal_sparsity]:.2f}\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')


Determine signal sparsity from SVD decomposition.
Use singular values ratios, cumulative energy, or reconstruction error for validation set.
Determining signal sparsity by decomposing training set and reconstructing validation set.
Range of sparsity levels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]
Determining signal sparsity by decomposing training set and reconstructing validation set.
Range of sparsity levels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]
Reconstruction error is lower than specified threshold 3.8415999999999997 in validation set at sparsity of 44.
Training set error of 2.24
Validation set error of 3.62
Singular value ratio: 0.04
Cumulative energy: 0.98


In [None]:
cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
energy_threshold = 0.9
signal_sparsity_energy = np.where(cumulative_energy>=energy_threshold)[0][0]
print(f'Energy threshold of {energy_threshold} reached at singular at singular value index: {signal_sparsity_energy}')

Energy threshold of 0.9 reached at singular at singular value index: 36


In [None]:
fig_rmse_sparsity_val = plots.boxplot_validation_rmse_svd(rmse_sparsity_val,n=X_train.shape[1],max_sparsity_show=sing_vals.shape[0],synthetic_dataset=synthetic_dataset,save_fig=False)

NameError: name 'synthetic_dataset' is not defined

In [None]:
plots = Figures(save_path=results_path,marker_size=1,
                figx=3.5,figy=2.5,
                fs_label=13,fs_ticks=13,fs_legend=10,fs_title=10,
                show_plots=True)

Setting mpl rcparams


In [None]:
plots.singular_values_cumulative_energy(sing_vals,n = X_train.shape[1],synthetic_dataset=synthetic_dataset,save_fig=False)
plots.singular_values_cumulative_energy_sameFigure(sing_vals,n=X_train.shape[1],save_fig=False)

NameError: name 'synthetic_dataset' is not defined

In [None]:
plots.singular_values_cumulative_energy(sing_vals,n = X_train.shape[1],save_fig=False)
plots.singular_values_cumulative_energy_sameFigure(sing_vals,n=X_train.shape[1],save_fig=False)

In [None]:
fig_rmse_sparsity_val = plots.boxplot_validation_rmse_svd(rmse_sparsity_val,n=X_train.shape[1],max_sparsity_show=sing_vals.shape[0],save_fig=False)

In [None]:
signal_sparsity_hard_threshold = singular_value_hard_threshold(snapshots_matrix_train_centered,sing_vals)

In [None]:
signal_sparsity_hard_threshold

11

In [None]:
signal_sparsity_hard_threshold

11

In [None]:
cumulative_energy

array([0.21883735, 0.26948976, 0.31666068, 0.35773434, 0.39036916,
       0.41892548, 0.44551163, 0.4699467 , 0.4937151 , 0.51684317,
       0.53942861, 0.55966604, 0.57893366, 0.59758271, 0.61495165,
       0.63185912, 0.64840801, 0.66434772, 0.68007792, 0.69504263,
       0.70977883, 0.72438388, 0.73839369, 0.75197683, 0.76514356,
       0.77780906, 0.79028833, 0.80252316, 0.81445441, 0.82622002,
       0.83776026, 0.84904385, 0.8598556 , 0.87039313, 0.88083843,
       0.89112704, 0.90108601, 0.9109445 , 0.92076433, 0.93029401,
       0.93976443, 0.94913262, 0.95812993, 0.96682129, 0.97539564,
       0.98383447, 0.99206988, 1.        ])

In [None]:
cumulative_energy[signal_sparsity_hard_threshold]

0.5596660382170773

In [None]:
dataset.coordinates

Unnamed: 0_level_0,Latitude,Longitude
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Badalona,41.443985,2.237899
Eixample,41.385315,2.1538
Gracia,41.398724,2.153399
Ciutadella,41.386406,2.187398
Vall-Hebron,41.42611,2.148002
Palau-Reial,41.38749,2.1152
Fabra,41.41843,2.123897
Berga,42.0979,1.848201
Gava,41.303097,1.991498
Granollers,41.598682,2.287098


In [None]:
dataset.coordinates_distances

Name,Badalona,Eixample,Gracia,Ciutadella,Vall-Hebron,Palau-Reial,Fabra,Berga,Gava,Granollers,...,Sort,Alcover,Amposta,La-Senla,Constanti,Gandesa,Els-Guiamets,Reus,Tarragona,Vilaseca
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Badalona,0.0,9.586199,8.669933,7.662893,7.771388,12.024914,9.942444,79.529369,25.879498,17.665233,...,140.888116,90.40999,161.478753,186.488535,91.275756,156.68381,129.955628,99.11381,90.998512,98.768185
Eixample,9.586199,0.0,1.489601,2.812869,4.55663,3.237618,4.447338,83.127814,16.36775,26.181402,...,141.621474,82.38198,152.148496,177.244015,82.503692,148.24847,121.430188,90.439364,82.03295,89.856587
Gracia,8.669933,1.489601,0.0,3.155488,3.074832,3.429714,3.29767,81.700015,17.215382,24.856055,...,140.407434,82.567814,152.850795,177.897673,82.938258,148.573646,121.782376,90.838937,82.546409,90.348557
Ciutadella,7.662893,2.812869,3.155488,0.0,5.504204,6.039978,6.391098,83.910418,18.826352,25.002939,...,143.215273,85.182494,154.666756,179.804942,85.220378,151.009332,124.181512,93.172133,84.705379,92.54101
Vall-Hebron,7.771388,4.55663,3.074832,5.504204,0.0,5.091178,2.188123,78.669577,18.924,22.409576,...,137.711481,82.653611,153.990862,178.927252,83.543236,148.912738,122.188306,91.363358,83.317998,91.067963
Palau-Reial,12.024914,3.237618,3.429714,6.039978,5.091178,0.0,3.512376,81.970133,13.965518,27.499434,...,139.52668,79.220863,149.458235,174.488506,79.512156,145.171156,118.372077,87.417979,79.116904,86.91889
Fabra,9.942444,4.447338,3.29767,6.391098,2.188123,3.512376,0.0,78.873648,16.935244,24.216017,...,137.177916,80.513211,151.823282,176.749451,81.356639,146.743992,120.013185,89.179966,81.130263,88.879841
Berga,79.529369,83.127814,81.700015,83.910418,78.669577,81.970133,78.873648,0.0,89.078898,66.355313,...,68.413396,106.650141,187.315592,207.690205,117.171887,164.711902,143.421782,121.430618,120.062209,124.961448
Gava,25.879498,16.36775,17.215382,18.826352,18.924,13.965518,16.935244,89.078898,0.0,41.082823,...,141.804224,68.036812,135.841955,161.001333,66.931761,133.011644,106.11277,74.986688,66.178448,74.058333
Granollers,17.665233,26.181402,24.856055,25.002939,22.409576,27.499434,24.216017,66.355313,41.082823,0.0,...,131.224492,99.123283,174.079057,198.585314,102.135608,165.892346,139.62737,109.560118,102.513616,110.003787


In [None]:
dataset.ds.columns

Index(['O3_Ciutadella', 'O3_Eixample', 'O3_Gracia', 'O3_Sant-Adria',
       'O3_Vall-Hebron', 'O3_Palau-Reial', 'O3_Fabra', 'O3_Badalona',
       'O3_Montcada', 'O3_El-Prat', 'O3_Sant-Cugat', 'O3_Sant-Vicenç',
       'O3_Viladecans', 'O3_Rubi', 'O3_Gava', 'O3_Sabadell', 'O3_Terrassa',
       'O3_Granollers', 'O3_Mataro', 'O3_Santa-Maria', 'O3_Vilafranca',
       'O3_Sant-Celoni', 'O3_Vilanova', 'O3_Manresa', 'O3_Tona', 'O3_Igualada',
       'O3_Vic', 'O3_Manlleu', 'O3_Berga', 'O3_Tarragona', 'O3_Alcover',
       'O3_Constanti', 'O3_Santa-Pau', 'O3_Vilaseca', 'O3_Reus', 'O3_Ponts',
       'O3_Pardines', 'O3_Begur', 'O3_Bellver', 'O3_Juneda', 'O3_Els-Guiamets',
       'O3_Agullana', 'O3_Lleida', 'O3_Montsec', 'O3_Sort', 'O3_Gandesa',
       'O3_Amposta', 'O3_La-Senla'],
      dtype='object')

In [None]:
[i for i in dataset.coordinates.columns]

['Latitude', 'Longitude']

In [None]:
dataset.coordinates

Unnamed: 0_level_0,Latitude,Longitude
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Badalona,41.443985,2.237899
Eixample,41.385315,2.1538
Gracia,41.398724,2.153399
Ciutadella,41.386406,2.187398
Vall-Hebron,41.42611,2.148002
Palau-Reial,41.38749,2.1152
Fabra,41.41843,2.123897
Berga,42.0979,1.848201
Gava,41.303097,1.991498
Granollers,41.598682,2.287098


In [None]:
dataset.ds

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,68.0,57.0,58.0,76.0,80.0,72.0,83.0,68.0,64.0,81.0,...,82.0,66.0,65.0,78.0,58.0,78.0,77.0,59.0,74.0,79.0
2020-09-24 14:00:00,86.0,72.0,79.0,99.0,95.0,99.0,101.0,82.0,77.0,107.0,...,80.0,65.0,60.0,83.0,65.0,82.0,77.0,49.0,67.0,69.0
2020-09-24 15:00:00,87.0,81.0,83.0,97.0,95.0,95.0,101.0,84.0,72.0,92.0,...,82.0,50.0,53.0,84.0,53.0,81.0,78.0,48.0,54.0,58.0
2020-09-24 16:00:00,72.0,62.0,65.0,79.0,80.0,73.0,85.0,66.0,65.0,74.0,...,77.0,55.0,49.0,69.0,57.0,85.0,79.0,55.0,45.0,58.0
2020-09-24 17:00:00,56.0,50.0,53.0,61.0,63.0,59.0,68.0,47.0,55.0,65.0,...,83.0,60.0,57.0,73.0,57.0,88.0,79.0,56.0,50.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,30.0,25.0,29.0,18.0,50.0,25.0,79.0,5.0,26.0,47.0,...,44.0,6.0,51.0,26.0,1.0,82.0,4.0,57.0,14.0,47.0
2022-12-31 20:00:00,31.0,20.0,18.0,13.0,13.0,9.0,82.0,3.0,4.0,36.0,...,42.0,9.0,42.0,27.0,1.0,82.0,3.0,62.0,6.0,47.0
2022-12-31 21:00:00,28.0,11.0,8.0,6.0,6.0,1.0,82.0,1.0,3.0,2.0,...,14.0,11.0,47.0,29.0,1.0,81.0,4.0,48.0,5.0,49.0
2022-12-31 22:00:00,9.0,2.0,2.0,3.0,5.0,2.0,77.0,22.0,3.0,3.0,...,15.0,23.0,51.0,26.0,1.0,80.0,2.0,33.0,12.0,51.0


In [None]:
dataset.ds.loc[:,'O3_Sant-Vicenç']

2020-09-24 13:00:00    63.0
2020-09-24 14:00:00    74.0
2020-09-24 15:00:00    81.0
2020-09-24 16:00:00    68.0
2020-09-24 17:00:00    58.0
                       ... 
2022-12-31 19:00:00     7.0
2022-12-31 20:00:00     3.0
2022-12-31 21:00:00     2.0
2022-12-31 22:00:00     2.0
2022-12-31 23:00:00     6.0
Name: O3_Sant-Vicenç, Length: 7528, dtype: float64

In [None]:
dataset.files_path

'C:\\Users\\jp_lp\\Documents\\Scripts\\github\\IRNet/files/catalonia/'

In [None]:
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.stations_types = pd.read_csv(f'{self.files_path}stations_types.csv',index_col=0)
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km
            
        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
        if station_center not in [i for i in self.coordinates_distances.columns]:
            raise ValueError(f'Station used for center is not present in dataset')

        self.distances = dataset.coordinates_distances.loc[station_center]
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jp_lp\\Documents\\Scripts\\github\\IRNet/files/catalonia/stations_types.csv'

In [None]:
dataset = Dataset(pollutant,N,start_date,end_date,files_path)

In [None]:
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
dataset.coordinates

Unnamed: 0_level_0,Latitude,Longitude
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Badalona,41.443985,2.237899
Eixample,41.385315,2.1538
Gracia,41.398724,2.153399
Ciutadella,41.386406,2.187398
Vall-Hebron,41.42611,2.148002
Palau-Reial,41.38749,2.1152
Fabra,41.41843,2.123897
Berga,42.0979,1.848201
Gava,41.303097,1.991498
Granollers,41.598682,2.287098


In [None]:
dataset.coordinates_distances

Name,Badalona,Eixample,Gracia,Ciutadella,Vall-Hebron,Palau-Reial,Fabra,Berga,Gava,Granollers,...,Sort,Alcover,Amposta,La-Senla,Constanti,Gandesa,Els-Guiamets,Reus,Tarragona,Vilaseca
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Badalona,0.0,9.586199,8.669933,7.662893,7.771388,12.024914,9.942444,79.529369,25.879498,17.665233,...,140.888116,90.40999,161.478753,186.488535,91.275756,156.68381,129.955628,99.11381,90.998512,98.768185
Eixample,9.586199,0.0,1.489601,2.812869,4.55663,3.237618,4.447338,83.127814,16.36775,26.181402,...,141.621474,82.38198,152.148496,177.244015,82.503692,148.24847,121.430188,90.439364,82.03295,89.856587
Gracia,8.669933,1.489601,0.0,3.155488,3.074832,3.429714,3.29767,81.700015,17.215382,24.856055,...,140.407434,82.567814,152.850795,177.897673,82.938258,148.573646,121.782376,90.838937,82.546409,90.348557
Ciutadella,7.662893,2.812869,3.155488,0.0,5.504204,6.039978,6.391098,83.910418,18.826352,25.002939,...,143.215273,85.182494,154.666756,179.804942,85.220378,151.009332,124.181512,93.172133,84.705379,92.54101
Vall-Hebron,7.771388,4.55663,3.074832,5.504204,0.0,5.091178,2.188123,78.669577,18.924,22.409576,...,137.711481,82.653611,153.990862,178.927252,83.543236,148.912738,122.188306,91.363358,83.317998,91.067963
Palau-Reial,12.024914,3.237618,3.429714,6.039978,5.091178,0.0,3.512376,81.970133,13.965518,27.499434,...,139.52668,79.220863,149.458235,174.488506,79.512156,145.171156,118.372077,87.417979,79.116904,86.91889
Fabra,9.942444,4.447338,3.29767,6.391098,2.188123,3.512376,0.0,78.873648,16.935244,24.216017,...,137.177916,80.513211,151.823282,176.749451,81.356639,146.743992,120.013185,89.179966,81.130263,88.879841
Berga,79.529369,83.127814,81.700015,83.910418,78.669577,81.970133,78.873648,0.0,89.078898,66.355313,...,68.413396,106.650141,187.315592,207.690205,117.171887,164.711902,143.421782,121.430618,120.062209,124.961448
Gava,25.879498,16.36775,17.215382,18.826352,18.924,13.965518,16.935244,89.078898,0.0,41.082823,...,141.804224,68.036812,135.841955,161.001333,66.931761,133.011644,106.11277,74.986688,66.178448,74.058333
Granollers,17.665233,26.181402,24.856055,25.002939,22.409576,27.499434,24.216017,66.355313,41.082823,0.0,...,131.224492,99.123283,174.079057,198.585314,102.135608,165.892346,139.62737,109.560118,102.513616,110.003787


In [None]:
dataset.stations_types

Unnamed: 0_level_0,Area
Name,Unnamed: 1_level_1
Badalona,urban
Eixample,urban
Gracia,urban
Ciutadella,urban
Vall-Hebron,urban
Palau-Reial,urban
Fabra,suburban
Berga,suburban
Gava,suburban
Granollers,urban


In [None]:
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
dataset.stations_types

Unnamed: 0_level_0,Area
Name,Unnamed: 1_level_1
Badalona,urban
Eixample,urban
Gracia,urban
Ciutadella,urban
Vall-Hebron,urban
Palau-Reial,urban
Fabra,suburban
Berga,suburban
Gava,suburban
Granollers,urban


In [None]:
dataset.stations_types.unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

In [None]:
dataset.stations_types

Unnamed: 0_level_0,Area
Name,Unnamed: 1_level_1
Badalona,urban
Eixample,urban
Gracia,urban
Ciutadella,urban
Vall-Hebron,urban
Palau-Reial,urban
Fabra,suburban
Berga,suburban
Gava,suburban
Granollers,urban


In [None]:
dataset.stations_types.plot()

TypeError: no numeric data to plot

In [None]:
dataset.stations_types.value_counts

<bound method DataFrame.value_counts of                   Area
Name                  
Badalona         urban
Eixample         urban
Gracia           urban
Ciutadella       urban
Vall-Hebron      urban
Palau-Reial      urban
Fabra         suburban
Berga         suburban
Gava          suburban
Granollers       urban
Igualada      suburban
Manlleu       suburban
Manresa          urban
Mataro           urban
Montcada      suburban
El-Prat       suburban
Rubi             urban
Sabadell         urban
Sant-Adria       urban
Sant-Celoni   suburban
Sant-Cugat       urban
Santa-Maria      rural
Sant-Vicenç   suburban
Terrassa         urban
Tona             rural
Vic           suburban
Viladecans    suburban
Vilafranca    suburban
Vilanova      suburban
Agullana         rural
Begur            rural
Pardines         rural
Santa-Pau        rural
Bellver          rural
Juneda           rural
Lleida           urban
Ponts            rural
Montsec          rural
Sort             rural
Alcover          

In [None]:
dataset.stations_types.value_counts()

Area    
suburban    17
urban       16
rural       15
Name: count, dtype: int64

In [None]:
dataset.stations_types.value_counts().sum()

48

In [None]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
X_train, X_test = train_test_split(dataset.ds, test_size= 1 - train_ratio,shuffle=False,random_state=92)
X_val, X_test = train_test_split(X_test, test_size=test_ratio/(test_ratio + validation_ratio),shuffle=False,random_state=92) 
print(f'Dataset matrix summary:\n {train_ratio} of dataset for training set with {X_train.shape[0]} measurements from {X_train.index[0]} until {X_train.index[-1]}\n {validation_ratio} of dataset for validation set with {X_val.shape[0]} measurements from {X_val.index[0]} until {X_val.index[-1]}\n {test_ratio} of measuerements for testing set with {X_test.shape[0]} measurements from {X_test.index[0]} until {X_test.index[-1]}')

Dataset matrix summary:
 0.75 of dataset for training set with 5646 measurements from 2020-09-24 13:00:00 until 2022-05-07 12:00:00
 0.15 of dataset for validation set with 1129 measurements from 2022-05-07 13:00:00 until 2022-09-20 07:00:00
 0.1 of measuerements for testing set with 753 measurements from 2022-09-20 15:00:00 until 2022-12-31 23:00:00


In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train_centered,full_matrices=False)
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
print('\nDetermine signal sparsity from SVD decomposition.\nUse singular values ratios, cumulative energy, or reconstruction error for validation set.')
s_range = np.arange(1,sing_vals.shape[0]+1,1)
rmse_sparsity_train = signal_reconstruction_svd(U,snapshots_matrix_train,snapshots_matrix_train_centered,X_train,s_range)
rmse_sparsity_val = signal_reconstruction_svd(U,snapshots_matrix_train,snapshots_matrix_val_centered,X_val,s_range)
"""O3 Envea device: sigma=0.1ppb=1*1.96 ug/m3"""
rmse_threshold = (1*1.96)**2
signal_sparsity = np.argwhere(rmse_sparsity_val.median(axis=0).to_numpy()<=rmse_threshold)[0][0] + 1
print(f'Reconstruction error is lower than specified threshold {rmse_threshold} in validation set at sparsity of {signal_sparsity}.\nTraining set error of {rmse_sparsity_train.median(axis=0)[signal_sparsity]:.2f}\nValidation set error of {rmse_sparsity_val.median(axis=0)[signal_sparsity]:.2f}\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')


Determine signal sparsity from SVD decomposition.
Use singular values ratios, cumulative energy, or reconstruction error for validation set.
Determining signal sparsity by decomposing training set and reconstructing validation set.
Range of sparsity levels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]
Determining signal sparsity by decomposing training set and reconstructing validation set.
Range of sparsity levels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]
Reconstruction error is lower than specified threshold 3.8415999999999997 in validation set at sparsity of 44.
Training set error of 2.24
Validation set error of 3.62
Singular value ratio: 0.04
Cumulative energy: 0.98


In [None]:
cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
energy_threshold = 0.9
signal_sparsity_energy = np.where(cumulative_energy>=energy_threshold)[0][0]
print(f'Energy threshold of {energy_threshold} reached at singular at singular value index: {signal_sparsity_energy}')

Energy threshold of 0.9 reached at singular at singular value index: 36


In [None]:
signal_sparsity_hard_threshold = singular_value_hard_threshold(snapshots_matrix_train_centered,sing_vals)

In [None]:
signal_sparsity_hard_threshold

11

In [None]:
snapshots_matrix = snapshots_matrix_train_centered

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd

11

In [None]:
beta = beta**-1

In [None]:
beta

117.625

In [None]:
snapshots_matrix.shape

(48, 5646)

In [None]:
5646/48

117.625

In [None]:
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
sing_val_threshold

680055409.5540125

In [None]:
sing_vals

array([12384.8880509 ,  2866.6237969 ,  2669.59275818,  2324.5239868 ,
        1846.93626938,  1616.11711251,  1504.61751521,  1382.87919198,
        1345.14934948,  1308.91096636,  1278.20106559,  1145.31754148,
        1090.43239258,  1055.42482276,   982.97849541,   956.86188154,
         936.56817642,   902.09243949,   890.23564918,   846.91333482,
         833.98068927,   826.5587583 ,   792.87169014,   768.72492517,
         745.15811423,   716.79194554,   706.25202422,   692.41858028,
         675.23756504,   665.86282124,   653.10901894,   638.58380899,
         611.88033993,   596.36145398,   591.14138749,   582.27418458,
         563.61827231,   557.93173948,   555.7436168 ,   539.32304205,
         535.96922385,   530.18398407,   509.19356328,   491.87950199,
         485.25685339,   477.587175  ,   466.07545918,   448.7973367 ])

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)

array([], shape=(0, 1), dtype=int64)

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]

In [None]:
sparsity_gd

11

In [None]:
Psi.shape

NameError: name 'Psi' is not defined

In [None]:
print(f'Hard-threshold singular value: {signal_sparsity_hard_threshold}')

Hard-threshold singular value: 11


In [None]:
snapshots_matrix_train

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
S

NameError: name 'S' is not defined

In [None]:
U,S,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)

In [None]:
S

array([28456.0740296 ,  5818.15193871,  2846.72914491,  2517.9198153 ,
        1929.49808429,  1639.23774115,  1504.61870213,  1478.73816596,
        1382.51342987,  1328.90053859,  1303.20420585,  1152.81564605,
        1090.59444542,  1057.61443352,  1000.68155549,   966.00709694,
         956.10111867,   909.50403505,   898.1246297 ,   853.78099203,
         835.87077567,   827.62429964,   798.70648941,   772.68046481,
         752.15422557,   729.6399529 ,   710.97312123,   692.63046664,
         681.36330102,   668.98542556,   657.36029746,   639.89623964,
         612.48344355,   607.86691596,   592.16175795,   584.63550058,
         565.27382431,   558.0286537 ,   556.50072604,   540.56066461,
         537.07012627,   530.22005481,   510.19166776,   492.80934462,
         486.09294499,   478.81058096,   466.32916673,   449.07416505])

In [None]:
sing_vals

array([12384.8880509 ,  2866.6237969 ,  2669.59275818,  2324.5239868 ,
        1846.93626938,  1616.11711251,  1504.61751521,  1382.87919198,
        1345.14934948,  1308.91096636,  1278.20106559,  1145.31754148,
        1090.43239258,  1055.42482276,   982.97849541,   956.86188154,
         936.56817642,   902.09243949,   890.23564918,   846.91333482,
         833.98068927,   826.5587583 ,   792.87169014,   768.72492517,
         745.15811423,   716.79194554,   706.25202422,   692.41858028,
         675.23756504,   665.86282124,   653.10901894,   638.58380899,
         611.88033993,   596.36145398,   591.14138749,   582.27418458,
         563.61827231,   557.93173948,   555.7436168 ,   539.32304205,
         535.96922385,   530.18398407,   509.19356328,   491.87950199,
         485.25685339,   477.587175  ,   466.07545918,   448.7973367 ])

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)

In [None]:
singular_value_hard_threshold(snapshots_matrix_train_centered,sing_vals)

11

In [None]:
X_train

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,68.0,57.0,58.0,76.0,80.0,72.0,83.0,68.0,64.0,81.0,...,82.0,66.0,65.0,78.0,58.0,78.0,77.0,59.0,74.0,79.0
2020-09-24 14:00:00,86.0,72.0,79.0,99.0,95.0,99.0,101.0,82.0,77.0,107.0,...,80.0,65.0,60.0,83.0,65.0,82.0,77.0,49.0,67.0,69.0
2020-09-24 15:00:00,87.0,81.0,83.0,97.0,95.0,95.0,101.0,84.0,72.0,92.0,...,82.0,50.0,53.0,84.0,53.0,81.0,78.0,48.0,54.0,58.0
2020-09-24 16:00:00,72.0,62.0,65.0,79.0,80.0,73.0,85.0,66.0,65.0,74.0,...,77.0,55.0,49.0,69.0,57.0,85.0,79.0,55.0,45.0,58.0
2020-09-24 17:00:00,56.0,50.0,53.0,61.0,63.0,59.0,68.0,47.0,55.0,65.0,...,83.0,60.0,57.0,73.0,57.0,88.0,79.0,56.0,50.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,47.0,65.0,57.0,48.0,75.0,76.0,88.0,47.0,21.0,49.0,...,58.0,66.0,52.0,100.0,51.0,85.0,113.0,74.0,78.0,88.0
2022-05-07 09:00:00,69.0,58.0,57.0,56.0,73.0,73.0,84.0,45.0,45.0,55.0,...,102.0,69.0,66.0,102.0,54.0,85.0,110.0,75.0,78.0,89.0
2022-05-07 10:00:00,61.0,53.0,45.0,62.0,69.0,69.0,83.0,50.0,61.0,61.0,...,99.0,71.0,69.0,104.0,54.0,85.0,111.0,78.0,80.0,87.0
2022-05-07 11:00:00,55.0,58.0,51.0,70.0,75.0,75.0,86.0,67.0,73.0,72.0,...,98.0,74.0,75.0,107.0,59.0,87.0,112.0,79.0,79.0,89.0


In [None]:
dataset.ds

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,68.0,57.0,58.0,76.0,80.0,72.0,83.0,68.0,64.0,81.0,...,82.0,66.0,65.0,78.0,58.0,78.0,77.0,59.0,74.0,79.0
2020-09-24 14:00:00,86.0,72.0,79.0,99.0,95.0,99.0,101.0,82.0,77.0,107.0,...,80.0,65.0,60.0,83.0,65.0,82.0,77.0,49.0,67.0,69.0
2020-09-24 15:00:00,87.0,81.0,83.0,97.0,95.0,95.0,101.0,84.0,72.0,92.0,...,82.0,50.0,53.0,84.0,53.0,81.0,78.0,48.0,54.0,58.0
2020-09-24 16:00:00,72.0,62.0,65.0,79.0,80.0,73.0,85.0,66.0,65.0,74.0,...,77.0,55.0,49.0,69.0,57.0,85.0,79.0,55.0,45.0,58.0
2020-09-24 17:00:00,56.0,50.0,53.0,61.0,63.0,59.0,68.0,47.0,55.0,65.0,...,83.0,60.0,57.0,73.0,57.0,88.0,79.0,56.0,50.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,30.0,25.0,29.0,18.0,50.0,25.0,79.0,5.0,26.0,47.0,...,44.0,6.0,51.0,26.0,1.0,82.0,4.0,57.0,14.0,47.0
2022-12-31 20:00:00,31.0,20.0,18.0,13.0,13.0,9.0,82.0,3.0,4.0,36.0,...,42.0,9.0,42.0,27.0,1.0,82.0,3.0,62.0,6.0,47.0
2022-12-31 21:00:00,28.0,11.0,8.0,6.0,6.0,1.0,82.0,1.0,3.0,2.0,...,14.0,11.0,47.0,29.0,1.0,81.0,4.0,48.0,5.0,49.0
2022-12-31 22:00:00,9.0,2.0,2.0,3.0,5.0,2.0,77.0,22.0,3.0,3.0,...,15.0,23.0,51.0,26.0,1.0,80.0,2.0,33.0,12.0,51.0


In [None]:
snapshots_matrix

array([[ 26.13195182,  44.13195182,  45.13195182, ...,  19.13195182,
         13.13195182,  28.13195182],
       [ 18.68065887,  33.68065887,  42.68065887, ...,  14.68065887,
         19.68065887,   4.68065887],
       [ 15.80198371,  36.80198371,  40.80198371, ...,   2.80198371,
          8.80198371,   7.80198371],
       ...,
       [ -5.15356004, -15.15356004, -16.15356004, ...,  13.84643996,
         14.84643996,  16.84643996],
       [ 20.70120439,  13.70120439,   0.70120439, ...,  26.70120439,
         25.70120439,  30.70120439],
       [  6.47768332,  -3.52231668, -14.52231668, ...,  14.47768332,
         16.47768332,  14.47768332]])

In [None]:
snapshots_matrix_train

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
snapshots_matrix_full = dataset.ds.to_numpy().T

In [None]:
snapshots_matrix_full.shape

(48, 7528)

In [None]:
snapshtos_matrix_train.shape

NameError: name 'snapshtos_matrix_train' is not defined

In [None]:
snapshots_matrix_train.shape

(48, 5646)

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_full,full_matrices=False)

In [None]:
U.shape

(48, 48)

In [None]:
sing_vals.shape

(48,)

In [None]:
sing_vals

array([33876.08769582,  6725.59897475,  3363.06049558,  2869.35826467,
        2225.83869134,  1934.12495062,  1789.45440347,  1661.02756698,
        1596.30049713,  1532.78598889,  1506.8120964 ,  1372.56522078,
        1259.20116278,  1248.5360968 ,  1186.3146679 ,  1138.46624255,
        1116.10724696,  1089.76609808,  1050.97361496,   995.08456115,
         991.58203837,   967.48530575,   938.09007898,   892.30419152,
         867.06137546,   847.86361235,   823.31536552,   809.23492557,
         802.61787182,   782.69534034,   781.43986367,   767.7300943 ,
         724.12445366,   715.95167676,   702.45300302,   676.99180481,
         667.90403101,   663.02541866,   660.54343636,   643.34311082,
         628.04450469,   620.20016636,   610.41569988,   595.62435841,
         575.6011888 ,   562.38378689,   554.98801081,   534.4401553 ])

In [None]:
singular_value_hard_threshold(snapshots_matrix_train_centered,sing_vals)

11

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train_centered,full_matrices=False)

In [None]:
signal_sparsity_hard_threshold = singular_value_hard_threshold(snapshots_matrix_train_centered,sing_vals)

In [None]:
signal_sparsity_hard_threshold

11

In [None]:
plt.plot(sing_vals)

[<matplotlib.lines.Line2D at 0x22f123e0f90>]

In [None]:
plt.plot(np.log(sing_vals))

[<matplotlib.lines.Line2D at 0x22f14ca4050>]

In [None]:
cumulative_energy

array([0.21883735, 0.26948976, 0.31666068, 0.35773434, 0.39036916,
       0.41892548, 0.44551163, 0.4699467 , 0.4937151 , 0.51684317,
       0.53942861, 0.55966604, 0.57893366, 0.59758271, 0.61495165,
       0.63185912, 0.64840801, 0.66434772, 0.68007792, 0.69504263,
       0.70977883, 0.72438388, 0.73839369, 0.75197683, 0.76514356,
       0.77780906, 0.79028833, 0.80252316, 0.81445441, 0.82622002,
       0.83776026, 0.84904385, 0.8598556 , 0.87039313, 0.88083843,
       0.89112704, 0.90108601, 0.9109445 , 0.92076433, 0.93029401,
       0.93976443, 0.94913262, 0.95812993, 0.96682129, 0.97539564,
       0.98383447, 0.99206988, 1.        ])

In [None]:
np.log(cumulative_energy)

array([-1.51942651, -1.3112249 , -1.14992449, -1.02796464, -0.94066242,
       -0.87006223, -0.80853192, -0.75513599, -0.70579665, -0.66001579,
       -0.61724483, -0.58041503, -0.54656739, -0.51486258, -0.48621163,
       -0.45908882, -0.43323514, -0.4089496 , -0.3855479 , -0.3637821 ,
       -0.34280187, -0.32243381, -0.30327814, -0.28504976, -0.2676918 ,
       -0.2512742 , -0.23535743, -0.21999456, -0.20523682, -0.19089418,
       -0.1770233 , -0.16364444, -0.15099081, -0.1388103 , -0.12688107,
       -0.11526828, -0.10415457, -0.0932733 , -0.08255116, -0.0722546 ,
       -0.06212604, -0.05220674, -0.04277189, -0.03374161, -0.02491211,
       -0.01629762, -0.00796173,  0.        ])

In [None]:
plt.plot(np.log(sing_vals))

[<matplotlib.lines.Line2D at 0x22f14354cd0>]

In [None]:
Psi.shape

NameError: name 'Psi' is not defined

In [None]:
signal_sparsity_hard_threshold

11

In [None]:
Psi = U[:,:signal_sparsity_hard_threshold]

In [None]:
Psi.shape

(48, 11)

In [None]:
Psi@Psi.T@snapshots_matrix_train_centered

array([[28.37664858, 48.03982208, 49.84247794, ..., 17.0414734 ,
        19.87207263, 23.71133242],
       [20.56357222, 37.2263938 , 42.01108129, ...,  9.65660147,
        11.91374383, 15.61340448],
       [20.50613737, 38.09502017, 43.12226562, ..., 10.26501548,
        13.0616528 , 17.38018372],
       ...,
       [ 2.33210944, -1.19319995, -6.26161578, ..., 14.58653988,
        13.80838848, 15.18353181],
       [16.77341254, 11.43891201,  3.21515938, ..., 28.94107252,
        30.62068502, 36.89795727],
       [ 4.84742539,  2.68920551, -2.93935556, ..., 21.14419496,
        23.25901033, 25.00639234]])

In [None]:
Psi@np.diag(sing_vals[:signal_sparsity_hard_threshold])

array([[-1.91158912e+03, -6.17301629e+02,  6.82205248e+01,
         1.35245942e+02, -1.69286564e+02,  3.36323761e+02,
        -3.67647182e+02,  6.88751302e+01, -7.28055552e+00,
         1.37853207e+02, -5.66639363e+01],
       [-1.57900638e+03, -5.87408352e+02, -1.13532462e+02,
        -1.58874194e+02, -1.54757803e+02,  2.94687944e+02,
        -3.04232128e+02,  4.60726275e+01, -2.80081027e+02,
         4.34317674e+01, -2.47614899e+01],
       [-1.56186072e+03, -6.31709753e+02, -7.27254662e+01,
        -1.83393083e+02, -2.18725422e+02,  2.27526075e+02,
        -3.69399376e+02,  4.35459062e+01, -2.05761830e+02,
        -2.54953557e+00, -2.32844972e+01],
       [-2.28884477e+03, -5.00161741e+02,  3.67609603e+02,
         2.48208269e+02, -4.96570944e+01,  3.36855496e+02,
        -1.76076802e+02, -1.58777166e+02,  2.48171488e+02,
         1.00024027e+02, -2.45372598e+02],
       [-1.80181535e+03, -5.07119179e+02,  2.44993284e+01,
        -1.90968926e+02, -3.95239819e+02, -1.58585606e+01,
  

In [None]:
Vt.shape

(48, 5646)

In [None]:
Psi@np.diag(sing_vals[:signal_sparsity_hard_threshold])@Vt[:signal_sparsity_hard_threshold,:]

array([[28.37664858, 48.03982208, 49.84247794, ..., 17.0414734 ,
        19.87207263, 23.71133242],
       [20.56357222, 37.2263938 , 42.01108129, ...,  9.65660147,
        11.91374383, 15.61340448],
       [20.50613737, 38.09502017, 43.12226562, ..., 10.26501548,
        13.0616528 , 17.38018372],
       ...,
       [ 2.33210944, -1.19319995, -6.26161578, ..., 14.58653988,
        13.80838848, 15.18353181],
       [16.77341254, 11.43891201,  3.21515938, ..., 28.94107252,
        30.62068502, 36.89795727],
       [ 4.84742539,  2.68920551, -2.93935556, ..., 21.14419496,
        23.25901033, 25.00639234]])

In [None]:
Psi@np.diag(sing_vals[:signal_sparsity_hard_threshold])@Vt[:signal_sparsity_hard_threshold,:]

array([[28.37664858, 48.03982208, 49.84247794, ..., 17.0414734 ,
        19.87207263, 23.71133242],
       [20.56357222, 37.2263938 , 42.01108129, ...,  9.65660147,
        11.91374383, 15.61340448],
       [20.50613737, 38.09502017, 43.12226562, ..., 10.26501548,
        13.0616528 , 17.38018372],
       ...,
       [ 2.33210944, -1.19319995, -6.26161578, ..., 14.58653988,
        13.80838848, 15.18353181],
       [16.77341254, 11.43891201,  3.21515938, ..., 28.94107252,
        30.62068502, 36.89795727],
       [ 4.84742539,  2.68920551, -2.93935556, ..., 21.14419496,
        23.25901033, 25.00639234]])

In [None]:
np.allclose(Psi@np.diag(sing_vals[:signal_sparsity_hard_threshold])@Vt[:signal_sparsity_hard_threshold,:],Psi@Psi.T@snapshots_matrix_train_centered)

True

In [None]:
beta

0.008501594048884165

In [None]:
beta = 0.10

In [None]:
t1 = 2*(beta+1)

In [None]:
t2 = (8*beta)/((beta+1)+np.sqrt((beta**2+14*beta+1)))

In [None]:
lambda_beta = np.sqrt(t1+t2)

In [None]:
lambda_beta

1.5816483953196852

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]

In [None]:
snapshots_matrix.shape

(48, 5646)

In [None]:
snapshots_matrix_train.shape

(48, 5646)

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]

In [None]:
betaç

NameError: name 'betaç' is not defined

In [None]:
beta

0.008501594048884165

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta)/((beta+1)+np.sqrt((beta**2+14*beta+1)))
lambda_beta = np.sqrt(t1+t2)

In [None]:
sparsity_gd = lambda_beta*noise*np.sqrt(snapshots_matrix.shape[1])

NameError: name 'noise' is not defined

In [None]:
noise

NameError: name 'noise' is not defined

In [None]:
noise = 0.1*ppb

NameError: name 'ppb' is not defined

In [None]:
ppb = 1.96

In [None]:
0.1*ppb

0.196

In [None]:
noise = (0.1*ppb)**2

In [None]:
noise

0.038416000000000006

In [None]:
sparsity_gd = lambda_beta*noise*np.sqrt(snapshots_matrix.shape[1])

In [None]:
sparsity_gd

4.132860474217467

In [None]:
lambda_beta

1.4317532054115514

In [None]:
lambda_beta*np.sqrt(noise*snapshots_matrix.shape[1])

21.086022827640136

In [None]:
lambda_beta*np.sqrt(noise*snapshots_matrix.shape[0])

1.9442175278071667

In [None]:
lambda_beta*noise*np.sqrt(snapshots_matrix.shape[0])

0.3810666354502047

In [None]:
max(2,3)

3

In [None]:
max(snapshots_matrix.shape)

5646

In [None]:
sparsity_gd = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

In [None]:
sparsity_gd

4.132860474217467

In [None]:
sing_vals[:sparsity_gd]

TypeError: slice indices must be integers or None or have an __index__ method

In [None]:
int(sparsity_gd)

4

In [None]:
sparsity_gd = int(sparsity_gd)

In [None]:
sparsity_gd

4

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd

11

In [None]:
Psi = U[:,:sparsity_gd]

In [None]:
Psi

array([[-0.15434852, -0.215341  ,  0.02555466,  0.05818221, -0.09165804,
         0.20810606, -0.24434594,  0.0498056 , -0.00541245,  0.10531901,
        -0.044331  ],
       [-0.1274946 , -0.20491295, -0.04252801, -0.06834698, -0.08379163,
         0.18234319, -0.20219898,  0.03331645, -0.20821556,  0.03318161,
        -0.01937214],
       [-0.1261102 , -0.22036716, -0.02724216, -0.0788949 , -0.11842608,
         0.14078564, -0.24551048,  0.03148931, -0.15296579, -0.00194783,
        -0.01821662],
       [-0.18480948, -0.17447764,  0.1377025 ,  0.10677811, -0.0268862 ,
         0.20843508, -0.11702429, -0.11481637,  0.18449363,  0.07641775,
        -0.19196714],
       [-0.14548499, -0.17690468,  0.00917718, -0.08215399, -0.21399754,
        -0.00981275, -0.00167648,  0.13090189, -0.07723022, -0.12952212,
        -0.10156199],
       [-0.13245756, -0.21308699, -0.04677631, -0.1012768 , -0.17535509,
         0.03266509, -0.10097679,  0.06563172, -0.1527322 , -0.06926127,
         0.035

In [None]:
snapshots_matrix_reconstructed = Psi@Psi.T@snapshots_matrix_train_centered

In [None]:
snapshots_matrix_train_centered

array([[ 26.13195182,  44.13195182,  45.13195182, ...,  19.13195182,
         13.13195182,  28.13195182],
       [ 18.68065887,  33.68065887,  42.68065887, ...,  14.68065887,
         19.68065887,   4.68065887],
       [ 15.80198371,  36.80198371,  40.80198371, ...,   2.80198371,
          8.80198371,   7.80198371],
       ...,
       [ -5.15356004, -15.15356004, -16.15356004, ...,  13.84643996,
         14.84643996,  16.84643996],
       [ 20.70120439,  13.70120439,   0.70120439, ...,  26.70120439,
         25.70120439,  30.70120439],
       [  6.47768332,  -3.52231668, -14.52231668, ...,  14.47768332,
         16.47768332,  14.47768332]])

In [None]:
snapshots_matrix_reconstructed

array([[28.37664858, 48.03982208, 49.84247794, ..., 17.0414734 ,
        19.87207263, 23.71133242],
       [20.56357222, 37.2263938 , 42.01108129, ...,  9.65660147,
        11.91374383, 15.61340448],
       [20.50613737, 38.09502017, 43.12226562, ..., 10.26501548,
        13.0616528 , 17.38018372],
       ...,
       [ 2.33210944, -1.19319995, -6.26161578, ..., 14.58653988,
        13.80838848, 15.18353181],
       [16.77341254, 11.43891201,  3.21515938, ..., 28.94107252,
        30.62068502, 36.89795727],
       [ 4.84742539,  2.68920551, -2.93935556, ..., 21.14419496,
        23.25901033, 25.00639234]])

In [None]:
mean

NameError: name 'mean' is not defined

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(snapshots_matrix_train_centered,snapshots_matrix_reconstructed)

72.60122908537252

In [None]:
plt.plot(sing_vals)

[<matplotlib.lines.Line2D at 0x22f14d20d50>]

In [None]:
plt.plot(np.log(sing_vals))

[<matplotlib.lines.Line2D at 0x22f1918d450>]

In [None]:
snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_train_centered) + snapshots_matrix_train.mean(axis=1)[:,None]

In [None]:
X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
X_pred_svd.columns = X_dataset.columns
X_pred_svd.index = X_dataset.index

NameError: name 'X_dataset' is not defined

In [None]:
X_dataset = X_train

In [None]:
X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
X_pred_svd.columns = X_dataset.columns
X_pred_svd.index = X_dataset.index

In [None]:
error = X_dataset - X_pred_svd

In [None]:
error

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,-2.244697,-1.882913,-4.704154,3.444375,3.359821,-2.665101,3.242487,0.484321,-0.184410,0.579165,...,2.877926,1.758549,1.112524,0.138081,-2.456695,-5.931652,-1.843494,-7.485669,3.927792,1.630258
2020-09-24 14:00:00,-3.907870,-3.545735,-1.293036,8.122316,1.599789,7.620237,11.008230,-1.342175,1.788480,9.386004,...,-0.562098,5.691470,-0.881984,4.877463,7.854157,-3.604601,-1.313830,-13.960360,2.262292,-6.211522
2020-09-24 15:00:00,-4.710526,0.669578,-2.320282,6.837584,2.552147,1.688222,8.192162,-2.433900,0.837070,4.394382,...,2.875097,-1.016786,-1.703290,6.875131,1.373554,-8.334972,-1.027988,-9.891944,-2.513955,-11.582961
2020-09-24 16:00:00,-2.419019,-1.340422,-2.148953,3.870203,7.259985,-0.193979,3.357574,-3.913399,2.710948,0.193339,...,-3.389905,0.572424,-8.365602,-1.639846,3.686353,-2.847270,4.728050,-3.213755,-8.113705,-8.737648
2020-09-24 17:00:00,-3.141120,0.299095,0.744572,1.443809,4.194356,0.340086,-4.706525,-6.908505,1.163660,2.783829,...,0.070105,0.618922,-3.285017,2.863482,0.393866,2.214081,1.001500,-4.747109,-3.605376,-10.414835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,-9.878063,6.801414,-4.464166,5.012281,6.753188,3.804650,9.085192,-5.645617,-6.914880,9.709069,...,-6.430799,11.698444,-14.146598,-1.284833,6.133137,-9.225711,5.446827,-5.440483,3.828897,-0.632372
2022-05-07 09:00:00,6.301471,2.054927,-2.322215,1.608717,-0.284366,1.085384,4.951061,-10.520799,3.753427,-6.130908,...,18.603900,6.342861,-4.358549,-2.129222,3.643346,-8.395900,-8.773820,-4.793019,1.113226,-2.731802
2022-05-07 10:00:00,2.090478,5.024057,-7.463032,0.179904,-2.971150,2.454355,5.898819,-10.782641,8.717399,-6.661137,...,8.676965,3.515473,-2.610092,2.470307,-0.871961,-6.927247,-2.023071,-0.740100,-2.239868,-6.666512
2022-05-07 11:00:00,-6.740121,7.766915,-4.259669,-0.015081,-4.738894,3.458744,1.550553,-1.884960,9.909515,-3.814944,...,3.120072,3.994498,3.861678,3.030031,-0.379352,-7.913707,0.381691,1.038051,-4.919481,-6.781327


In [None]:
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

NameError: name 's' is not defined

In [None]:
s = sparsity_gd

In [None]:
s

11

In [None]:
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

In [None]:
mse

Unnamed: 0,11
2020-09-24 13:00:00,18.100394
2020-09-24 14:00:00,27.045276
2020-09-24 15:00:00,46.784910
2020-09-24 16:00:00,35.317908
2020-09-24 17:00:00,28.934244
...,...
2022-05-07 08:00:00,58.101237
2022-05-07 09:00:00,44.429807
2022-05-07 10:00:00,45.179215
2022-05-07 11:00:00,35.002320


In [None]:
mse.mean()

11    72.601229
dtype: float64

In [None]:
mean_squared_error(X_dataset,X_pred_svd)

72.60122908537254

In [None]:
error_variance = error.var(axis=0,ddof=0)

In [None]:
error_variance

O3_Ciutadella       82.976223
O3_Eixample         54.970461
O3_Gracia           53.588882
O3_Sant-Adria       54.175549
O3_Vall-Hebron      91.882776
O3_Palau-Reial      71.737552
O3_Fabra            76.569617
O3_Badalona         61.266468
O3_Montcada         57.690186
O3_El-Prat          71.426429
O3_Sant-Cugat       50.456398
O3_Sant-Vicenç      62.324816
O3_Viladecans       40.965682
O3_Rubi             76.320968
O3_Gava             46.737490
O3_Sabadell         78.864119
O3_Terrassa         66.289464
O3_Granollers       80.348671
O3_Mataro           84.696759
O3_Santa-Maria      55.682414
O3_Vilafranca       83.938459
O3_Sant-Celoni      63.290960
O3_Vilanova         89.731777
O3_Manresa          69.994772
O3_Tona             81.629855
O3_Igualada         75.077782
O3_Vic              68.908674
O3_Manlleu          73.059723
O3_Berga            74.325562
O3_Tarragona        69.404525
O3_Alcover          93.981805
O3_Constanti        58.770707
O3_Santa-Pau        83.701654
O3_Vilasec

In [None]:
error_variance.mean()

72.60122908537252

In [None]:
error.mean().mean()

1.0487406065652679e-16

In [None]:
error.mean(axis=0).mean()

1.0487406065652679e-16

In [None]:
error.mean(axis=1).mean()

1.0193758695814402e-16

In [None]:
(error**2).mean(axis=1).mean()

72.60122908537252

In [None]:
(error**2).mean(axis=0).mean()

72.60122908537254

In [None]:
noise

0.038416000000000006

In [None]:
(0.1*ppb)**2

0.038416000000000006

In [None]:
noise

0.038416000000000006

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sparsity_gd = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

In [None]:
sparsity_gd

4.132860474217467

In [None]:
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sparsity_gd

4

In [None]:
s = sparsity_gd

In [None]:
Psi = U[:,:s]

In [None]:
Psi.shape

(48, 4)

In [None]:
Psi.shape

(48, 4)

In [None]:
Psi

array([[-0.15434852, -0.215341  ,  0.02555466,  0.05818221],
       [-0.1274946 , -0.20491295, -0.04252801, -0.06834698],
       [-0.1261102 , -0.22036716, -0.02724216, -0.0788949 ],
       [-0.18480948, -0.17447764,  0.1377025 ,  0.10677811],
       [-0.14548499, -0.17690468,  0.00917718, -0.08215399],
       [-0.13245756, -0.21308699, -0.04677631, -0.1012768 ],
       [-0.070118  , -0.02962604, -0.0545831 , -0.46190127],
       [-0.16487182, -0.1670671 ,  0.05818345, -0.0346562 ],
       [-0.16940714, -0.05291946,  0.15970181,  0.07412046],
       [-0.18311312, -0.1579286 ,  0.09090321,  0.17331298],
       [-0.18183989, -0.06761354,  0.08370092,  0.05510975],
       [-0.16717473, -0.05796793,  0.09436869,  0.06014793],
       [-0.15492054, -0.15473854, -0.0649019 , -0.08138948],
       [-0.18131372, -0.09250356,  0.07243072,  0.0464518 ],
       [-0.16036146, -0.12552603, -0.00526367, -0.01622941],
       [-0.14072122, -0.0792501 ,  0.01397454, -0.12162493],
       [-0.12109908, -0.

In [None]:
Psi.shape

(48, 4)

In [None]:
snapshots_matrix_centered = snapshots_matrix_train_centered

In [None]:
snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]

In [None]:
X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
X_pred_svd.columns = X_dataset.columns
X_pred_svd.index = X_dataset.index

In [None]:
X_pred_svd

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,70.866437,58.409798,62.473207,77.627921,74.757729,71.150907,80.120123,71.145205,66.154215,80.090989,...,74.516365,66.983223,71.814752,72.241113,60.268576,88.018773,61.838067,71.795275,67.495071,78.318920
2020-09-24 14:00:00,84.982714,71.119687,76.128871,93.485730,88.590639,84.694783,88.989101,85.948961,77.999300,93.422692,...,69.970465,64.774579,70.464920,68.566213,60.608554,88.206930,49.829929,69.851608,65.047861,76.646234
2020-09-24 15:00:00,81.471807,68.965272,74.292698,90.048947,86.856936,82.808698,92.851191,83.796921,75.583677,88.674215,...,65.560064,59.736725,67.471503,67.285633,56.748833,90.610905,44.241920,67.024833,58.891582,74.824216
2020-09-24 16:00:00,66.510772,54.125734,58.721507,75.174481,71.613445,66.920616,81.160294,68.401965,65.263336,75.775845,...,71.468413,58.014186,64.455232,67.621795,53.158964,87.725604,54.625461,63.736575,56.740121,73.558630
2020-09-24 17:00:00,53.865016,42.196646,46.026909,61.898766,59.115531,54.195168,72.701391,55.441214,55.918471,64.466391,...,76.881743,59.103607,64.402434,70.158482,52.246215,86.978204,65.451471,63.891064,57.395682,74.219895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,43.862131,43.491565,46.459728,37.397052,54.289243,56.583174,86.196869,43.347882,29.204977,42.550326,...,49.032601,63.733287,78.462860,80.325027,54.247066,100.643134,56.595726,83.410359,69.749791,84.051336
2022-05-07 09:00:00,51.228188,46.247898,49.040786,50.018811,59.300785,58.946045,82.048454,51.289027,42.625162,55.412800,...,67.103922,72.227876,80.843619,82.994985,62.173047,97.834792,70.545246,84.279777,74.553197,85.510231
2022-05-07 10:00:00,55.363842,46.786815,49.590080,59.190264,62.189404,59.209879,79.627329,56.703683,53.314360,64.099979,...,81.535190,76.715187,80.780148,84.371824,66.791347,96.183347,80.414604,82.824109,74.974742,85.471011
2022-05-07 11:00:00,62.772088,52.659962,55.819165,70.203126,70.319084,65.570716,86.748066,66.427152,64.585577,73.312271,...,89.247820,80.961217,82.913596,87.280768,72.233830,99.642041,83.402676,84.487772,74.950099,86.788222


In [None]:
X_train

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,68.0,57.0,58.0,76.0,80.0,72.0,83.0,68.0,64.0,81.0,...,82.0,66.0,65.0,78.0,58.0,78.0,77.0,59.0,74.0,79.0
2020-09-24 14:00:00,86.0,72.0,79.0,99.0,95.0,99.0,101.0,82.0,77.0,107.0,...,80.0,65.0,60.0,83.0,65.0,82.0,77.0,49.0,67.0,69.0
2020-09-24 15:00:00,87.0,81.0,83.0,97.0,95.0,95.0,101.0,84.0,72.0,92.0,...,82.0,50.0,53.0,84.0,53.0,81.0,78.0,48.0,54.0,58.0
2020-09-24 16:00:00,72.0,62.0,65.0,79.0,80.0,73.0,85.0,66.0,65.0,74.0,...,77.0,55.0,49.0,69.0,57.0,85.0,79.0,55.0,45.0,58.0
2020-09-24 17:00:00,56.0,50.0,53.0,61.0,63.0,59.0,68.0,47.0,55.0,65.0,...,83.0,60.0,57.0,73.0,57.0,88.0,79.0,56.0,50.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,47.0,65.0,57.0,48.0,75.0,76.0,88.0,47.0,21.0,49.0,...,58.0,66.0,52.0,100.0,51.0,85.0,113.0,74.0,78.0,88.0
2022-05-07 09:00:00,69.0,58.0,57.0,56.0,73.0,73.0,84.0,45.0,45.0,55.0,...,102.0,69.0,66.0,102.0,54.0,85.0,110.0,75.0,78.0,89.0
2022-05-07 10:00:00,61.0,53.0,45.0,62.0,69.0,69.0,83.0,50.0,61.0,61.0,...,99.0,71.0,69.0,104.0,54.0,85.0,111.0,78.0,80.0,87.0
2022-05-07 11:00:00,55.0,58.0,51.0,70.0,75.0,75.0,86.0,67.0,73.0,72.0,...,98.0,74.0,75.0,107.0,59.0,87.0,112.0,79.0,79.0,89.0


In [None]:
error = X_dataset - X_pred_svd
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

In [None]:
error_variance = error.var(axis=0,ddof=0) # estiamted coordinate error variance

In [None]:
error

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,-2.866437,-1.409798,-4.473207,-1.627921,5.242271,0.849093,2.879877,-3.145205,-2.154215,0.909011,...,7.483635,-0.983223,-6.814752,5.758887,-2.268576,-10.018773,15.161933,-12.795275,6.504929,0.681080
2020-09-24 14:00:00,1.017286,0.880313,2.871129,5.514270,6.409361,14.305217,12.010899,-3.948961,-0.999300,13.577308,...,10.029535,0.225421,-10.464920,14.433787,4.391446,-6.206930,27.170071,-20.851608,1.952139,-7.646234
2020-09-24 15:00:00,5.528193,12.034728,8.707302,6.951053,8.143064,12.191302,8.148809,0.203079,-3.583677,3.325785,...,16.439936,-9.736725,-14.471503,16.714367,-3.748833,-9.610905,33.758080,-19.024833,-4.891582,-16.824216
2020-09-24 16:00:00,5.489228,7.874266,6.278493,3.825519,8.386555,6.079384,3.839706,-2.401965,-0.263336,-1.775845,...,5.531587,-3.014186,-15.455232,1.378205,3.841036,-2.725604,24.374539,-8.736575,-11.740121,-15.558630
2020-09-24 17:00:00,2.134984,7.803354,6.973091,-0.898766,3.884469,4.804832,-4.701391,-8.441214,-0.918471,0.533609,...,6.118257,0.896393,-7.402434,2.841518,4.753785,1.021796,13.548529,-7.891064,-7.395682,-17.219895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,3.137869,21.508435,10.540272,10.602948,20.710757,19.416826,1.803131,3.652118,-8.204977,6.449674,...,8.967399,2.266713,-26.462860,19.674973,-3.247066,-15.643134,56.404274,-9.410359,8.250209,3.948664
2022-05-07 09:00:00,17.771812,11.752102,7.959214,5.981189,13.699215,14.053955,1.951546,-6.289027,2.374838,-0.412800,...,34.896078,-3.227876,-14.843619,19.005015,-8.173047,-12.834792,39.454754,-9.279777,3.446803,3.489769
2022-05-07 10:00:00,5.636158,6.213185,-4.590080,2.809736,6.810596,9.790121,3.372671,-6.703683,7.685640,-3.099979,...,17.464810,-5.715187,-11.780148,19.628176,-12.791347,-11.183347,30.585396,-4.824109,5.025258,1.528989
2022-05-07 11:00:00,-7.772088,5.340038,-4.819165,-0.203126,4.680916,9.429284,-0.748066,0.572848,8.414423,-1.312271,...,8.752180,-6.961217,-7.913596,19.719232,-13.233830,-12.642041,28.597324,-5.487772,4.049901,2.211778


In [None]:
mse

Unnamed: 0,4
2020-09-24 13:00:00,38.591466
2020-09-24 14:00:00,76.455611
2020-09-24 15:00:00,112.688437
2020-09-24 16:00:00,62.174073
2020-09-24 17:00:00,46.687411
...,...
2022-05-07 08:00:00,198.458829
2022-05-07 09:00:00,180.940441
2022-05-07 10:00:00,121.954529
2022-05-07 11:00:00,109.946119


In [None]:
mse.mean()

4    129.262691
dtype: float64

In [None]:
error_variance

O3_Ciutadella      136.810293
O3_Eixample        105.699414
O3_Gracia          103.331650
O3_Sant-Adria      108.010527
O3_Vall-Hebron     135.387428
O3_Palau-Reial     105.653723
O3_Fabra            90.858704
O3_Badalona        109.743171
O3_Montcada         75.395332
O3_El-Prat         113.166207
O3_Sant-Cugat       78.942156
O3_Sant-Vicenç      91.601650
O3_Viladecans      129.792243
O3_Rubi            109.752019
O3_Gava            166.587472
O3_Sabadell        117.804202
O3_Terrassa        145.092515
O3_Granollers      131.306719
O3_Mataro          149.289994
O3_Santa-Maria     126.667570
O3_Vilafranca      108.686446
O3_Sant-Celoni     133.834764
O3_Vilanova        117.115638
O3_Manresa         114.698217
O3_Tona            123.372867
O3_Igualada        124.379275
O3_Vic             115.900612
O3_Manlleu         104.590998
O3_Berga           123.999522
O3_Tarragona       176.457298
O3_Alcover         146.968258
O3_Constanti        96.825094
O3_Santa-Pau       222.277266
O3_Vilasec

In [None]:
error_variance.mean()

129.26269105531375

In [None]:
mean_squared_error(X_dataset,X_pred_svd)

129.26269105531375

In [None]:
error_variance.mean()

129.26269105531375

In [None]:
error_variance

O3_Ciutadella      136.810293
O3_Eixample        105.699414
O3_Gracia          103.331650
O3_Sant-Adria      108.010527
O3_Vall-Hebron     135.387428
O3_Palau-Reial     105.653723
O3_Fabra            90.858704
O3_Badalona        109.743171
O3_Montcada         75.395332
O3_El-Prat         113.166207
O3_Sant-Cugat       78.942156
O3_Sant-Vicenç      91.601650
O3_Viladecans      129.792243
O3_Rubi            109.752019
O3_Gava            166.587472
O3_Sabadell        117.804202
O3_Terrassa        145.092515
O3_Granollers      131.306719
O3_Mataro          149.289994
O3_Santa-Maria     126.667570
O3_Vilafranca      108.686446
O3_Sant-Celoni     133.834764
O3_Vilanova        117.115638
O3_Manresa         114.698217
O3_Tona            123.372867
O3_Igualada        124.379275
O3_Vic             115.900612
O3_Manlleu         104.590998
O3_Berga           123.999522
O3_Tarragona       176.457298
O3_Alcover         146.968258
O3_Constanti        96.825094
O3_Santa-Pau       222.277266
O3_Vilasec

In [None]:
s = 11

In [None]:
Psi = U[:,:s]

In [None]:
Psi.shape

(48, 11)

In [None]:
snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]
X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
X_pred_svd.columns = X_dataset.columns
X_pred_svd.index = X_dataset.index

In [None]:
error = X_dataset - X_pred_svd
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

In [None]:
error_variance = error.var(axis=0,ddof=0) # estiamted coordinate error variance

In [None]:
error

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,-2.244697,-1.882913,-4.704154,3.444375,3.359821,-2.665101,3.242487,0.484321,-0.184410,0.579165,...,2.877926,1.758549,1.112524,0.138081,-2.456695,-5.931652,-1.843494,-7.485669,3.927792,1.630258
2020-09-24 14:00:00,-3.907870,-3.545735,-1.293036,8.122316,1.599789,7.620237,11.008230,-1.342175,1.788480,9.386004,...,-0.562098,5.691470,-0.881984,4.877463,7.854157,-3.604601,-1.313830,-13.960360,2.262292,-6.211522
2020-09-24 15:00:00,-4.710526,0.669578,-2.320282,6.837584,2.552147,1.688222,8.192162,-2.433900,0.837070,4.394382,...,2.875097,-1.016786,-1.703290,6.875131,1.373554,-8.334972,-1.027988,-9.891944,-2.513955,-11.582961
2020-09-24 16:00:00,-2.419019,-1.340422,-2.148953,3.870203,7.259985,-0.193979,3.357574,-3.913399,2.710948,0.193339,...,-3.389905,0.572424,-8.365602,-1.639846,3.686353,-2.847270,4.728050,-3.213755,-8.113705,-8.737648
2020-09-24 17:00:00,-3.141120,0.299095,0.744572,1.443809,4.194356,0.340086,-4.706525,-6.908505,1.163660,2.783829,...,0.070105,0.618922,-3.285017,2.863482,0.393866,2.214081,1.001500,-4.747109,-3.605376,-10.414835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,-9.878063,6.801414,-4.464166,5.012281,6.753188,3.804650,9.085192,-5.645617,-6.914880,9.709069,...,-6.430799,11.698444,-14.146598,-1.284833,6.133137,-9.225711,5.446827,-5.440483,3.828897,-0.632372
2022-05-07 09:00:00,6.301471,2.054927,-2.322215,1.608717,-0.284366,1.085384,4.951061,-10.520799,3.753427,-6.130908,...,18.603900,6.342861,-4.358549,-2.129222,3.643346,-8.395900,-8.773820,-4.793019,1.113226,-2.731802
2022-05-07 10:00:00,2.090478,5.024057,-7.463032,0.179904,-2.971150,2.454355,5.898819,-10.782641,8.717399,-6.661137,...,8.676965,3.515473,-2.610092,2.470307,-0.871961,-6.927247,-2.023071,-0.740100,-2.239868,-6.666512
2022-05-07 11:00:00,-6.740121,7.766915,-4.259669,-0.015081,-4.738894,3.458744,1.550553,-1.884960,9.909515,-3.814944,...,3.120072,3.994498,3.861678,3.030031,-0.379352,-7.913707,0.381691,1.038051,-4.919481,-6.781327


In [None]:
error_variance.mean()

72.60122908537252

In [None]:
error_variance

O3_Ciutadella       82.976223
O3_Eixample         54.970461
O3_Gracia           53.588882
O3_Sant-Adria       54.175549
O3_Vall-Hebron      91.882776
O3_Palau-Reial      71.737552
O3_Fabra            76.569617
O3_Badalona         61.266468
O3_Montcada         57.690186
O3_El-Prat          71.426429
O3_Sant-Cugat       50.456398
O3_Sant-Vicenç      62.324816
O3_Viladecans       40.965682
O3_Rubi             76.320968
O3_Gava             46.737490
O3_Sabadell         78.864119
O3_Terrassa         66.289464
O3_Granollers       80.348671
O3_Mataro           84.696759
O3_Santa-Maria      55.682414
O3_Vilafranca       83.938459
O3_Sant-Celoni      63.290960
O3_Vilanova         89.731777
O3_Manresa          69.994772
O3_Tona             81.629855
O3_Igualada         75.077782
O3_Vic              68.908674
O3_Manlleu          73.059723
O3_Berga            74.325562
O3_Tarragona        69.404525
O3_Alcover          93.981805
O3_Constanti        58.770707
O3_Santa-Pau        83.701654
O3_Vilasec

In [None]:
s

11

In [None]:
s = sparsity_gd

In [None]:
s

4

In [None]:
Psi = U[:,:s]

In [None]:
snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]
X_pred_svd = pd.DataFrame(snapshots_matrix_pred_svd.T)
X_pred_svd.columns = X_dataset.columns
X_pred_svd.index = X_dataset.index

In [None]:
X_pred_svd

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,70.866437,58.409798,62.473207,77.627921,74.757729,71.150907,80.120123,71.145205,66.154215,80.090989,...,74.516365,66.983223,71.814752,72.241113,60.268576,88.018773,61.838067,71.795275,67.495071,78.318920
2020-09-24 14:00:00,84.982714,71.119687,76.128871,93.485730,88.590639,84.694783,88.989101,85.948961,77.999300,93.422692,...,69.970465,64.774579,70.464920,68.566213,60.608554,88.206930,49.829929,69.851608,65.047861,76.646234
2020-09-24 15:00:00,81.471807,68.965272,74.292698,90.048947,86.856936,82.808698,92.851191,83.796921,75.583677,88.674215,...,65.560064,59.736725,67.471503,67.285633,56.748833,90.610905,44.241920,67.024833,58.891582,74.824216
2020-09-24 16:00:00,66.510772,54.125734,58.721507,75.174481,71.613445,66.920616,81.160294,68.401965,65.263336,75.775845,...,71.468413,58.014186,64.455232,67.621795,53.158964,87.725604,54.625461,63.736575,56.740121,73.558630
2020-09-24 17:00:00,53.865016,42.196646,46.026909,61.898766,59.115531,54.195168,72.701391,55.441214,55.918471,64.466391,...,76.881743,59.103607,64.402434,70.158482,52.246215,86.978204,65.451471,63.891064,57.395682,74.219895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,43.862131,43.491565,46.459728,37.397052,54.289243,56.583174,86.196869,43.347882,29.204977,42.550326,...,49.032601,63.733287,78.462860,80.325027,54.247066,100.643134,56.595726,83.410359,69.749791,84.051336
2022-05-07 09:00:00,51.228188,46.247898,49.040786,50.018811,59.300785,58.946045,82.048454,51.289027,42.625162,55.412800,...,67.103922,72.227876,80.843619,82.994985,62.173047,97.834792,70.545246,84.279777,74.553197,85.510231
2022-05-07 10:00:00,55.363842,46.786815,49.590080,59.190264,62.189404,59.209879,79.627329,56.703683,53.314360,64.099979,...,81.535190,76.715187,80.780148,84.371824,66.791347,96.183347,80.414604,82.824109,74.974742,85.471011
2022-05-07 11:00:00,62.772088,52.659962,55.819165,70.203126,70.319084,65.570716,86.748066,66.427152,64.585577,73.312271,...,89.247820,80.961217,82.913596,87.280768,72.233830,99.642041,83.402676,84.487772,74.950099,86.788222


In [None]:
error = X_dataset - X_pred_svd

In [None]:
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

In [None]:
error_variance = error.var(axis=0,ddof=0) # estiamted coordinate error variance

In [None]:
mse.mean()

4    129.262691
dtype: float64

In [None]:
mean_squared_error(X_dataset,X_pred_svd)

129.26269105531375

In [None]:
error_variance.mean()

129.26269105531375

In [None]:
error_variance

O3_Ciutadella      136.810293
O3_Eixample        105.699414
O3_Gracia          103.331650
O3_Sant-Adria      108.010527
O3_Vall-Hebron     135.387428
O3_Palau-Reial     105.653723
O3_Fabra            90.858704
O3_Badalona        109.743171
O3_Montcada         75.395332
O3_El-Prat         113.166207
O3_Sant-Cugat       78.942156
O3_Sant-Vicenç      91.601650
O3_Viladecans      129.792243
O3_Rubi            109.752019
O3_Gava            166.587472
O3_Sabadell        117.804202
O3_Terrassa        145.092515
O3_Granollers      131.306719
O3_Mataro          149.289994
O3_Santa-Maria     126.667570
O3_Vilafranca      108.686446
O3_Sant-Celoni     133.834764
O3_Vilanova        117.115638
O3_Manresa         114.698217
O3_Tona            123.372867
O3_Igualada        124.379275
O3_Vic             115.900612
O3_Manlleu         104.590998
O3_Berga           123.999522
O3_Tarragona       176.457298
O3_Alcover         146.968258
O3_Constanti        96.825094
O3_Santa-Pau       222.277266
O3_Vilasec

In [None]:
error

Unnamed: 0,O3_Ciutadella,O3_Eixample,O3_Gracia,O3_Sant-Adria,O3_Vall-Hebron,O3_Palau-Reial,O3_Fabra,O3_Badalona,O3_Montcada,O3_El-Prat,...,O3_Bellver,O3_Juneda,O3_Els-Guiamets,O3_Agullana,O3_Lleida,O3_Montsec,O3_Sort,O3_Gandesa,O3_Amposta,O3_La-Senla
2020-09-24 13:00:00,-2.866437,-1.409798,-4.473207,-1.627921,5.242271,0.849093,2.879877,-3.145205,-2.154215,0.909011,...,7.483635,-0.983223,-6.814752,5.758887,-2.268576,-10.018773,15.161933,-12.795275,6.504929,0.681080
2020-09-24 14:00:00,1.017286,0.880313,2.871129,5.514270,6.409361,14.305217,12.010899,-3.948961,-0.999300,13.577308,...,10.029535,0.225421,-10.464920,14.433787,4.391446,-6.206930,27.170071,-20.851608,1.952139,-7.646234
2020-09-24 15:00:00,5.528193,12.034728,8.707302,6.951053,8.143064,12.191302,8.148809,0.203079,-3.583677,3.325785,...,16.439936,-9.736725,-14.471503,16.714367,-3.748833,-9.610905,33.758080,-19.024833,-4.891582,-16.824216
2020-09-24 16:00:00,5.489228,7.874266,6.278493,3.825519,8.386555,6.079384,3.839706,-2.401965,-0.263336,-1.775845,...,5.531587,-3.014186,-15.455232,1.378205,3.841036,-2.725604,24.374539,-8.736575,-11.740121,-15.558630
2020-09-24 17:00:00,2.134984,7.803354,6.973091,-0.898766,3.884469,4.804832,-4.701391,-8.441214,-0.918471,0.533609,...,6.118257,0.896393,-7.402434,2.841518,4.753785,1.021796,13.548529,-7.891064,-7.395682,-17.219895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-07 08:00:00,3.137869,21.508435,10.540272,10.602948,20.710757,19.416826,1.803131,3.652118,-8.204977,6.449674,...,8.967399,2.266713,-26.462860,19.674973,-3.247066,-15.643134,56.404274,-9.410359,8.250209,3.948664
2022-05-07 09:00:00,17.771812,11.752102,7.959214,5.981189,13.699215,14.053955,1.951546,-6.289027,2.374838,-0.412800,...,34.896078,-3.227876,-14.843619,19.005015,-8.173047,-12.834792,39.454754,-9.279777,3.446803,3.489769
2022-05-07 10:00:00,5.636158,6.213185,-4.590080,2.809736,6.810596,9.790121,3.372671,-6.703683,7.685640,-3.099979,...,17.464810,-5.715187,-11.780148,19.628176,-12.791347,-11.183347,30.585396,-4.824109,5.025258,1.528989
2022-05-07 11:00:00,-7.772088,5.340038,-4.819165,-0.203126,4.680916,9.429284,-0.748066,0.572848,8.414423,-1.312271,...,8.752180,-6.961217,-7.913596,19.719232,-13.233830,-12.642041,28.597324,-5.487772,4.049901,2.211778


In [None]:
noise

0.038416000000000006

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)

In [None]:
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
ppb

1.96

In [None]:
snapshots_matrix = snapshots_matrix_train

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd

11

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sparsity_gd

4

In [None]:
lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

4.132860474217467

In [None]:
sparsity_gd

4

In [None]:
s = sparsity_gd

In [None]:
Psi = U[:,:s]

In [None]:
Psi.shape

(48, 4)

In [None]:
snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix)

In [None]:
snapshots_matrix_pred_svd

array([[69.01132897, 82.884453  , 79.7480174 , ..., 55.17747378,
        62.71002067, 71.727819  ],
       [59.20297716, 72.20191058, 69.94430594, ..., 47.20869853,
        53.03648583, 60.23031515],
       [62.59741532, 76.67150141, 74.890811  , ..., 50.43850599,
        56.62470855, 64.07965521],
       ...,
       [74.40585135, 73.56400917, 70.18246547, ..., 85.09588074,
        86.33331882, 88.29781107],
       [67.40441849, 65.11609944, 58.92444723, ..., 75.13224537,
        75.02772693, 76.69202323],
       [75.43271703, 75.00219846, 73.75359732, ..., 89.36847633,
        90.39872174, 91.48912485]])

In [None]:
snapshots_matrix

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
error = snapshots_matrix - snapshots_matrix_pred_svd

In [None]:
error

array([[ -1.01132897,   3.115547  ,   7.2519826 , ...,   5.82252622,
         -7.71002067,  -1.727819  ],
       [ -2.20297716,  -0.20191058,  11.05569406, ...,   5.79130147,
          4.96351417, -17.23031515],
       [ -4.59741532,   2.32849859,   8.109189  , ...,  -5.43850599,
         -5.62470855, -14.07965521],
       ...,
       [-15.40585135, -24.56400917, -22.18246547, ...,  -7.09588074,
         -7.33331882,  -7.29781107],
       [  6.59558151,   1.88390056,  -4.92444723, ...,   4.86775463,
          3.97227307,   7.30797677],
       [  3.56728297,  -6.00219846, -15.75359732, ...,  -2.36847633,
         -1.39872174,  -4.48912485]])

In [None]:
error = snapshots_matrix.T - snapshots_matrix_pred_svd.T

In [None]:
error

array([[ -1.01132897,  -2.20297716,  -4.59741532, ..., -15.40585135,
          6.59558151,   3.56728297],
       [  3.115547  ,  -0.20191058,   2.32849859, ..., -24.56400917,
          1.88390056,  -6.00219846],
       [  7.2519826 ,  11.05569406,   8.109189  , ..., -22.18246547,
         -4.92444723, -15.75359732],
       ...,
       [  5.82252622,   5.79130147,  -5.43850599, ...,  -7.09588074,
          4.86775463,  -2.36847633],
       [ -7.71002067,   4.96351417,  -5.62470855, ...,  -7.33331882,
          3.97227307,  -1.39872174],
       [ -1.727819  , -17.23031515, -14.07965521, ...,  -7.29781107,
          7.30797677,  -4.48912485]])

In [None]:
error.shape

(5646, 48)

In [None]:
error_variance = error.var(axis=0,ddof=0)

In [None]:
error_variance

array([143.43921016, 106.84910679, 102.55141035, 123.46092495,
       130.73226669, 102.84767731, 106.32278838, 110.08972994,
        76.95337895, 139.40420463,  79.17654125,  88.05234122,
       129.38006734, 109.56494277, 165.13970363, 125.94244487,
       160.82010999, 130.42176135, 157.90636898, 128.31179987,
       111.51854789, 139.75191287, 116.99214538, 123.6991124 ,
       123.48550226, 119.74517756, 113.73277129, 101.29284401,
       145.27637472, 176.69462712, 143.75302954, 100.60192352,
       228.99288708, 100.78061288, 116.99621182, 161.12088023,
       178.90453982, 105.12145255, 154.71886681, 136.24205874,
       144.9612673 , 177.92844186, 164.99459017, 109.55623876,
       281.11285673, 138.73308314, 163.09435882, 104.69110966])

In [None]:
(0.1*ppb)**2

0.038416000000000006

In [None]:
mean_squared_error(snapshots_matrix,snapshots_matrix_pred_svd)

133.45467963681014

In [None]:
error_variance.mean()

133.37208759082253

In [None]:
pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

Unnamed: 0,4
2020-09-24 13:00:00,40.859863
2020-09-24 14:00:00,83.110628
2020-09-24 15:00:00,116.275952
2020-09-24 16:00:00,65.849279
2020-09-24 17:00:00,45.622997
...,...
2022-05-07 08:00:00,197.668997
2022-05-07 09:00:00,173.715170
2022-05-07 10:00:00,120.318898
2022-05-07 11:00:00,107.823840


In [None]:
mse.mean()

4    129.262691
dtype: float64

In [None]:
(error**2).mean().mean()

133.45467963681014

In [None]:
mse = pd.DataFrame((((error)**2).mean(axis=1)),columns=[s],index=X_dataset.index)

In [None]:
mse.mean()

4    133.45468
dtype: float64

In [None]:
error_variance.mean()

133.37208759082253

In [None]:
mean_squared_error(snapshots_matrix,snapshots_matrix_pred_svd)

133.45467963681014

In [None]:
snapshots_matrix_pred_svd

array([[69.01132897, 82.884453  , 79.7480174 , ..., 55.17747378,
        62.71002067, 71.727819  ],
       [59.20297716, 72.20191058, 69.94430594, ..., 47.20869853,
        53.03648583, 60.23031515],
       [62.59741532, 76.67150141, 74.890811  , ..., 50.43850599,
        56.62470855, 64.07965521],
       ...,
       [74.40585135, 73.56400917, 70.18246547, ..., 85.09588074,
        86.33331882, 88.29781107],
       [67.40441849, 65.11609944, 58.92444723, ..., 75.13224537,
        75.02772693, 76.69202323],
       [75.43271703, 75.00219846, 73.75359732, ..., 89.36847633,
        90.39872174, 91.48912485]])

In [None]:
snapshots_matrix

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
error

array([[ -1.01132897,  -2.20297716,  -4.59741532, ..., -15.40585135,
          6.59558151,   3.56728297],
       [  3.115547  ,  -0.20191058,   2.32849859, ..., -24.56400917,
          1.88390056,  -6.00219846],
       [  7.2519826 ,  11.05569406,   8.109189  , ..., -22.18246547,
         -4.92444723, -15.75359732],
       ...,
       [  5.82252622,   5.79130147,  -5.43850599, ...,  -7.09588074,
          4.86775463,  -2.36847633],
       [ -7.71002067,   4.96351417,  -5.62470855, ...,  -7.33331882,
          3.97227307,  -1.39872174],
       [ -1.727819  , -17.23031515, -14.07965521, ...,  -7.29781107,
          7.30797677,  -4.48912485]])

In [None]:
noise

0.038416000000000006

In [None]:
noise = 0.1*ppb

In [None]:
noise

0.196

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sparsity_gd

21

In [None]:
s = sparsity_gd

In [None]:
s

21

In [None]:
Psi = U[:,:s]

In [None]:
snapshots_matrix_pred_svd = Psi@Psi.T@snapshots_matrix

In [None]:
error = snapshots_matrix - snapshots_matrix_pred_svd

In [None]:
error

array([[  0.65633321,  -1.21270672,   1.59384434, ...,   4.76555614,
         -4.85335294,   6.70214785],
       [  0.77434484,  -2.31068092,   3.7853931 , ...,   4.91548646,
          7.62258028,  -8.25396051],
       [ -2.42059124,  -1.78375423,  -1.52574195, ...,  -7.63940424,
         -3.05646025,  -6.29245146],
       ...,
       [ -5.51046915,  -7.52511881,  -2.06792911, ...,   2.52304267,
          0.14854677,  -1.33519895],
       [  1.04363635,  -0.09519721,   0.78409632, ...,  -1.98786993,
         -3.40012907,  -3.16911468],
       [  2.33847545,  -6.968638  , -10.95297524, ...,  -5.11850205,
         -4.6958373 ,  -9.10329826]])

In [None]:
error.shape

(48, 5646)

In [None]:
error_variance = error.var(axis=1,ddof=0)

In [None]:
error_variance

array([35.99309541, 36.28424399, 35.25512203, 34.56109688, 36.13942538,
       41.68515656, 35.88508161, 34.97582425, 44.55494429, 60.71285215,
       38.20458393, 53.41484332, 32.85820586, 59.56999572, 31.71437492,
       34.13399346, 33.314158  , 63.49705312, 35.1842629 , 33.60756005,
       52.02723209, 27.38179554, 58.68908295, 49.86937895, 50.53755   ,
       55.72186722, 32.19632297, 47.76500596, 45.76810134, 21.29865398,
       32.22159818, 46.43672114, 10.81616223, 30.27380748, 26.88749376,
       56.8317641 , 37.20694616, 41.76473683, 23.90597617, 46.33978395,
       27.74792791, 13.98017205, 44.4513056 , 45.38144336, 11.14573773,
       32.07244493,  9.71469122, 48.97586558])

In [None]:
error_variance.shape

(48,)

In [None]:
error_variance.mean()

38.31157173334872

In [None]:
mean_squared_error(snapshots_matrix,snapshots_matrix_pred_svd)

38.316134209225865

In [None]:
(error**2).mean().mean()

38.316134209225865

In [None]:
error_variance

array([35.99309541, 36.28424399, 35.25512203, 34.56109688, 36.13942538,
       41.68515656, 35.88508161, 34.97582425, 44.55494429, 60.71285215,
       38.20458393, 53.41484332, 32.85820586, 59.56999572, 31.71437492,
       34.13399346, 33.314158  , 63.49705312, 35.1842629 , 33.60756005,
       52.02723209, 27.38179554, 58.68908295, 49.86937895, 50.53755   ,
       55.72186722, 32.19632297, 47.76500596, 45.76810134, 21.29865398,
       32.22159818, 46.43672114, 10.81616223, 30.27380748, 26.88749376,
       56.8317641 , 37.20694616, 41.76473683, 23.90597617, 46.33978395,
       27.74792791, 13.98017205, 44.4513056 , 45.38144336, 11.14573773,
       32.07244493,  9.71469122, 48.97586558])

In [None]:
error

array([[  0.65633321,  -1.21270672,   1.59384434, ...,   4.76555614,
         -4.85335294,   6.70214785],
       [  0.77434484,  -2.31068092,   3.7853931 , ...,   4.91548646,
          7.62258028,  -8.25396051],
       [ -2.42059124,  -1.78375423,  -1.52574195, ...,  -7.63940424,
         -3.05646025,  -6.29245146],
       ...,
       [ -5.51046915,  -7.52511881,  -2.06792911, ...,   2.52304267,
          0.14854677,  -1.33519895],
       [  1.04363635,  -0.09519721,   0.78409632, ...,  -1.98786993,
         -3.40012907,  -3.16911468],
       [  2.33847545,  -6.968638  , -10.95297524, ...,  -5.11850205,
         -4.6958373 ,  -9.10329826]])

In [None]:
error/(0.1*ppb)

array([[  3.34863883,  -6.18727919,   8.13185888, ...,  24.31406194,
        -24.76200479,  34.1946319 ],
       [  3.95073897, -11.78918838,  19.3132301 , ...,  25.07901253,
         38.89071574, -42.11204344],
       [-12.34995531,  -9.10078686,  -7.78439772, ..., -38.97655227,
        -15.59418493, -32.10434419],
       ...,
       [-28.1146385 , -38.3934633 , -10.55065871, ...,  12.87266669,
          0.7578917 ,  -6.81223954],
       [  5.32467526,  -0.48570006,   4.00049141, ..., -10.1421935 ,
        -17.34759728, -16.16895243],
       [ 11.9309972 , -35.55427552, -55.88252673, ..., -26.1148064 ,
        -23.95835358, -46.44539929]])

In [None]:
4/np.sqrt(3)

2.3094010767585034

In [None]:
np.sqrt(2 * (beta + 1) + (8 * beta) / 
           (beta + 1 + np.sqrt(beta**2 + 14 * beta + 1)))

1.4317532054115514

In [None]:
np.sqrt(t1+t2)

1.4317532054115514

In [None]:
Y

NameError: name 'Y' is not defined

In [None]:
X

NameError: name 'X' is not defined

In [None]:
noise = (0.1*ppb)**2

In [None]:
noise

0.038416000000000006

In [None]:
X = snapshots_matrix

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
seed = 0

In [None]:
rng = np.random.default_rng(seed=seed)

In [None]:
Z = rng.standard_normal(size=(X.shape))

In [None]:
Z.shape

(48, 5646)

In [None]:
Y = X + noise*Z

In [None]:
Y

array([[68.00483005, 85.99492506, 87.02460248, ..., 60.97764335,
        55.02380348, 69.98156397],
       [57.07246084, 71.97593312, 81.01566772, ..., 53.01032367,
        58.01969676, 42.99051172],
       [58.02306502, 78.98844262, 82.97287548, ..., 44.95675587,
        50.97091464, 50.04297088],
       ...,
       [59.0034212 , 49.05754316, 48.04807878, ..., 77.97984302,
        79.00890451, 80.99593278],
       [74.08964297, 66.98730125, 54.0092418 , ..., 79.96540142,
        78.99635367, 84.00361822],
       [79.04475706, 69.01504135, 58.00615747, ..., 87.05978981,
        88.97673554, 87.01085347]])

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
U,sing_vals,Vt = np.linalg.svd(Y,full_matrices=False)

In [None]:
sing_vals

array([28456.08422574,  5818.11969194,  2846.75681197,  2518.00811071,
        1929.51543968,  1639.2520449 ,  1504.55647997,  1478.77298377,
        1382.52982957,  1328.90242296,  1303.19401293,  1152.78647175,
        1090.6541967 ,  1057.61104055,  1000.68019645,   966.01594232,
         956.03686281,   909.47687835,   898.09414529,   853.80184202,
         835.90317863,   827.65190311,   798.72940888,   772.72052583,
         752.17723961,   729.67214184,   710.95852432,   692.64207537,
         681.35799875,   668.95243468,   657.3877258 ,   639.85620878,
         612.54291417,   607.89603459,   592.1793854 ,   584.68219269,
         565.25416971,   558.0438926 ,   556.53625765,   540.60912642,
         537.16196904,   530.15268188,   510.24697366,   492.79105122,
         486.07490509,   478.77907859,   466.36529552,   449.10214354])

In [None]:
beta = Y.shape[0]/Y.shape[1]

In [None]:
beta

0.008501594048884165

In [None]:
beta**-1

117.625

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )

In [None]:
lambda_beta = np.sqrt(t1+t2)

In [None]:
int(lambda_beta*noise*np.sqrt(max(Y.shape)))

4

In [None]:
s = int(lambda_beta*noise*np.sqrt(max(Y.shape)))

In [None]:
Psi = U[:,:s]

In [None]:
Psi

array([[-0.12779532, -0.1053572 , -0.19691878,  0.02048683],
       [-0.11484776, -0.05599741, -0.21254957,  0.03509478],
       [-0.12399808, -0.03567035, -0.22685474,  0.06316345],
       [-0.12522008, -0.18968499, -0.12294503,  0.06565924],
       [-0.14615522, -0.03286928, -0.17854881,  0.09675215],
       [-0.14602398, -0.00137518, -0.22897749,  0.06816418],
       [-0.1978261 ,  0.273012  , -0.09326096,  0.27399306],
       [-0.12657147, -0.12414891, -0.14699655,  0.08594305],
       [-0.10109823, -0.19959292,  0.00102243,  0.07080602],
       [-0.13346497, -0.16864181, -0.11517037, -0.00703275],
       [-0.11756268, -0.18502151, -0.03351508,  0.02521298],
       [-0.10390168, -0.18325755, -0.02008614,  0.02393168],
       [-0.15609778, -0.0273944 , -0.17336385,  0.02695734],
       [-0.13297733, -0.15465628, -0.06272307,  0.02899621],
       [-0.15132204, -0.0566191 , -0.1259925 ,  0.03481333],
       [-0.11830414, -0.07589078, -0.07829745,  0.08098866],
       [-0.12443454, -0.

In [None]:
Y_pred_svd = Psi@Psi.T@Y

In [None]:
Y_pred_svd

array([[69.01959413, 82.87573159, 79.75769623, ..., 55.17616874,
        62.70230601, 71.72558971],
       [59.21777086, 72.19699244, 69.95479889, ..., 47.2071338 ,
        53.02811647, 60.2316536 ],
       [62.61488121, 76.66676916, 74.8998254 , ..., 50.43709284,
        56.61504555, 64.08223573],
       ...,
       [74.39842217, 73.55682922, 70.20838013, ..., 85.08347699,
        86.32318016, 88.28766508],
       [67.39030589, 65.10270784, 58.95728255, ..., 75.11827746,
        75.02030814, 76.67693708],
       [75.4318385 , 74.99588858, 73.77166411, ..., 89.35918286,
        90.38783542, 91.48452659]])

In [None]:
Y

array([[68.00483005, 85.99492506, 87.02460248, ..., 60.97764335,
        55.02380348, 69.98156397],
       [57.07246084, 71.97593312, 81.01566772, ..., 53.01032367,
        58.01969676, 42.99051172],
       [58.02306502, 78.98844262, 82.97287548, ..., 44.95675587,
        50.97091464, 50.04297088],
       ...,
       [59.0034212 , 49.05754316, 48.04807878, ..., 77.97984302,
        79.00890451, 80.99593278],
       [74.08964297, 66.98730125, 54.0092418 , ..., 79.96540142,
        78.99635367, 84.00361822],
       [79.04475706, 69.01504135, 58.00615747, ..., 87.05978981,
        88.97673554, 87.01085347]])

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
error = Y_pred_svd - Y

In [None]:
error

array([[  1.01476408,  -3.11919347,  -7.26690624, ...,  -5.80147461,
          7.67850253,   1.74402574],
       [  2.14531003,   0.22105932, -11.06086883, ...,  -5.80318987,
         -4.9915803 ,  17.24114188],
       [  4.59181619,  -2.32167346,  -8.07305008, ...,   5.48033697,
          5.64413091,  14.03926485],
       ...,
       [ 15.39500097,  24.49928606,  22.16030135, ...,   7.10363397,
          7.31427565,   7.2917323 ],
       [ -6.69933707,  -1.8845934 ,   4.94804075, ...,  -4.84712395,
         -3.97604552,  -7.32668114],
       [ -3.61291855,   5.98084724,  15.76550664, ...,   2.29939305,
          1.41109988,   4.47367311]])

In [None]:
(error**2).mean().mean()

133.45648112304707

In [None]:
error.var(axis=1,ddof=0)

array([143.44646672, 106.84644884, 102.559159  , 123.47625806,
       130.73049021, 102.83861624, 106.33767165, 110.09183781,
        76.95649266, 139.40248606,  79.18208278,  88.0775025 ,
       129.3761515 , 109.57266665, 165.14882486, 125.92518784,
       160.81978597, 130.44063475, 157.91346168, 128.3095506 ,
       111.49654977, 139.75592893, 116.99943214, 123.68949916,
       123.47283314, 119.73120829, 113.72644841, 101.30080584,
       145.27360935, 176.70749334, 143.75814694, 100.5958027 ,
       228.98835245, 100.78985134, 116.99495868, 161.11605013,
       178.89650894, 105.1162857 , 154.71349408, 136.2508412 ,
       144.96975337, 177.94199287, 165.01000741, 109.56050883,
       281.12787154, 138.72828263, 163.08380815, 104.69912456])

In [None]:
error.var(axis=1,ddof=0).shape

(48,)

In [None]:
error.var(axis=1,ddof=0).shape.mean()

AttributeError: 'tuple' object has no attribute 'mean'

In [None]:
error.var(axis=1,ddof=0).mean()

133.37390054725938

In [None]:
(error**2).mean().mean()

133.45648112304707

In [None]:
mean_squared_error(Y_pred_svd,Y)

133.4564811230471

In [None]:
mean_squared_error(Y_pred_svd,X)

133.4548029963472

In [None]:
error = Y_pred_svd - X

In [None]:
error

array([[  1.01959413,  -3.12426841,  -7.24230377, ...,  -5.82383126,
          7.70230601,   1.72558971],
       [  2.21777086,   0.19699244, -11.04520111, ...,  -5.7928662 ,
         -4.97188353,  17.2316536 ],
       [  4.61488121,  -2.33323084,  -8.1001746 , ...,   5.43709284,
          5.61504555,  14.08223573],
       ...,
       [ 15.39842217,  24.55682922,  22.20838013, ...,   7.08347699,
          7.32318016,   7.28766508],
       [ -6.60969411,  -1.89729216,   4.95728255, ...,  -4.88172254,
         -3.97969186,  -7.32306292],
       [ -3.5681615 ,   5.99588858,  15.77166411, ...,   2.35918286,
          1.38783542,   4.48452659]])

In [None]:
(error**2).mean().mean()

133.4548029963472

In [None]:
(error**2).mean().mean()

133.4548029963472

In [None]:
error = Y - X

In [None]:
error

array([[ 0.00483005, -0.00507494,  0.02460248, ..., -0.02235665,
         0.02380348, -0.01843603],
       [ 0.07246084, -0.02406688,  0.01566772, ...,  0.01032367,
         0.01969676, -0.00948828],
       [ 0.02306502, -0.01155738, -0.02712452, ..., -0.04324413,
        -0.02908536,  0.04297088],
       ...,
       [ 0.0034212 ,  0.05754316,  0.04807878, ..., -0.02015698,
         0.00890451, -0.00406722],
       [ 0.08964297, -0.01269875,  0.0092418 , ..., -0.03459858,
        -0.00364633,  0.00361822],
       [ 0.04475706,  0.01504135,  0.00615747, ...,  0.05978981,
        -0.02326446,  0.01085347]])

In [None]:
(error**2).mean().mean()

0.001479045256667723

In [None]:
noise

0.038416000000000006

In [None]:
noise**2

0.0014757890560000003

In [None]:
error = Y - X

In [None]:
(error**2).mean().mean()

0.001479045256667723

In [None]:
noise

0.038416000000000006

In [None]:
np.sqrt(noise)

0.196

In [None]:
noise**2

0.0014757890560000003

In [None]:
noise

0.038416000000000006

In [None]:
noise**2

0.0014757890560000003

In [None]:
(error**2).mean().mean()

0.001479045256667723

In [None]:
error.var(axis=1,ddof=0)

array([0.00146213, 0.00148302, 0.00144829, 0.00147769, 0.00143569,
       0.00152344, 0.00153218, 0.00149053, 0.00145673, 0.00144802,
       0.0014578 , 0.00149093, 0.00148348, 0.00147501, 0.00147068,
       0.00151972, 0.00144253, 0.00146796, 0.00147154, 0.00148238,
       0.00152089, 0.00150464, 0.00147769, 0.00145574, 0.00148002,
       0.00147201, 0.00152079, 0.00149394, 0.00153681, 0.0014728 ,
       0.00145525, 0.00148612, 0.00147858, 0.00147385, 0.00142729,
       0.00148077, 0.00146921, 0.00148819, 0.00147635, 0.00149012,
       0.00146252, 0.00147783, 0.00147538, 0.00145839, 0.00146383,
       0.00149629, 0.00146102, 0.00150478])

In [None]:
error.var(axis=1,ddof=0).mean()

0.0014787678793103463

In [None]:
Y

array([[68.00483005, 85.99492506, 87.02460248, ..., 60.97764335,
        55.02380348, 69.98156397],
       [57.07246084, 71.97593312, 81.01566772, ..., 53.01032367,
        58.01969676, 42.99051172],
       [58.02306502, 78.98844262, 82.97287548, ..., 44.95675587,
        50.97091464, 50.04297088],
       ...,
       [59.0034212 , 49.05754316, 48.04807878, ..., 77.97984302,
        79.00890451, 80.99593278],
       [74.08964297, 66.98730125, 54.0092418 , ..., 79.96540142,
        78.99635367, 84.00361822],
       [79.04475706, 69.01504135, 58.00615747, ..., 87.05978981,
        88.97673554, 87.01085347]])

In [None]:
Y_pred_svd

array([[69.01959413, 82.87573159, 79.75769623, ..., 55.17616874,
        62.70230601, 71.72558971],
       [59.21777086, 72.19699244, 69.95479889, ..., 47.2071338 ,
        53.02811647, 60.2316536 ],
       [62.61488121, 76.66676916, 74.8998254 , ..., 50.43709284,
        56.61504555, 64.08223573],
       ...,
       [74.39842217, 73.55682922, 70.20838013, ..., 85.08347699,
        86.32318016, 88.28766508],
       [67.39030589, 65.10270784, 58.95728255, ..., 75.11827746,
        75.02030814, 76.67693708],
       [75.4318385 , 74.99588858, 73.77166411, ..., 89.35918286,
        90.38783542, 91.48452659]])

In [None]:
Y

array([[68.00483005, 85.99492506, 87.02460248, ..., 60.97764335,
        55.02380348, 69.98156397],
       [57.07246084, 71.97593312, 81.01566772, ..., 53.01032367,
        58.01969676, 42.99051172],
       [58.02306502, 78.98844262, 82.97287548, ..., 44.95675587,
        50.97091464, 50.04297088],
       ...,
       [59.0034212 , 49.05754316, 48.04807878, ..., 77.97984302,
        79.00890451, 80.99593278],
       [74.08964297, 66.98730125, 54.0092418 , ..., 79.96540142,
        78.99635367, 84.00361822],
       [79.04475706, 69.01504135, 58.00615747, ..., 87.05978981,
        88.97673554, 87.01085347]])

In [None]:
error = Y-Y_pred_svd

In [None]:
error

array([[ -1.01476408,   3.11919347,   7.26690624, ...,   5.80147461,
         -7.67850253,  -1.74402574],
       [ -2.14531003,  -0.22105932,  11.06086883, ...,   5.80318987,
          4.9915803 , -17.24114188],
       [ -4.59181619,   2.32167346,   8.07305008, ...,  -5.48033697,
         -5.64413091, -14.03926485],
       ...,
       [-15.39500097, -24.49928606, -22.16030135, ...,  -7.10363397,
         -7.31427565,  -7.2917323 ],
       [  6.69933707,   1.8845934 ,  -4.94804075, ...,   4.84712395,
          3.97604552,   7.32668114],
       [  3.61291855,  -5.98084724, -15.76550664, ...,  -2.29939305,
         -1.41109988,  -4.47367311]])

In [None]:
error.var(axis=1,ddof=0)

array([143.44646672, 106.84644884, 102.559159  , 123.47625806,
       130.73049021, 102.83861624, 106.33767165, 110.09183781,
        76.95649266, 139.40248606,  79.18208278,  88.0775025 ,
       129.3761515 , 109.57266665, 165.14882486, 125.92518784,
       160.81978597, 130.44063475, 157.91346168, 128.3095506 ,
       111.49654977, 139.75592893, 116.99943214, 123.68949916,
       123.47283314, 119.73120829, 113.72644841, 101.30080584,
       145.27360935, 176.70749334, 143.75814694, 100.5958027 ,
       228.98835245, 100.78985134, 116.99495868, 161.11605013,
       178.89650894, 105.1162857 , 154.71349408, 136.2508412 ,
       144.96975337, 177.94199287, 165.01000741, 109.56050883,
       281.12787154, 138.72828263, 163.08380815, 104.69912456])

In [None]:
error.var(axis=1,ddof=0).mean()

133.37390054725938

In [None]:
sing_vals

array([28456.08422574,  5818.11969194,  2846.75681197,  2518.00811071,
        1929.51543968,  1639.2520449 ,  1504.55647997,  1478.77298377,
        1382.52982957,  1328.90242296,  1303.19401293,  1152.78647175,
        1090.6541967 ,  1057.61104055,  1000.68019645,   966.01594232,
         956.03686281,   909.47687835,   898.09414529,   853.80184202,
         835.90317863,   827.65190311,   798.72940888,   772.72052583,
         752.17723961,   729.67214184,   710.95852432,   692.64207537,
         681.35799875,   668.95243468,   657.3877258 ,   639.85620878,
         612.54291417,   607.89603459,   592.1793854 ,   584.68219269,
         565.25416971,   558.0438926 ,   556.53625765,   540.60912642,
         537.16196904,   530.15268188,   510.24697366,   492.79105122,
         486.07490509,   478.77907859,   466.36529552,   449.10214354])

In [None]:
sing_vals.shape

(48,)

In [None]:
sing_vals**2

array([8.09748729e+08, 3.38505167e+07, 8.10402435e+06, 6.34036485e+06,
       3.72302983e+06, 2.68714727e+06, 2.26369020e+06, 2.18676954e+06,
       1.91138873e+06, 1.76598165e+06, 1.69831464e+06, 1.32891665e+06,
       1.18952658e+06, 1.11854111e+06, 1.00136086e+06, 9.33186801e+05,
       9.14006483e+05, 8.27148192e+05, 8.06573094e+05, 7.28977585e+05,
       6.98734124e+05, 6.85007673e+05, 6.37968669e+05, 5.97097011e+05,
       5.65770600e+05, 5.32421435e+05, 5.05462023e+05, 4.79753045e+05,
       4.64248722e+05, 4.47497360e+05, 4.32158622e+05, 4.09415968e+05,
       3.75208822e+05, 3.69537589e+05, 3.50676424e+05, 3.41853266e+05,
       3.19512276e+05, 3.11412986e+05, 3.09732606e+05, 2.92258228e+05,
       2.88542981e+05, 2.81061866e+05, 2.60351974e+05, 2.42843020e+05,
       2.36268813e+05, 2.29229406e+05, 2.17496589e+05, 2.01692735e+05])

In [None]:
(sing_vals**2).sum()

894211409.4406961

In [None]:
(sing_vals**2).sum() - (sing_vals[:s]**2).sum()

36167774.0361948

In [None]:
((sing_vals**2).sum() - (sing_vals[:s]**2).sum())/sing_vals.shape

array([753495.29242072])

In [None]:
(error**2).mean().mean()

133.45648112304707

In [None]:
error = Y_pred_svd - X

In [None]:
error

array([[  1.01959413,  -3.12426841,  -7.24230377, ...,  -5.82383126,
          7.70230601,   1.72558971],
       [  2.21777086,   0.19699244, -11.04520111, ...,  -5.7928662 ,
         -4.97188353,  17.2316536 ],
       [  4.61488121,  -2.33323084,  -8.1001746 , ...,   5.43709284,
          5.61504555,  14.08223573],
       ...,
       [ 15.39842217,  24.55682922,  22.20838013, ...,   7.08347699,
          7.32318016,   7.28766508],
       [ -6.60969411,  -1.89729216,   4.95728255, ...,  -4.88172254,
         -3.97969186,  -7.32306292],
       [ -3.5681615 ,   5.99588858,  15.77166411, ...,   2.35918286,
          1.38783542,   4.48452659]])

In [None]:
error**2

array([[1.03957219e+00, 9.76105308e+00, 5.24509638e+01, ...,
        3.39170105e+01, 5.93255179e+01, 2.97765983e+00],
       [4.91850761e+00, 3.88060197e-02, 1.21996468e+02, ...,
        3.35572988e+01, 2.47196259e+01, 2.96929886e+02],
       [2.12971286e+01, 5.44396616e+00, 6.56128286e+01, ...,
        2.95619786e+01, 3.15287366e+01, 1.98309363e+02],
       ...,
       [2.37111405e+02, 6.03037861e+02, 4.93212148e+02, ...,
        5.01756463e+01, 5.36289676e+01, 5.31100623e+01],
       [4.36880562e+01, 3.59971753e+00, 2.45746503e+01, ...,
        2.38312149e+01, 1.58379473e+01, 5.36272505e+01],
       [1.27317765e+01, 3.59506799e+01, 2.48745389e+02, ...,
        5.56574377e+00, 1.92608715e+00, 2.01109787e+01]])

In [None]:
(error**2).mean().mean()

133.4548029963472

In [None]:
error.var(axis=1,ddof=0)

array([143.43937986, 106.84776701, 102.55028352, 123.46007877,
       130.73028646, 102.84447263, 106.32800351, 110.08845348,
        76.95226909, 139.40587514,  79.17751853,  88.05517968,
       129.38215618, 109.5671518 , 165.1413666 , 125.94100675,
       160.82304641, 130.42432125, 157.91015906, 128.3113675 ,
       111.51854745, 139.74833591, 116.9928293 , 123.70069067,
       123.48606813, 119.74879657, 113.73826466, 101.2956767 ,
       145.27893913, 176.67997998, 143.75375408, 100.60272918,
       228.99422873, 100.78606967, 116.99698551, 161.11847476,
       178.89227277, 105.12447722, 154.71081968, 136.24545115,
       144.96085598, 177.92437979, 164.997841  , 109.55984022,
       281.11599952, 138.73393321, 163.09562255, 104.68799985])

In [None]:
error.var(axis=1,ddof=0).mean()

133.37229180385296

In [None]:
mean_squared_error(Y,X)

0.0014790452566677231

In [None]:
mean_squared_error(Y_pred_svd,X)

133.4548029963472

In [None]:
int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

4

In [None]:
int(lambda_beta*np.sqrt(max(snapshots_matrix.shape)*noise))

21

In [None]:
s = 21

In [None]:
Psi = U[:,:s]

In [None]:
lambda_beta

1.4317532054115514

In [None]:
lambda_beta

1.4317532054115514

In [None]:
s

21

In [None]:
Psi.shape

(48, 21)

In [None]:
Y_pred_svd = Psi@Psi.T@Y

In [None]:
Y_pred_svd

array([[67.36602148, 87.20009983, 85.41885107, ..., 56.20052066,
        59.85393474, 63.30240927],
       [56.25551924, 74.30454923, 77.20447221, ..., 48.07469293,
        50.3862535 , 51.25211959],
       [60.44636941, 80.77893533, 84.52339142, ..., 52.62691782,
        54.06018927, 56.29089976],
       ...,
       [64.49722838, 56.54905826, 50.11144604, ..., 75.43861998,
        78.82296614, 82.32087729],
       [73.03224758, 67.08815759, 53.22872553, ..., 81.96761415,
        82.39892976, 87.17102085],
       [76.67904051, 75.96409827, 68.97794401, ..., 92.10629439,
        93.68916019, 96.08808299]])

In [None]:
Y

array([[68.00483005, 85.99492506, 87.02460248, ..., 60.97764335,
        55.02380348, 69.98156397],
       [57.07246084, 71.97593312, 81.01566772, ..., 53.01032367,
        58.01969676, 42.99051172],
       [58.02306502, 78.98844262, 82.97287548, ..., 44.95675587,
        50.97091464, 50.04297088],
       ...,
       [59.0034212 , 49.05754316, 48.04807878, ..., 77.97984302,
        79.00890451, 80.99593278],
       [74.08964297, 66.98730125, 54.0092418 , ..., 79.96540142,
        78.99635367, 84.00361822],
       [79.04475706, 69.01504135, 58.00615747, ..., 87.05978981,
        88.97673554, 87.01085347]])

In [None]:
mean_squared_error(Y,Y_pred_svd)

38.31798585097612

In [None]:
mean_squared_error(X,Y_pred_svd)

38.316786318668996

In [None]:
mean_squared_error(X,Y)

0.0014790452566677231

In [None]:
np.linalg.matrix_rank(Y)

48

In [None]:
np.linalg.matrix_rank(X)

48

In [None]:
mean_squared_error(X,Y)

0.0014790452566677231

In [None]:
mean_squared_error(X,Y_pred_svd)

38.316786318668996

In [None]:
noise**2

0.0014757890560000003

In [None]:
(sing_vals[s:]**2).sum()

10384480.709501334

In [None]:
(sing_vals[s:]).sum()

16500.524258728667

In [None]:
(sing_vals[s:]**2).sum()

10384480.709501334

In [None]:
(sing_vals[s:]**2).mean()

384610.39664819755

In [None]:
mean_squared_error(X,Y_pred_svd)

38.316786318668996

In [None]:
(sing_vals[s:]**2).mean().sqrt()

AttributeError: 'numpy.float64' object has no attribute 'sqrt'

In [None]:
np.sqrt((sing_vals[s:]**2).mean())

620.1696515052938

In [None]:
np.linalg.norm(error)

6013.927107176646

In [None]:
np.linalg.norm(error,ord='fro')

6013.927107176646

In [None]:
(error**2).sum()

36167319.25043406

In [None]:
np.sqrt((error**2).sum())

6013.927107176646

In [None]:
mse

Unnamed: 0,4
2020-09-24 13:00:00,40.859863
2020-09-24 14:00:00,83.110628
2020-09-24 15:00:00,116.275952
2020-09-24 16:00:00,65.849279
2020-09-24 17:00:00,45.622997
...,...
2022-05-07 08:00:00,197.668997
2022-05-07 09:00:00,173.715170
2022-05-07 10:00:00,120.318898
2022-05-07 11:00:00,107.823840


In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)

In [None]:
sparsity_gd = int(lambda_beta*np.sqrt(max(snapshots_matrix.shape)*noise))

In [None]:
sparsity_gd

21

In [None]:
noise

0.038416000000000006

In [None]:
noise = 0.1*ppb

In [None]:
noise

0.196

In [None]:
noise

0.196

In [None]:
np.linalg.matrix_rank(Y)

48

In [None]:
mean_squared_error(Y,Y_pred_svd)

38.31798585097612

In [None]:
mean_squared_error(X,Y_pred_svd)

38.316786318668996

In [None]:
mean_squared_error(X,Y)

0.0014790452566677231

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sparsity_gd

21

In [None]:
noise = (1*ppb)

In [None]:
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))
    retu

IndentationError: unexpected indent (<ipython-input-440-7fc6c8931ec0>, line 2)

In [None]:
sparsity_gd = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sparsity_gd

210

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)

In [None]:
lambda_beta

1.4317532054115514

In [None]:
beta

0.008501594048884165

In [None]:
beta

0.008501594048884165

In [None]:
noise

1.96

In [None]:
1*ppb

1.96

In [None]:
lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

210.8602282764013

In [None]:
sing_vals

array([28456.08422574,  5818.11969194,  2846.75681197,  2518.00811071,
        1929.51543968,  1639.2520449 ,  1504.55647997,  1478.77298377,
        1382.52982957,  1328.90242296,  1303.19401293,  1152.78647175,
        1090.6541967 ,  1057.61104055,  1000.68019645,   966.01594232,
         956.03686281,   909.47687835,   898.09414529,   853.80184202,
         835.90317863,   827.65190311,   798.72940888,   772.72052583,
         752.17723961,   729.67214184,   710.95852432,   692.64207537,
         681.35799875,   668.95243468,   657.3877258 ,   639.85620878,
         612.54291417,   607.89603459,   592.1793854 ,   584.68219269,
         565.25416971,   558.0438926 ,   556.53625765,   540.60912642,
         537.16196904,   530.15268188,   510.24697366,   492.79105122,
         486.07490509,   478.77907859,   466.36529552,   449.10214354])

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd

11

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sing_val_threshold = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sing_val_threshold

210

In [None]:
beta = Y.shape[0]/Y.shape[1]

In [None]:
beta

0.008501594048884165

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)

In [None]:
sing_val_threshold = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sing_val_threshold

210

In [None]:
snapshots_matrix.shape

(48, 5646)

In [None]:
Y.shape

(48, 5646)

In [None]:
lambda_beta*noise.np.sqrt(5646)

AttributeError: 'float' object has no attribute 'np'

In [None]:
lambda_beta*noise*np.sqrt(5646)

210.8602282764013

In [None]:
lambda_beta*(1*ppb)*np.sqrt(5646)

210.8602282764013

In [None]:
lambda_beta*((1*ppb)**2)*np.sqrt(5646)

413.28604742174656

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

47

In [None]:
sing_val_threshold

210

In [None]:
sing_vals

array([28456.08422574,  5818.11969194,  2846.75681197,  2518.00811071,
        1929.51543968,  1639.2520449 ,  1504.55647997,  1478.77298377,
        1382.52982957,  1328.90242296,  1303.19401293,  1152.78647175,
        1090.6541967 ,  1057.61104055,  1000.68019645,   966.01594232,
         956.03686281,   909.47687835,   898.09414529,   853.80184202,
         835.90317863,   827.65190311,   798.72940888,   772.72052583,
         752.17723961,   729.67214184,   710.95852432,   692.64207537,
         681.35799875,   668.95243468,   657.3877258 ,   639.85620878,
         612.54291417,   607.89603459,   592.1793854 ,   584.68219269,
         565.25416971,   558.0438926 ,   556.53625765,   540.60912642,
         537.16196904,   530.15268188,   510.24697366,   492.79105122,
         486.07490509,   478.77907859,   466.36529552,   449.10214354])

In [None]:
noise

1.96

In [None]:
ppb

1.96

In [None]:
noise = (7.5*ppb)**2

In [None]:
Y = X + noise*Z

In [None]:
Y

array([[  95.16904348,   57.45346009,  225.38893053, ...,  -64.75616199,
         188.8945732 ,  -33.70269427],
       [ 464.5921978 ,  -63.37621986,  169.13093796, ...,  111.07063995,
         168.79429644,  -10.37157683],
       [ 187.74074729,   13.98971183,  -69.57542164, ..., -198.24823055,
        -112.60513915,  291.71118672],
       ...,
       [  78.24423448,  372.68025681,  318.44314118, ...,  -35.38301754,
         129.08786429,   58.12187412],
       [ 578.24168404,   -4.43048687,  105.98514885, ..., -114.6170377 ,
          58.4893689 ,  104.35248976],
       [ 330.75844218,  153.60758373,   92.6357783 , ...,  423.31768081,
         -41.86260281,  148.05078803]])

In [None]:
mean_squared_error(Y,X)

46797.916324252146

In [None]:
np.sqrt(mean_squared_error(Y,X))

216.32826057695777

In [None]:
noise

216.08999999999997

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sing_val_threshold = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
sing_val_threshold

23247

In [None]:
sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

In [None]:
sparsity_gd

0

In [None]:
sparsity_gd

0

In [None]:
sing_val_threshold

23247

In [None]:
sing_vals

array([28456.08422574,  5818.11969194,  2846.75681197,  2518.00811071,
        1929.51543968,  1639.2520449 ,  1504.55647997,  1478.77298377,
        1382.52982957,  1328.90242296,  1303.19401293,  1152.78647175,
        1090.6541967 ,  1057.61104055,  1000.68019645,   966.01594232,
         956.03686281,   909.47687835,   898.09414529,   853.80184202,
         835.90317863,   827.65190311,   798.72940888,   772.72052583,
         752.17723961,   729.67214184,   710.95852432,   692.64207537,
         681.35799875,   668.95243468,   657.3877258 ,   639.85620878,
         612.54291417,   607.89603459,   592.1793854 ,   584.68219269,
         565.25416971,   558.0438926 ,   556.53625765,   540.60912642,
         537.16196904,   530.15268188,   510.24697366,   492.79105122,
         486.07490509,   478.77907859,   466.36529552,   449.10214354])

In [None]:
beta

0.008501594048884165

In [None]:
lambda_beta

1.4317532054115514

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
snapshots_matrix

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
np.allclose(X,snapshots_matrix)

True

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

11

In [None]:
U,sing_vals,Vt = np.linalg.svd(Y,full_matrices=False)

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

0

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

0

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sing_val_threshold = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

0

In [None]:
Psi = U[:,0]

In [None]:
Psi

array([-0.12034435, -0.12535607, -0.12678467, -0.11365465, -0.15428619,
       -0.14153438, -0.19612016, -0.11828612, -0.09528986, -0.13968733,
       -0.11527359, -0.11980284, -0.14063898, -0.12003775, -0.15145852,
       -0.12880495, -0.10371999, -0.12881515, -0.15774031, -0.12426352,
       -0.1480765 , -0.12214199, -0.14555795, -0.09824333, -0.15017927,
       -0.1310164 , -0.12172908, -0.10254376, -0.13397605, -0.15690622,
       -0.17340745, -0.15188567, -0.11551266, -0.1739793 , -0.15012231,
       -0.13521728, -0.1880746 , -0.18754513, -0.13749341, -0.14289501,
       -0.1787747 , -0.16580453, -0.11374979, -0.21222729, -0.11577545,
       -0.17663711, -0.16816168, -0.18393909])

In [None]:
Psi = U[:,0][:,None]

In [None]:
Psi

array([[-0.12034435],
       [-0.12535607],
       [-0.12678467],
       [-0.11365465],
       [-0.15428619],
       [-0.14153438],
       [-0.19612016],
       [-0.11828612],
       [-0.09528986],
       [-0.13968733],
       [-0.11527359],
       [-0.11980284],
       [-0.14063898],
       [-0.12003775],
       [-0.15145852],
       [-0.12880495],
       [-0.10371999],
       [-0.12881515],
       [-0.15774031],
       [-0.12426352],
       [-0.1480765 ],
       [-0.12214199],
       [-0.14555795],
       [-0.09824333],
       [-0.15017927],
       [-0.1310164 ],
       [-0.12172908],
       [-0.10254376],
       [-0.13397605],
       [-0.15690622],
       [-0.17340745],
       [-0.15188567],
       [-0.11551266],
       [-0.1739793 ],
       [-0.15012231],
       [-0.13521728],
       [-0.1880746 ],
       [-0.18754513],
       [-0.13749341],
       [-0.14289501],
       [-0.1787747 ],
       [-0.16580453],
       [-0.11374979],
       [-0.21222729],
       [-0.11577545],
       [-0

In [None]:
Psi.shape

(48, 1)

In [None]:
Y_pred_svd = Psi.T@Psi@Y

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 48 is different from 1)

In [None]:
Psi.shape

(48, 1)

In [None]:
Y.shape

(48, 5646)

In [None]:
Y_pred_svd = Psi@Psi.T@Y

In [None]:
Y_pred_svd

array([[ 24.67051479,  34.49577237, 118.69676981, ...,  45.16994753,
         40.87638686,  35.00294726],
       [ 25.69791427,  35.93234307, 123.63987704, ...,  47.05104248,
         42.57867718,  36.46063917],
       [ 25.99077717,  36.34184129, 125.04892262, ...,  47.5872535 ,
         43.06391948,  36.87615804],
       ...,
       [ 36.21049662,  50.63165724, 174.21886078, ...,  66.29882863,
         59.99689429,  51.37607034],
       [ 34.4730385 ,  48.20224057, 165.85946219, ...,  63.11766712,
         57.11811325,  48.91093512],
       [ 37.70739626,  52.72471082, 181.42086501, ...,  69.03954478,
         62.47709583,  53.49989708]])

In [None]:
Y

array([[  95.16904348,   57.45346009,  225.38893053, ...,  -64.75616199,
         188.8945732 ,  -33.70269427],
       [ 464.5921978 ,  -63.37621986,  169.13093796, ...,  111.07063995,
         168.79429644,  -10.37157683],
       [ 187.74074729,   13.98971183,  -69.57542164, ..., -198.24823055,
        -112.60513915,  291.71118672],
       ...,
       [  78.24423448,  372.68025681,  318.44314118, ...,  -35.38301754,
         129.08786429,   58.12187412],
       [ 578.24168404,   -4.43048687,  105.98514885, ..., -114.6170377 ,
          58.4893689 ,  104.35248976],
       [ 330.75844218,  153.60758373,   92.6357783 , ...,  423.31768081,
         -41.86260281,  148.05078803]])

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
Y

array([[  95.16904348,   57.45346009,  225.38893053, ...,  -64.75616199,
         188.8945732 ,  -33.70269427],
       [ 464.5921978 ,  -63.37621986,  169.13093796, ...,  111.07063995,
         168.79429644,  -10.37157683],
       [ 187.74074729,   13.98971183,  -69.57542164, ..., -198.24823055,
        -112.60513915,  291.71118672],
       ...,
       [  78.24423448,  372.68025681,  318.44314118, ...,  -35.38301754,
         129.08786429,   58.12187412],
       [ 578.24168404,   -4.43048687,  105.98514885, ..., -114.6170377 ,
          58.4893689 ,  104.35248976],
       [ 330.75844218,  153.60758373,   92.6357783 , ...,  423.31768081,
         -41.86260281,  148.05078803]])

In [None]:
Y_pred_svd

array([[ 24.67051479,  34.49577237, 118.69676981, ...,  45.16994753,
         40.87638686,  35.00294726],
       [ 25.69791427,  35.93234307, 123.63987704, ...,  47.05104248,
         42.57867718,  36.46063917],
       [ 25.99077717,  36.34184129, 125.04892262, ...,  47.5872535 ,
         43.06391948,  36.87615804],
       ...,
       [ 36.21049662,  50.63165724, 174.21886078, ...,  66.29882863,
         59.99689429,  51.37607034],
       [ 34.4730385 ,  48.20224057, 165.85946219, ...,  63.11766712,
         57.11811325,  48.91093512],
       [ 37.70739626,  52.72471082, 181.42086501, ...,  69.03954478,
         62.47709583,  53.49989708]])

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
noise = 1

In [None]:
Y = X+noise*Z

In [None]:
Y

array([[68.12573022, 85.86789514, 87.64042265, ..., 60.41803803,
        55.61962411, 69.52009489],
       [58.88621499, 71.37351928, 81.40784367, ..., 53.26873358,
        58.51272292, 42.75301228],
       [58.60040144, 78.6991518 , 82.2939265 , ..., 43.87431982,
        50.24288427, 51.1185672 ],
       ...,
       [59.08905657, 50.49789558, 49.25153011, ..., 77.47529725,
        79.23179168, 80.89412686],
       [76.33347996, 66.66944103, 54.24057175, ..., 79.09937046,
        78.90508292, 84.09418525],
       [80.1650629 , 69.39153864, 58.16028404, ..., 88.55637781,
        88.39440695, 87.28252482]])

In [None]:
X

array([[68., 86., 87., ..., 61., 55., 70.],
       [57., 72., 81., ..., 53., 58., 43.],
       [58., 79., 83., ..., 45., 51., 50.],
       ...,
       [59., 49., 48., ..., 78., 79., 81.],
       [74., 67., 54., ..., 80., 79., 84.],
       [79., 69., 58., ..., 87., 89., 87.]])

In [None]:
U,sing_vals,Vt = np.linalg.svd(Y,full_matrices=False)

In [None]:
sing_vals

array([28456.4364957 ,  5817.78123182,  2848.41744939,  2521.26664527,
        1931.43277349,  1641.2792981 ,  1504.90991761,  1481.4631768 ,
        1384.92909328,  1331.07892568,  1304.96294759,  1154.38636109,
        1094.62909834,  1060.14515863,  1003.3802734 ,   969.05921531,
         957.33987345,   911.85369666,   900.14263579,   857.51273272,
         840.08732906,   831.62450362,   802.63318022,   777.34589876,
         756.44418762,   734.07396716,   714.34535052,   696.91844309,
         685.42515176,   672.12070969,   662.04122357,   643.11565803,
         618.5130138 ,   613.06691629,   597.425414  ,   590.24026183,
         569.4014545 ,   563.88526402,   561.69833388,   547.01046037,
         544.362665  ,   533.39579405,   516.63377507,   497.80499261,
         491.11153048,   483.57798545,   472.81945992,   455.72100341])

In [None]:
c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
sing_val_threshold = omega*np.median(sing_vals)

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

11

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sing_val_threshold = int(lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape)))

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

47

In [None]:
np.median(sing_vals)

766.8950431906712

In [None]:
np.mean(sing_vals)

1595.9427276650865

In [None]:
np.median(sing_vals)

766.8950431906712

In [None]:
sing_val_threshold = omega*np.median(sing_vals)

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

11

In [None]:
sing_val_threshold

1108.4736093609672

In [None]:
sing_vals

array([28456.4364957 ,  5817.78123182,  2848.41744939,  2521.26664527,
        1931.43277349,  1641.2792981 ,  1504.90991761,  1481.4631768 ,
        1384.92909328,  1331.07892568,  1304.96294759,  1154.38636109,
        1094.62909834,  1060.14515863,  1003.3802734 ,   969.05921531,
         957.33987345,   911.85369666,   900.14263579,   857.51273272,
         840.08732906,   831.62450362,   802.63318022,   777.34589876,
         756.44418762,   734.07396716,   714.34535052,   696.91844309,
         685.42515176,   672.12070969,   662.04122357,   643.11565803,
         618.5130138 ,   613.06691629,   597.425414  ,   590.24026183,
         569.4014545 ,   563.88526402,   561.69833388,   547.01046037,
         544.362665  ,   533.39579405,   516.63377507,   497.80499261,
         491.11153048,   483.57798545,   472.81945992,   455.72100341])

In [None]:
sing_val_threshold

1108.4736093609672

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

11

In [None]:
X = rng.random(size=(10,10))

In [None]:
Z = rng.standard_normal(size=(10,10))

In [None]:
X

array([[0.10186647, 0.40782774, 0.87289561, 0.11790831, 0.16177086,
        0.04194028, 0.18061318, 0.50067877, 0.05813152, 0.95551892],
       [0.35299941, 0.8786263 , 0.89566036, 0.74869773, 0.91633954,
        0.02749271, 0.49587352, 0.71296785, 0.13547264, 0.41437353],
       [0.59617335, 0.40192685, 0.41168942, 0.327103  , 0.68097894,
        0.97119811, 0.23414406, 0.25469538, 0.71087079, 0.99985472],
       [0.81922049, 0.93214337, 0.53380776, 0.76826884, 0.91914922,
        0.26507948, 0.98667707, 0.93522698, 0.01239166, 0.64897035],
       [0.75755508, 0.95668616, 0.9628079 , 0.64835284, 0.7791146 ,
        0.37377191, 0.8298038 , 0.1277525 , 0.81117284, 0.50974694],
       [0.58295559, 0.16392991, 0.41225434, 0.89800793, 0.6398117 ,
        0.6697343 , 0.02006882, 0.69949428, 0.42769145, 0.01451264],
       [0.63390043, 0.35489847, 0.38931791, 0.0204773 , 0.26125328,
        0.69832169, 0.74885055, 0.18311796, 0.09670195, 0.48358575],
       [0.12614437, 0.49397433, 0.1383186

In [None]:
noise = 2

In [None]:
Y = X+noise*Z

In [None]:
U,sing_vals,Vt = np.linalg.svd(Y,full_matrices=False)

In [None]:
sing_vals

array([11.48035097, 10.08514064,  8.33414378,  7.0823964 ,  5.99041788,
        4.41866457,  3.82408285,  3.12988612,  1.16585823,  0.23908889])

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)
sing_val_threshold = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0]

IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
sing_val_threshold

215.16349824122585

In [None]:
sing_vals

array([11.48035097, 10.08514064,  8.33414378,  7.0823964 ,  5.99041788,
        4.41866457,  3.82408285,  3.12988612,  1.16585823,  0.23908889])

In [None]:
lambda_beta*noise*np.sqrt(max(Y.shape))

9.055202352694836

In [None]:
np.argwhere(sing_vals>=9.055)

array([[0],
       [1]], dtype=int64)

In [None]:
np.argwhere(sing_vals>=9.055)[-1][0]

1

In [None]:
s = 1

In [None]:
Psi = U[:,:s]

In [None]:
Psi

array([[-0.33101352],
       [-0.31644685],
       [-0.31626177],
       [-0.62740921],
       [ 0.00285527],
       [-0.39134299],
       [-0.32399259],
       [-0.03693437],
       [-0.13622621],
       [-0.13629822]])

In [None]:
Psi = U[:,:2]

In [None]:
Psi.shape

(10, 2)

In [None]:
Psi@Psi.T@Y

array([[-0.12476645, -0.26184253,  2.18104908,  0.03789454,  1.44499586,
         2.15648619,  2.68658102,  0.69626772, -0.58554678,  1.26345787],
       [ 0.90803783,  0.46959065,  1.34820711, -0.012431  ,  1.47467246,
         2.15896577,  1.35582722,  1.1422445 ,  0.53780547, -0.35870037],
       [ 2.4673982 ,  1.56244081,  0.22855407, -0.08630683,  1.61542593,
         2.30556429, -0.48608884,  1.86528003,  2.20408271, -2.73718027],
       [ 1.78197442,  0.91817393,  2.6862195 , -0.02377678,  2.92211952,
         4.27877281,  2.70983035,  2.25617252,  1.04667025, -0.68318113],
       [ 0.32310014,  0.22792326, -0.24979176, -0.0155793 ,  0.01677087,
         0.01192293, -0.40325543,  0.14339543,  0.34910198, -0.501955  ],
       [ 1.26214463,  0.67827545,  1.567459  , -0.02196597,  1.83633244,
         2.6831405 ,  1.51243331,  1.47716739,  0.81380744, -0.65585468],
       [-1.35513309, -1.12034701,  3.01919231,  0.09549152,  1.30240686,
         1.99386964,  4.08490951,  0.10944958

In [None]:
Y_pred_svd = Psi@Psi.T@Y

In [None]:
mean_squared_error(X,Y)

3.770620980381664

In [None]:
mean_squared_error(X,Y_pred_svd)

1.911574655877629

In [None]:
(4/np.sqrt(3))*noise*np.sqrt(X.shape[0])

14.605934866804432

In [None]:
sing_vals

array([11.48035097, 10.08514064,  8.33414378,  7.0823964 ,  5.99041788,
        4.41866457,  3.82408285,  3.12988612,  1.16585823,  0.23908889])

In [None]:
lambda_beta

1.4317532054115514

In [None]:
beta = 1

In [None]:
beta

1

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)

In [None]:
lambda_beta

2.309401076758503

In [None]:
4/np.sqrt(3)

2.3094010767585034

In [None]:
sing_val_threshold = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

In [None]:
sing_val_threshold

347.05619141574175

In [None]:
np.argwhere(sing_vals>=sing_val_threshold)[-1][0] + 1

IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
abs_path = os.path.dirname(os.path.realpath(__file__))
files_path = os.path.abspath(os.path.join(abs_path,os.pardir)) + '/files/catalonia/'
results_path = os.path.abspath(os.path.join(abs_path,os.pardir)) + '/test/'

In [None]:
pollutant = 'O3'
start_date = '2011-01-01'
end_date = '2022-12-31'
N=48
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
X_train, X_test = train_test_split(dataset.ds, test_size= 1 - train_ratio,shuffle=False,random_state=92)
X_val, X_test = train_test_split(X_test, test_size=test_ratio/(test_ratio + validation_ratio),shuffle=False,random_state=92) 
print(f'Dataset matrix summary:\n {train_ratio} of dataset for training set with {X_train.shape[0]} measurements from {X_train.index[0]} until {X_train.index[-1]}\n {validation_ratio} of dataset for validation set with {X_val.shape[0]} measurements from {X_val.index[0]} until {X_val.index[-1]}\n {test_ratio} of measuerements for testing set with {X_test.shape[0]} measurements from {X_test.index[0]} until {X_test.index[-1]}')

Dataset matrix summary:
 0.75 of dataset for training set with 5646 measurements from 2020-09-24 13:00:00 until 2022-05-07 12:00:00
 0.15 of dataset for validation set with 1129 measurements from 2022-05-07 13:00:00 until 2022-09-20 07:00:00
 0.1 of measuerements for testing set with 753 measurements from 2022-09-20 15:00:00 until 2022-12-31 23:00:00


In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
t = np.arange(-3,3,.01)

In [None]:
t

array([-3.00000000e+00, -2.99000000e+00, -2.98000000e+00, -2.97000000e+00,
       -2.96000000e+00, -2.95000000e+00, -2.94000000e+00, -2.93000000e+00,
       -2.92000000e+00, -2.91000000e+00, -2.90000000e+00, -2.89000000e+00,
       -2.88000000e+00, -2.87000000e+00, -2.86000000e+00, -2.85000000e+00,
       -2.84000000e+00, -2.83000000e+00, -2.82000000e+00, -2.81000000e+00,
       -2.80000000e+00, -2.79000000e+00, -2.78000000e+00, -2.77000000e+00,
       -2.76000000e+00, -2.75000000e+00, -2.74000000e+00, -2.73000000e+00,
       -2.72000000e+00, -2.71000000e+00, -2.70000000e+00, -2.69000000e+00,
       -2.68000000e+00, -2.67000000e+00, -2.66000000e+00, -2.65000000e+00,
       -2.64000000e+00, -2.63000000e+00, -2.62000000e+00, -2.61000000e+00,
       -2.60000000e+00, -2.59000000e+00, -2.58000000e+00, -2.57000000e+00,
       -2.56000000e+00, -2.55000000e+00, -2.54000000e+00, -2.53000000e+00,
       -2.52000000e+00, -2.51000000e+00, -2.50000000e+00, -2.49000000e+00,
       -2.48000000e+00, -

In [None]:
np.cos(17*t)

array([ 7.42154197e-01,  8.44846817e-01,  9.23182111e-01,  9.74901633e-01,
        9.98514287e-01,  9.93339308e-01,  9.59525894e-01,  8.98048902e-01,
        8.10680741e-01,  6.99940276e-01,  5.69020206e-01,  4.21695019e-01,
        2.62212168e-01,  9.51696180e-02, -7.46167166e-02, -2.42251816e-01,
       -4.02902684e-01, -5.51937678e-01, -6.85060053e-01, -7.98431826e-01,
       -8.88784438e-01, -9.53512980e-01, -9.90751299e-01, -9.99425796e-01,
       -9.79286381e-01, -9.30913683e-01, -8.55702309e-01, -7.55820640e-01,
       -6.34148308e-01, -4.94193186e-01, -3.39990243e-01, -1.75985224e-01,
       -6.90646804e-03,  1.62371404e-01,  3.26968034e-01,  4.82138022e-01,
        6.23407746e-01,  7.46704335e-01,  8.48473089e-01,  9.25779969e-01,
        9.76396180e-01,  9.98862435e-01,  9.92531020e-01,  9.57584473e-01,
        8.95030319e-01,  8.06672024e-01,  6.95056998e-01,  5.63403155e-01,
        4.15506136e-01,  2.55629882e-01,  8.83836993e-02, -8.14106268e-02,
       -2.48857847e-01, -

In [None]:
np.cos(17*t).shape

(600,)

In [None]:
Utrue = np.array([np.cos(17*t)*np.exp(-t**2),np.sin(11*t)]).T

In [None]:
Utrue.shape

(600, 2)

In [None]:
t.shape

(600,)

In [None]:
Strue = np.array([[2,0],[0,0.5]])

In [None]:
S_true

NameError: name 'S_true' is not defined

In [None]:
Strue

array([[2. , 0. ],
       [0. , 0.5]])

In [None]:
Vtrue = np.array([np.sin(5*t)*np.exp(-t**2),np.cos(13*t)]).T

In [None]:
Vtrue.shape

(600, 2)

In [None]:
X = Utrue@Strue@Vtrue.T

In [None]:
X.shape

(600, 600)

In [None]:
plt.imshow(X)

<matplotlib.image.AxesImage at 0x22f1916d650>

In [None]:
plt.set_cmap('gray')

In [None]:
pl.show()

NameError: name 'pl' is not defined

In [None]:
sigma = 1

In [None]:
Z = np.random.randn(*X.shape)

In [None]:
Z

array([[-0.35446188, -0.05586612,  0.54449806, ..., -0.45810684,
         0.26864746,  0.40126865],
       [-0.30001117, -2.31445156, -0.48211481, ...,  0.27671388,
         0.29238106, -0.39383658],
       [-0.30004989,  1.04700004,  2.43779219, ..., -1.60828999,
        -0.42685983, -0.22174075],
       ...,
       [ 0.25809518,  1.74686094, -1.00337416, ...,  0.18842768,
        -0.17495119, -2.30986075],
       [-0.82308761, -2.19857818,  0.88482936, ..., -0.33514208,
         0.19612271, -0.5641815 ],
       [-0.0461484 ,  0.91878453,  0.88688257, ..., -1.1044723 ,
        -1.09400519, -0.71205586]])

In [None]:
*X.shape

SyntaxError: can't use starred expression here (<ipython-input-593-b12577387d21>, line 1)

In [None]:
Xnoisy = X+sigma*Z

In [None]:
plt.imshow(Xnoisy)

<matplotlib.image.AxesImage at 0x22f1d85d650>

In [None]:
plt.set_cmap('gray')

In [None]:
mean_squared_error(Xtrue,Xnoisy)

NameError: name 'Xtrue' is not defined

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
sigma

1

In [None]:
error = X - Xnoisy

In [None]:
error.var(axis=0)

array([0.84220985, 0.98377368, 0.95484659, 0.96166736, 0.88394475,
       0.97576372, 1.07592259, 0.9990873 , 1.02270451, 1.03416023,
       0.98352193, 0.96967989, 1.04656221, 1.05442172, 0.99028475,
       0.99596914, 0.96546945, 0.85936   , 0.92958955, 1.06375341,
       1.07796238, 1.00385584, 1.00091405, 0.9677384 , 1.01795831,
       0.95555833, 0.97221666, 0.95378017, 0.91480097, 0.90841288,
       0.92865704, 0.86499194, 0.93878286, 0.91276049, 0.90838055,
       0.97704635, 0.93083784, 1.10633689, 1.08938272, 0.98690308,
       0.96840058, 0.95533111, 0.90176341, 1.01818897, 0.94398791,
       0.98551595, 1.00410954, 1.06912365, 0.94963772, 1.01712184,
       1.00059876, 0.95957026, 0.89481916, 0.9741439 , 0.94671166,
       0.9420462 , 1.01475299, 0.97941255, 1.02733658, 0.91485605,
       1.00786341, 1.05569782, 0.94800635, 1.04215724, 1.01325311,
       1.01928389, 1.02614479, 0.93759507, 0.94927326, 1.04612532,
       0.94327596, 0.9328928 , 1.02361555, 1.04327243, 0.87948

In [None]:
X.shape

(600, 600)

In [None]:
error.var(axis=0)

array([0.84220985, 0.98377368, 0.95484659, 0.96166736, 0.88394475,
       0.97576372, 1.07592259, 0.9990873 , 1.02270451, 1.03416023,
       0.98352193, 0.96967989, 1.04656221, 1.05442172, 0.99028475,
       0.99596914, 0.96546945, 0.85936   , 0.92958955, 1.06375341,
       1.07796238, 1.00385584, 1.00091405, 0.9677384 , 1.01795831,
       0.95555833, 0.97221666, 0.95378017, 0.91480097, 0.90841288,
       0.92865704, 0.86499194, 0.93878286, 0.91276049, 0.90838055,
       0.97704635, 0.93083784, 1.10633689, 1.08938272, 0.98690308,
       0.96840058, 0.95533111, 0.90176341, 1.01818897, 0.94398791,
       0.98551595, 1.00410954, 1.06912365, 0.94963772, 1.01712184,
       1.00059876, 0.95957026, 0.89481916, 0.9741439 , 0.94671166,
       0.9420462 , 1.01475299, 0.97941255, 1.02733658, 0.91485605,
       1.00786341, 1.05569782, 0.94800635, 1.04215724, 1.01325311,
       1.01928389, 1.02614479, 0.93759507, 0.94927326, 1.04612532,
       0.94327596, 0.9328928 , 1.02361555, 1.04327243, 0.87948

In [None]:
error.var(axis=0).mean()

0.9987486584255093

In [None]:
error.mean().mean()

0.0006814631504687083

In [None]:
(error**2).mean().mean()

1.0002968783133452

In [None]:
error.var(axis=1,ddof=0).mean()

0.998659872649295

In [None]:
error.var(axis=0,ddof=0).mean()

0.9987486584255093

In [None]:
error.var(axis=1,ddof=1).mean()

1.000327084456723

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
U,S,VT = np.linalg.svd(Xnoisy,full_matrices=False)

In [None]:
N = Xnoisy.shape[0]

In [None]:
N

600

In [None]:
cutoff = (4/np.sqrt(3))*np.sqrt(N)*sigma

In [None]:
cutoff

56.56854249492381

In [None]:
S

array([1.55593245e+02, 1.30257468e+02, 4.81772752e+01, 4.79172762e+01,
       4.76087063e+01, 4.75032986e+01, 4.71871433e+01, 4.69610139e+01,
       4.68661012e+01, 4.66278385e+01, 4.64821105e+01, 4.62537677e+01,
       4.60441682e+01, 4.57714481e+01, 4.55465832e+01, 4.53702264e+01,
       4.52458268e+01, 4.52241670e+01, 4.50018353e+01, 4.47264644e+01,
       4.46663909e+01, 4.44252995e+01, 4.43610098e+01, 4.42998225e+01,
       4.40541965e+01, 4.39131730e+01, 4.37433939e+01, 4.36598257e+01,
       4.35134968e+01, 4.33849128e+01, 4.31765639e+01, 4.31083638e+01,
       4.29291204e+01, 4.27536611e+01, 4.26318987e+01, 4.26204084e+01,
       4.24106353e+01, 4.22337586e+01, 4.21642606e+01, 4.20362963e+01,
       4.18680779e+01, 4.17773633e+01, 4.16715090e+01, 4.15409003e+01,
       4.13700429e+01, 4.12262054e+01, 4.11428503e+01, 4.10484781e+01,
       4.09077250e+01, 4.08235763e+01, 4.07501301e+01, 4.06592934e+01,
       4.04995999e+01, 4.02829352e+01, 4.02396732e+01, 4.01443202e+01,
      

In [None]:
np.where(S>cutoff)

(array([0, 1], dtype=int64),)

In [None]:
S

array([1.55593245e+02, 1.30257468e+02, 4.81772752e+01, 4.79172762e+01,
       4.76087063e+01, 4.75032986e+01, 4.71871433e+01, 4.69610139e+01,
       4.68661012e+01, 4.66278385e+01, 4.64821105e+01, 4.62537677e+01,
       4.60441682e+01, 4.57714481e+01, 4.55465832e+01, 4.53702264e+01,
       4.52458268e+01, 4.52241670e+01, 4.50018353e+01, 4.47264644e+01,
       4.46663909e+01, 4.44252995e+01, 4.43610098e+01, 4.42998225e+01,
       4.40541965e+01, 4.39131730e+01, 4.37433939e+01, 4.36598257e+01,
       4.35134968e+01, 4.33849128e+01, 4.31765639e+01, 4.31083638e+01,
       4.29291204e+01, 4.27536611e+01, 4.26318987e+01, 4.26204084e+01,
       4.24106353e+01, 4.22337586e+01, 4.21642606e+01, 4.20362963e+01,
       4.18680779e+01, 4.17773633e+01, 4.16715090e+01, 4.15409003e+01,
       4.13700429e+01, 4.12262054e+01, 4.11428503e+01, 4.10484781e+01,
       4.09077250e+01, 4.08235763e+01, 4.07501301e+01, 4.06592934e+01,
       4.04995999e+01, 4.02829352e+01, 4.02396732e+01, 4.01443202e+01,
      

In [None]:
cutoff

56.56854249492381

In [None]:
np.where(S>cutoff)

(array([0, 1], dtype=int64),)

In [None]:
np.max(np.where(S>cutoff))

1

In [None]:
r = np.max(np.where(S>cutoff))

In [None]:
Xclean = U[:,:(r+1)]@np.diag(S[:(r+1)])@VT[:(r+1),:]

In [None]:
plt.imshow(Xclean)

<matplotlib.image.AxesImage at 0x22f1d90dd90>

In [None]:
plt.set_cmap('gray')

In [None]:
cdS = np.cumsum(S)/np.sum(S)

In [None]:
r90 = np.min(np.where(cdS>0.90))

In [None]:
r90

401

In [None]:
X90 = U[:,:(r90+1)]@np.diag(S[:(r90+1)])@VT[:(r90+1),:]

In [None]:
X90

array([[-0.53420027,  0.08517644,  0.36265568, ..., -0.62166513,
         0.02023929,  0.44767392],
       [-0.3355576 , -2.32283341, -0.82928927, ...,  0.00572195,
        -0.04537221, -0.61011196],
       [-0.28222935,  0.94977401,  2.27532881, ..., -1.7338194 ,
        -0.71479906, -0.60228571],
       ...,
       [ 0.30523974,  1.55570836, -0.89616022, ...,  0.62822147,
         0.01168559, -1.93994237],
       [-0.90783976, -2.07884223,  0.913335  , ...,  0.12840733,
         0.71591043, -0.32193195],
       [-0.04964628,  1.16631139,  1.02437077, ..., -0.98016609,
        -0.46301736, -0.54590359]])

In [None]:
plt.imshow(X90)

<matplotlib.image.AxesImage at 0x22f1df2a150>

In [None]:
plt.set_cmap('gray')

In [None]:
plt.plot(S)

[<matplotlib.lines.Line2D at 0x22f1dfb8c50>]

In [None]:
plt.semilogy(S)

[<matplotlib.lines.Line2D at 0x22f1e0aa6d0>]

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
mean_squared_error(X,Xclean)

0.006956486773803818

In [None]:
mean_squared_error(X,X90)

0.9703086893992201

In [None]:
mean_squared_error(Xclean,X)

0.006956486773803818

In [None]:
error = Xclean - X

In [None]:
error.var(axis=0,ddof=0)

array([2.46701190e-03, 1.86877341e-03, 8.30677174e-03, 6.00019480e-03,
       5.46868185e-03, 2.50325326e-03, 2.54997421e-03, 5.03623287e-03,
       4.88159893e-03, 5.54417687e-03, 9.50793471e-03, 5.72017342e-03,
       1.42394633e-02, 1.08573803e-02, 4.43431702e-03, 3.83494965e-03,
       1.03949596e-02, 4.19618714e-03, 5.90184607e-03, 1.00147025e-02,
       2.08878445e-03, 3.08747002e-03, 1.50315229e-03, 4.23744473e-03,
       4.43628296e-03, 2.02775245e-03, 1.04513861e-03, 6.46086605e-03,
       2.08136735e-03, 1.71822330e-02, 2.85058467e-03, 2.95857888e-03,
       6.28101298e-03, 6.23067004e-03, 5.29722841e-03, 8.44404398e-03,
       3.81461186e-03, 9.99450135e-03, 9.22970874e-03, 5.17057130e-03,
       2.00824570e-03, 3.84828374e-03, 3.02156303e-03, 2.09370656e-03,
       3.98436024e-03, 3.85549119e-04, 1.69325234e-03, 9.29354173e-04,
       2.99310896e-03, 5.09557408e-03, 3.21740473e-03, 1.59684564e-02,
       3.46236929e-03, 4.12953516e-03, 4.31271075e-03, 4.31031985e-03,
      

In [None]:
error.var(axis=0,ddof=0).mean()

0.006955924931034872

In [None]:
error.var(axis=1,ddof=0).mean()

0.00695276679807832

In [None]:
mean_squared_error(Xclean,X)

0.006956486773803818

In [None]:
mean_squared_error(Xnoisy,X)

1.0002968783133455

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
mean_squared_error(Xclean,Xnoisy)

0.9934791344457543

In [None]:
mean_squared_error(Xclean,X)

0.006956486773803818

In [None]:
np.sqrt(mean_squared_error(Xclean,X))

0.083405556012797

In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
s = 36

In [None]:
Psi = U[:,:s]

In [None]:
Psi.shape

(48, 36)

In [None]:
cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)

In [None]:
cumulative_energy[s]

0.9266528524452271

In [None]:
cumulative_energy[s-1]

0.9192565753256522

In [None]:
np.where(cumulative_energy>0.9)

(array([33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
       dtype=int64),)

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train_centered,full_matrices=False)

In [None]:
cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)

In [None]:
np.where(cumulative_energy>0.9)

(array([36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=int64),)

In [None]:
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)

In [None]:
cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)

In [None]:
np.where(cumulative_energy>0.9)

(array([33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
       dtype=int64),)

In [None]:
cumulative_energy[33]

0.9038588720120724

In [None]:
s = 33

In [None]:
s

33

In [None]:
Psi = U[:,:s]

In [None]:
snapshots_matrix_pred_svd = Psi@Psi.T@snapshots_matrix_train

In [None]:
snapshots_matrix_pred_svd

array([[68.66772609, 86.49895705, 87.09423734, ..., 59.02904448,
        56.57470588, 62.47872125],
       [54.65687854, 74.48824078, 78.75658561, ..., 48.73337652,
        52.00705963, 49.80722095],
       [59.49491813, 80.4566341 , 86.24885494, ..., 54.0937201 ,
        56.23017247, 56.13541987],
       ...,
       [61.69344419, 52.82973943, 49.03178604, ..., 74.15794185,
        80.94210836, 82.41643643],
       [72.64077492, 65.80299566, 53.68312999, ..., 80.70119494,
        79.71510058, 84.89256859],
       [78.80857672, 67.57042266, 58.27216712, ..., 89.23382247,
        89.2936192 , 87.89431153]])

In [None]:
error = snapshots_matrix_train - snapshots_matrix_pred_svd

In [None]:
error.var(axis=1,ddof=0)

array([12.39910303, 27.55967844, 27.79676169, 21.10645645, 14.68785042,
       27.38684651, 12.99838751, 21.31418239, 24.62931366,  5.26845959,
       27.5751316 , 38.25863694, 23.21801884, 10.67340145, 15.99803302,
       23.30764875, 20.18298799,  5.42330648,  2.90037937, 26.28057788,
        7.55370693, 18.1361257 ,  4.36638833, 28.99334814,  9.9155426 ,
       18.31298191, 26.06241954, 16.55579787, 12.41763078,  7.29507   ,
       12.09902736, 34.75487297,  1.01351348, 26.47467707, 21.99724984,
        3.90259009,  5.70719283,  6.90254433,  1.18866172, 25.76137962,
       18.84474363,  1.84905648, 21.89249633,  6.00417757,  0.46997852,
       19.96582665,  1.29728499,  4.39096724])

In [None]:
error.var(axis=1,ddof=0).shape

(48,)

In [None]:
error.var(axis=1,ddof=0).mean()

15.689383636350188

In [None]:
np.sqrt(error.var(axis=1,ddof=0).mean())

3.960982660445535

In [None]:
ppb

1.96

In [None]:
ppb**2

3.8415999999999997

In [None]:
mean_squared_error(snapshots_matrix_train,snapshots_matrix_pred_svd)

15.690298459024168

In [None]:
error.var(axis=1,ddof=0)

array([12.39910303, 27.55967844, 27.79676169, 21.10645645, 14.68785042,
       27.38684651, 12.99838751, 21.31418239, 24.62931366,  5.26845959,
       27.5751316 , 38.25863694, 23.21801884, 10.67340145, 15.99803302,
       23.30764875, 20.18298799,  5.42330648,  2.90037937, 26.28057788,
        7.55370693, 18.1361257 ,  4.36638833, 28.99334814,  9.9155426 ,
       18.31298191, 26.06241954, 16.55579787, 12.41763078,  7.29507   ,
       12.09902736, 34.75487297,  1.01351348, 26.47467707, 21.99724984,
        3.90259009,  5.70719283,  6.90254433,  1.18866172, 25.76137962,
       18.84474363,  1.84905648, 21.89249633,  6.00417757,  0.46997852,
       19.96582665,  1.29728499,  4.39096724])

In [None]:
error.var(axis=1,ddof=0).max()

38.25863694338593

In [None]:
ppb**2

3.8415999999999997

In [None]:
error.var(axis=1,ddof=0).sum()

753.090414544809

In [None]:
error.var(axis=1,ddof=0).sum()/error.shape[0]

15.689383636350188

In [None]:
error.shape

(48, 5646)

In [None]:
753.09/48

15.689375

In [None]:
error

array([[-0.66772609, -0.49895705, -0.09423734, ...,  1.97095552,
        -1.57470588,  7.52127875],
       [ 2.34312146, -2.48824078,  2.24341439, ...,  4.26662348,
         5.99294037, -6.80722095],
       [-1.49491813, -1.4566341 , -3.24885494, ..., -9.0937201 ,
        -5.23017247, -6.13541987],
       ...,
       [-2.69344419, -3.82973943, -1.03178604, ...,  3.84205815,
        -1.94210836, -1.41643643],
       [ 1.35922508,  1.19700434,  0.31687001, ..., -0.70119494,
        -0.71510058, -0.89256859],
       [ 0.19142328,  1.42957734, -0.27216712, ..., -2.23382247,
        -0.2936192 , -0.89431153]])

In [None]:
(error**2).mean()

15.690298459024165

In [None]:
(error**2).mean(axis=0)

array([ 7.11475648,  6.8033795 ,  7.55739459, ..., 14.93601017,
       10.18871428, 19.11943055])

In [None]:
(error**2).mean(axis=0).mean()

15.690298459024168

In [None]:
(error**2).mean(axis=1).mean()

15.690298459024168

In [None]:
(error**2).mean(axis=1)

array([12.40136891, 27.56405843, 27.79681034, 21.1066495 , 14.68881865,
       27.38754654, 13.00026996, 21.31648379, 24.63038846,  5.26846117,
       27.57883197, 38.26222402, 23.21845161, 10.67343526, 15.99809924,
       23.30767188, 20.18305461,  5.4233803 ,  2.9006168 , 26.28129937,
        7.55380764, 18.13643651,  4.36639142, 28.99465855,  9.9155516 ,
       18.31327177, 26.06242892, 16.55651054, 12.41864899,  7.29648551,
       12.10048953, 34.76092186,  1.01354307, 26.47479946, 21.99752996,
        3.90370618,  5.70749121,  6.90291617,  1.1886622 , 25.76407098,
       18.84494588,  1.84914257, 21.89490197,  6.00485637,  0.46998245,
       19.96582753,  1.29730667,  4.3911197 ])

In [None]:
(error**2).mean(axis=0)

array([ 7.11475648,  6.8033795 ,  7.55739459, ..., 14.93601017,
       10.18871428, 19.11943055])

In [None]:
(error**2).mean(axis=0).shape

(5646,)

In [None]:
(error**2).mean(axis=1)

array([12.40136891, 27.56405843, 27.79681034, 21.1066495 , 14.68881865,
       27.38754654, 13.00026996, 21.31648379, 24.63038846,  5.26846117,
       27.57883197, 38.26222402, 23.21845161, 10.67343526, 15.99809924,
       23.30767188, 20.18305461,  5.4233803 ,  2.9006168 , 26.28129937,
        7.55380764, 18.13643651,  4.36639142, 28.99465855,  9.9155516 ,
       18.31327177, 26.06242892, 16.55651054, 12.41864899,  7.29648551,
       12.10048953, 34.76092186,  1.01354307, 26.47479946, 21.99752996,
        3.90370618,  5.70749121,  6.90291617,  1.1886622 , 25.76407098,
       18.84494588,  1.84914257, 21.89490197,  6.00485637,  0.46998245,
       19.96582753,  1.29730667,  4.3911197 ])

In [None]:
(error**2).mean(axis=1).mean()

15.690298459024168

In [None]:
error.var(axis=1,ddof=0).mean()

15.689383636350188

In [None]:
(error**2).mean(axis=1)

array([12.40136891, 27.56405843, 27.79681034, 21.1066495 , 14.68881865,
       27.38754654, 13.00026996, 21.31648379, 24.63038846,  5.26846117,
       27.57883197, 38.26222402, 23.21845161, 10.67343526, 15.99809924,
       23.30767188, 20.18305461,  5.4233803 ,  2.9006168 , 26.28129937,
        7.55380764, 18.13643651,  4.36639142, 28.99465855,  9.9155516 ,
       18.31327177, 26.06242892, 16.55651054, 12.41864899,  7.29648551,
       12.10048953, 34.76092186,  1.01354307, 26.47479946, 21.99752996,
        3.90370618,  5.70749121,  6.90291617,  1.1886622 , 25.76407098,
       18.84494588,  1.84914257, 21.89490197,  6.00485637,  0.46998245,
       19.96582753,  1.29730667,  4.3911197 ])

In [None]:
(error**2).mean(axis=0)

array([ 7.11475648,  6.8033795 ,  7.55739459, ..., 14.93601017,
       10.18871428, 19.11943055])

In [None]:
(error**2).mean(axis=0).shape

(5646,)

In [None]:
pd.DataFrame((error**2).mean(axis=0),columns=[s])

Unnamed: 0,33
0,7.114756
1,6.803379
2,7.557395
3,7.206257
4,4.485485
...,...
5641,14.701003
5642,12.553409
5643,14.936010
5644,10.188714


In [None]:
error_variance = error.var(axis=1,ddof=0)# estimated coordiante error variance

In [None]:
error_variance_max = error_variance.max()

In [None]:
error_variance_max

38.25863694338593

In [None]:
error_variance

array([12.39910303, 27.55967844, 27.79676169, 21.10645645, 14.68785042,
       27.38684651, 12.99838751, 21.31418239, 24.62931366,  5.26845959,
       27.5751316 , 38.25863694, 23.21801884, 10.67340145, 15.99803302,
       23.30764875, 20.18298799,  5.42330648,  2.90037937, 26.28057788,
        7.55370693, 18.1361257 ,  4.36638833, 28.99334814,  9.9155426 ,
       18.31298191, 26.06241954, 16.55579787, 12.41763078,  7.29507   ,
       12.09902736, 34.75487297,  1.01351348, 26.47467707, 21.99724984,
        3.90259009,  5.70719283,  6.90254433,  1.18866172, 25.76137962,
       18.84474363,  1.84905648, 21.89249633,  6.00417757,  0.46997852,
       19.96582665,  1.29728499,  4.39096724])

In [None]:
error_variance.mean()

15.689383636350188

In [None]:
mse.mean()

4    133.45468
dtype: float64

In [None]:
mse

Unnamed: 0,4
2020-09-24 13:00:00,40.859863
2020-09-24 14:00:00,83.110628
2020-09-24 15:00:00,116.275952
2020-09-24 16:00:00,65.849279
2020-09-24 17:00:00,45.622997
...,...
2022-05-07 08:00:00,197.668997
2022-05-07 09:00:00,173.715170
2022-05-07 10:00:00,120.318898
2022-05-07 11:00:00,107.823840


In [None]:
mse = pd.DataFrame((error**2).mean(axis=0),columns=[s])

In [None]:
mse.mean()

33    15.690298
dtype: float64

In [None]:
error_variance.mean()

15.689383636350188

In [None]:
error_variance_max = error_variance.max()

In [None]:
error_variance_sparsity = pd.DataFrame()

In [None]:
error_variance_sparsity = pd.concat((error_variance_sparsity,error_variance_max),axis=0)

TypeError: cannot concatenate object of type '<class 'numpy.float64'>'; only Series and DataFrame objs are valid

In [None]:
error_variance_max = pd.DataFrame(error_variance.max())

ValueError: DataFrame constructor not properly called!

In [None]:
error_variance_max = pd.DataFrame(error_variance.max(),index=s)

ValueError: DataFrame constructor not properly called!

In [None]:
error_variance_max = pd.DataFrame([error_variance.max()],index=s)

TypeError: Index(...) must be called with a collection of some kind, 33 was passed

In [None]:
error_variance_max = pd.DataFrame([error_variance.max()],index=[s])

In [None]:
error_variance_max

Unnamed: 0,0
33,38.258637


In [None]:
error_variance_sparsity = pd.concat((error_variance_sparsity,error_variance_max),axis=0)

In [None]:
error_variance_sparsity

Unnamed: 0,0
33,38.258637


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 17 17:21:04 2023

@author: jparedes
"""
import os
import time
import pandas as pd
import geopy.distance
from sklearn.model_selection import train_test_split
from abc import ABC,abstractmethod
import numpy as np
import sys
import warnings
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame

import sensor_placement as sp


""" Obtain signal sparsity and reconstruct signal at different temporal regimes"""

# perturbate measurements
def add_noise_signal(X:pd.DataFrame,seed:int=92,var:float=1.)->pd.DataFrame:
    """
    Add noise to measurements dataset. The noise ~N(0,var).
    The noise is the same for all sensors during all the time.

    Args:
        X (pd.DataFrame): dataset with measurements
        seed (int): random number generator seed
        var (float): noise variance

    Returns:
        pd.DataFrame: _description_
    """
    rng = np.random.default_rng(seed=seed)
    noise = rng.normal(loc=0.0,scale=var,size=X.shape)
    X_noisy = X + noise
    #X_noisy[X_noisy<0] = 0.
    return X_noisy

# ROI classes
class roi_generator(ABC):
    @abstractmethod
    def generate_rois(self,**kwargs):
        raise NotImplementedError
    
class RandomRoi(roi_generator):
    """ Regions of Interest randomly generated from rng seed"""
    def generate_rois(self,**kwargs)->dict:
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions = kwargs['n_regions']
        rng = np.random.default_rng(seed=seed)    
        indices = np.arange(0,n,1)
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions)}
        indices_split = np.array_split(indices_perm,n_regions)
        for i in np.arange(n_regions):
            roi_idx[i] = indices_split[i]
        return roi_idx
    
class SubSplitRandomRoi(roi_generator):
    """
    Regions of Interest randomly generated. 
    The indices are randomly generated and then some of them are splitted into new sub regions.
    """
    def generate_rois(self,**kwargs):
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions_original = kwargs['n_regions_original']
        rois_split = kwargs['rois_split']
        n_regions_subsplit = kwargs['n_regions_subsplit']
        seed_subsplit = kwargs['seed_subsplit']
        rng = np.random.default_rng(seed=seed)
        indices = np.arange(0,n,1)
        # first split. Original ROIs
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions_original)}
        indices_split = np.array_split(indices_perm,n_regions_original)
        for i in np.arange(n_regions_original):
            roi_idx[i] = indices_split[i]
        # second split. Maintain some ROIs and split others
        new_roi_idx = {}
        rng_subsplit = np.random.default_rng(seed=seed_subsplit)
        for i in roi_idx:
            if i in rois_split:
                indices_roi = roi_idx[i]
                indices_roi_perm = rng_subsplit.permutation(indices_roi)
                indices_roi_split = np.array_split(indices_roi_perm,n_regions_subsplit)
                new_dict = {}
                for j in np.arange(n_regions_subsplit):
                    new_dict[float(f'{i}.{j+1}')] = indices_roi_split[j]
                new_roi_idx.update(new_dict)
            else:
                new_roi_idx[i] = roi_idx[i]
            
        return new_roi_idx
            
    
class VarianceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        coordinate_error_variance_fullymonitored = kwargs['coordinate_error_variance_fullymonitored']
        variance_thresholds = kwargs['variance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {variance_thresholds}')
        if type(variance_thresholds) is not list:
            variance_thresholds = [variance_thresholds]
        if len(variance_thresholds) != n_regions:
            raise ValueError(f'Number of variance thresholds: {variance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in variance_thresholds}
        for i in range(len(variance_thresholds[:-1])):
            print(f'Variance threshold between {variance_thresholds[i]} and {variance_thresholds[i+1]}')
            stations = [j for j in coordinate_error_variance_fullymonitored[np.logical_and(coordinate_error_variance_fullymonitored>=variance_thresholds[i],coordinate_error_variance_fullymonitored<variance_thresholds[i+1])]]
            print(f'{len(stations)} stations')
            idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
            roi_idx[variance_thresholds[i]] = idx_stations
        stations = [j for j in coordinate_error_variance_fullymonitored[coordinate_error_variance_fullymonitored>=variance_thresholds[-1]]]
        print(f'{len(stations)} stations with a distance larger than {variance_thresholds[-1]}')
        idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
        roi_idx[variance_thresholds[-1]] = idx_stations
        return roi_idx
    
class DistanceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        """
        Generates Regions of Interest (ROIs) based on distance from certain station

        Args:        
            distances (pd.Series): distance of each location from origin station
            distance_thresholds (list): thresholds for each ROI
            n_regions (int): number of ROIs

        Raises:
            ValueError: Check if number of specified distance thresholds matches number of ROIs

        Returns:
            dict: Indices of each ROI. Key specifies the distance threshold
        """
        distances = kwargs['distances']
        distance_thresholds = kwargs['distance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {distance_thresholds}')
        if type(distance_thresholds) is not list:
            distance_thresholds = [distance_thresholds]
        if len(distance_thresholds) != n_regions:
            raise ValueError(f'Number of distance thresholds: {distance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in distance_thresholds}
        #distance_thresholds = np.insert(distance_thresholds,0,0)
        for i in range(len(distance_thresholds[:-1])):
            print(f'Distance threshold between {distance_thresholds[i]} and {distance_thresholds[i+1]}')
            stations = [j for j in distances[np.logical_and(distances>=distance_thresholds[i],distances<distance_thresholds[i+1])].index]
            print(f'Stations ({len(stations)}): {stations}')
            idx_stations = np.where(np.isin(distances.index,stations))[0]
            roi_idx[distance_thresholds[i]] = idx_stations
        stations = [j for j in distances[distances>=distance_thresholds[-1]].index]
        print(f'Stations with a distance larger than {distance_thresholds[-1]} ({len(stations)}): {stations}')
        idx_stations = np.where(np.isin(distances.index,stations))[0]
        roi_idx[distance_thresholds[-1]] = idx_stations
        
        return roi_idx


class ROI():
    """
    Region of interest (ROI) class. Select a generator from different roigenerator classes.
    Use as:
        roi = ROI(generator())
        roi.deine_ROIs(**kwargs)
    """
    def __init__(self,generator):
        self._generator = generator
    def define_rois(self,**kwargs)->dict:
        self.roi_idx = self._generator.generate_rois(**kwargs)

# file writer classes
class FileWriter(ABC):
    @abstractmethod
    def save(self,**kwargs):
        raise NotImplementedError

class WriteRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['random_seed']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class WriteSplitRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class SaveLocations():
    def __init__(self,writer):
        self._writer = writer
    def save_locations(self,results_path,locations,**kwargs):
        self._writer.save(results_path,locations,**kwargs)

# file reader class
class FileReader(ABC):
    @abstractmethod
    def load(self,**kwargs):
        raise NotImplementedError

class ReadRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['signal_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
class ReadSplitRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']

        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        random_seed = kwargs['random_seed']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadSplitRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        try:
            with open(fname,'rb') as f:
                locations_monitored = np.sort(pickle.load(f))
            print(f'Loaded file {fname}')
        except:
            warnings.warn(f'No file {fname}')
            return 
        return locations_monitored
    
class ReadLocations():
    def __init__(self,reader):
        self._reader = reader
    def load_locations(self,file_path,**kwargs):
        locations_monitored = self._reader.load(file_path,**kwargs)
        return locations_monitored


# signal reconstruction functions
def singular_value_hard_threshold(snapshots_matrix:np.ndarray,sing_vals:np.array,noise:float=-1)->float:
    """
    Compute singular value hard threshold from Gavish-Donoho approximation

    Args:
        snapshots_matrix (np.ndarray): snapshots matrix used for computing SVD
        sing_vals (np.array): corresponding array of singular values
        noise (float,optional): noise () deviation from signal

    Returns:
        float: cut-off index
    """
    beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]
    if noise == -1:#unknown noise
        c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
        omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
        sing_val_threshold = omega*np.median(sing_vals)
        
    else:#known noise
        t1 = 2*(beta+1)
        t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
        lambda_beta = np.sqrt(t1+t2)
        sing_val_threshold = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))
    
    sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0] + 1
    return sparsity_gd
def signal_reconstruction_svd(U:np.ndarray,snapshots_matrix:np.ndarray,s_range:np.ndarray) -> pd.DataFrame:
    """
    Decompose signal keeping s-first singular vectors using training set data
    and reconstruct validation set.

    Args:
        U (numpy array): left singular vectors matrix
        snapshots_matrix (numpy array): snaphots matrix data.
        s_range (numpy array): list of sparsity values to test

    Returns:
        rmse_sparsity: dataframe containing reconstruction errors at different times for each sparsity threshold in the range
    """
    print(f'Determining signal sparsity by decomposing training set and reconstructing validation set.\nRange of sparsity levels: {s_range}')
    mse_sparsity = pd.DataFrame()
    error_variance_sparsity = pd.DataFrame()
    for s in s_range:
        # projection
        Psi = U[:,:s]
        #snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]
        snapshots_matrix_pred_svd = Psi@Psi.T@snapshots_matrix
        
        #RMSE across different signal measurements
        # estimated covariance
        error = snapshots_matrix - snapshots_matrix_pred_svd
        error_variance = error.var(axis=1,ddof=0)# estimated coordiante error variance
        mse = pd.DataFrame((error**2).mean(axis=0),columns=[s])
        error_variance_max = pd.DataFrame([error_variance.max()],index=[s])

        mse_sparsity = pd.concat((mse_sparsity,mse),axis=1)
        error_variance_sparsity = pd.concat((error_variance_sparsity,error_variance_max),axis=0)

    return mse_sparsity

def signal_reconstruction_regression(Psi:np.ndarray,locations_measured:np.ndarray,X_test:pd.DataFrame,X_test_measurements:pd.DataFrame=[],snapshots_matrix_train:np.ndarray=[],snapshots_matrix_test_centered:np.ndarray=[],projected_signal:bool=False,sample_covariance:bool=True)->pd.DataFrame:
    """
    Signal reconstyruction from reduced basis measurement.
    The basis Psi and the measurements are sampled at indices in locations_measured.
    Compute reconstruction error


    Args:
        Psi (np.ndarray): low-rank basis
        locations_measured (np.ndarray): indices of locations measured
        X_test (pd.DataFrame): testing dataset which is measured and used for error estimation
        X_test_measurements (pd.DataFrame): testing dataset measurements projected onto subspace spanned by Psi
        snapshots_matrix_train (np.ndarray): training set snapshots matrix used for computing average
        snapshots_matrix_val_centered (np.ndarray): testing set centered snapshots matrix used for signal reconstruction
        

    Returns:
        rmse (pd.DataFrame): mean reconstruction error between validation data set and reconstructed data
        error_max (pd.DataFrame): max reconstruction error when comparing validation data with reconstructed data
    """
    # basis measurement
    n_sensors_reconstruction = len(locations_measured)
    C = np.identity(Psi.shape[0])[locations_measured]
    Psi_measured = C@Psi
    # regression
    if projected_signal:
        beta_hat = np.linalg.pinv(Psi_measured)@X_test_measurements.iloc[:,locations_measured].T
        snapshots_matrix_predicted = Psi@beta_hat
    else:
        beta_hat = np.linalg.pinv(Psi_measured)@snapshots_matrix_test_centered[locations_measured,:]
        snapshots_matrix_predicted_centered = Psi@beta_hat
        snapshots_matrix_predicted = snapshots_matrix_predicted_centered + snapshots_matrix_train.mean(axis=1)[:,None]
    # compute prediction
    X_pred = pd.DataFrame(snapshots_matrix_predicted.T)
    X_pred.columns = X_test.columns
    X_pred.index = X_test.index
    # compute error metrics
    error = X_test - X_pred
    rmse = pd.DataFrame(np.sqrt(((error)**2).mean(axis=1)),columns=[n_sensors_reconstruction],index=X_test.index)
    error_variance = error.var(axis=0,ddof=0)
    """
    error_max = pd.DataFrame(np.abs(error).max(axis=1),columns=[n_sensors_reconstruction],index=X_test.index)
    error_var = np.zeros(shape = error.shape)
    for i in range(error.shape[0]):
        error_var[i,:] = np.diag(error.iloc[i,:].to_numpy()[:,None]@error.iloc[i,:].to_numpy()[:,None].T)
    error_var = pd.DataFrame(error_var,index=X_test.index,columns=X_test.columns)
    """
    return rmse, error_variance

def hourly_signal_reconstruction(Psi:np.ndarray,X_train:pd.DataFrame,X_val:pd.DataFrame,signal_sparsity:int=1,locations_measured:np.ndarray=[])->dict:
    """
    Compute reconstruction error at different times using low-rank basis
    Args:
        Psi (np.ndarray): monitored low-rank basis
        X_train (pd.DataFrame): training set measurements 
        X_val (pd.DataFrame): validation set measurements
        signal_sparsity (int): sparsity threshold
        locations_measured (np.ndarray): indices of monitored locations

    Returns:
        dict: rmse for multiple measurements at different times
    """
    hours_range = np.sort(X_train.index.hour.unique())
    rmse_time = {el:[] for el in hours_range}
    for h in hours_range:
        # get measurements at certain hour and rearrange as snapshots matrix
        X_train_hour = X_train.loc[X_train.index.hour == h]
        X_val_hour = X_val.loc[X_val.index.hour==h]
        snapshots_matrix_train_hour = X_train_hour.to_numpy().T
        snapshots_matrix_train_hour_centered = snapshots_matrix_train_hour - snapshots_matrix_train_hour.mean(axis=1)[:,None]
        snapshots_matrix_val_hour = X_val_hour.to_numpy().T
        snapshots_matrix_val_hour_centered = snapshots_matrix_val_hour - snapshots_matrix_val_hour.mean(axis=1)[:,None]
        if len(locations_measured) != 0:
            rmse_hour = signal_reconstruction_regression(Psi,locations_measured,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour)
        else:# not using sensor placement procedure. Use simple svd reconstruction
            rmse_hour = signal_reconstruction_svd(Psi,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour,[signal_sparsity])
        rmse_time[h] = rmse_hour
    return rmse_time

def networkPlanning_iterative(sensor_placement:sp.SensorPlacement,N:int,Psi:np.ndarray,deployed_network_variance_threshold:float,epsilon:float,h_prev:np.ndarray,weights:np.ndarray,n_it:int,locations_monitored:list=[],locations_unmonitored:list=[])->list:
    """
    IRL1 network planning algorithm
    Args:
        sensor_placement (sp.SensorPlacement): sensor placement object containing network information
        N (int): total number of network locations
        deployed_network_variance_threshold (float): error variance threshold for network design
        epsilon (float): IRL1 weights update constant
        h_prev (np.ndarray): network locations initialization
        weights (np.ndarray): IRL1 weights initialization
        n_it (int): IRL1 max iterations
        locations_monitored (list, optional): initialization of set of monitored lcoations. Defaults to [].
        locations_unmonitored (list, optional): initialization of set of unmonitored locaitons. Defaults to [].

    Returns:
        locations (list): indices of monitored and unmonitored locations [S,Sc]
    """
    # iterative method
    it = 0
    time_init = time.time()
    new_monitored = []
    new_unmonitored = []
    while len(locations_monitored) + len(locations_unmonitored) != N:
        # solve sensor placement with constraints
        
        sensor_placement.initialize_problem(Psi,rho=deployed_network_variance_threshold,
                                            w=weights,locations_monitored=locations_monitored,locations_unmonitored=locations_unmonitored)
        sensor_placement.solve()
        print(f'Problem status: {sensor_placement.problem.status}')
        if sensor_placement.problem.status == 'optimal':
            # update sets with new monitored locations
            new_monitored = [i[0] for i in np.argwhere(sensor_placement.h.value >= 1-epsilon) if i[0] not in locations_monitored]
            new_unmonitored = [i[0] for i in np.argwhere(sensor_placement.h.value <= epsilon) if i[0] not in locations_unmonitored]

            locations_monitored += new_monitored
            locations_unmonitored += new_unmonitored
            # check convergence
            if np.linalg.norm(sensor_placement.h.value - h_prev)<=epsilon or it==n_it:
                locations_monitored += [[i for i in np.argsort(sensor_placement.h.value)[::-1] if i not in locations_monitored][0]]
                it = 0
            h_prev = sensor_placement.h.value
            weights_old = weights.copy()
            weights = 1/(h_prev + epsilon)
            it +=1
        else:
            # solver fails at iteration
            #locations_monitored = locations_monitored[:-len(new_monitored)]
            if len(new_unmonitored) != 0:
                locations_unmonitored = locations_unmonitored[:-len(new_unmonitored)]
                weights = weights_old
            it+=1

        print(f'{len(locations_monitored)} Locations monitored: {locations_monitored}\n{len(locations_unmonitored)} Locations unmonitored: {locations_unmonitored}\n')
    time_end = time.time()
    locations = [locations_monitored,locations_unmonitored]
    print(f'IRL1 algorithm finished in {time_end-time_init:.2f}s.')
    return locations

#%% dataset
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.stations_types = pd.read_csv(f'{self.files_path}stations_types.csv',index_col=0)
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km
            
        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
        if station_center not in [i for i in self.coordinates_distances.columns]:
            raise ValueError(f'Station used for center is not present in dataset')

        self.distances = dataset.coordinates_distances.loc[station_center]
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
# figures
class Figures():
    def __init__(self,save_path,figx=2.5,figy=2.5,fs_title=10,fs_label=10,fs_ticks=10,fs_legend=10,marker_size=3,dpi=300,use_grid=False,show_plots=False):
        self.figx = figx
        self.figy = figy
        self.fs_title = fs_title
        self.fs_label = fs_label
        self.fs_ticks = fs_ticks
        self.fs_legend = fs_legend
        self.marker_size = marker_size
        self.dpi = dpi
        self.save_path = save_path
        if show_plots:
            self.backend = 'Qt5Agg'
        else:
            self.backend = 'Agg'
        
        print('Setting mpl rcparams')
        
        font = {'weight':'normal',
                'size':str(self.fs_label),
                }
        
        lines = {'markersize':self.marker_size}
        
        fig = {'figsize':[self.figx,self.figy],
               'dpi':self.dpi
               }
        
        ticks={'labelsize':self.fs_ticks
            }
        axes={'labelsize':self.fs_ticks,
              'grid':False,
              'titlesize':self.fs_title
            }
        if use_grid:
            grid = {'alpha':0.5}
            mpl.rc('grid',**grid)
        
        mathtext={'default':'regular'}
        legend = {'fontsize':self.fs_legend}
        
        mpl.rc('font',**font)
        mpl.rc('figure',**fig)
        mpl.rc('xtick',**ticks)
        mpl.rc('ytick',**ticks)
        mpl.rc('axes',**axes)
        mpl.rc('legend',**legend)
        mpl.rc('mathtext',**mathtext)
        mpl.rc('lines',**lines)        
        mpl.use(self.backend)

    def curve_timeseries_singlestation(self,X:pd.DataFrame,station_name:str,date_init:str='2020-01-20',date_end:str='2021-10-27'):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx,[station_name]]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.set_xlabel('date')
        ax.set_ylabel('Concentration ($\mu$g/$m^3$)')
        fig.tight_layout()

    def curve_timeseries_allstations(self,X:pd.DataFrame,date_init:str='2020-01-20',date_end:str='2021-10-27',save_fig=False):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.fill_between(x=data.index,y1=np.percentile(X,axis=1,q=25),y2=np.percentile(X,axis=1,q=75))
        ax.set_xlabel('date')
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+'timeseries_Allstations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    
    def curve_timeseries_dailypattern_singlestation(self,X:pd.DataFrame,station_name:str):
        X_ = X.loc[:,station_name].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()
    
    def curve_timeseries_dailypattern_multiplestations(self,X:pd.DataFrame,stations_locs:list=[0,1,2,3],save_fig:bool=False):
        stations_names = [i for i in X.columns[stations_locs]]
        colors = ['#1a5276','orange','#117864','#943126']
        X_ = X.iloc[:,stations_locs].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)

        
        fig = plt.figure()
        curves = {}
        for i in range(len(stations_locs)):
            ax = fig.add_subplot(221+i)
            curves[i] = ax.plot(data.iloc[:,i],label=stations_names[i],color=colors[i])
            ax.fill_between(x=data.index,y1=q1.iloc[:,i],y2=q3.iloc[:,i],alpha=0.5,color=colors[i])
            yrange = np.arange(0,110,10)
            ax.set_yticks(yrange)
            ax.set_yticklabels([i for i in ax.get_yticks()])    
            if (221+i)%2 == 1:
                ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
            ax.set_ylim(0,100)
            if i in [2,3]:
                ax.set_xlabel('hour')

        handles = [curves[i][0] for i in curves.keys()]
        fig.legend(handles=[i for i in handles],ncol=2,bbox_to_anchor=(0.95,1.15),framealpha=1)
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}Curve_TimeSeriesHourly_ManyStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved into {fname}')
        
    def curve_timeseries_dailypattern_allstations(self,X:pd.DataFrame):
        X_ = pd.DataFrame()
        for c in X.columns:
            X_ = pd.concat((X_,X.loc[:,c]),axis=0)
        X_ = X_.loc[:,0]
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()

    def boxplot_measurements(self,X,save_fig):
        n = X.shape[1]
        yrange = np.arange(0.0,300,50)
        xrange = np.arange(1,n+1,1)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=X,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        
        xrange = [i-1 for i in xrange if i%5==0]
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Location index')
        fig.tight_layout()
        if save_fig:
            fname = self.save_path+'boxplot_concentration_allStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    def geographical_network_visualization(self,map_path:str,df_coordinates:pd.DataFrame,locations_monitored:np.array=[],roi_idx:dict={},show_legend:bool=False,show_deployed_sensors:bool=True,save_fig:bool=False)->plt.figure:
        """
        Figure showing the geographical area where sensors are deployed along with coordinates of reference stations

        Args:
            map_path (str): path to map file
            df_coordinates (pd.DataFrame): dataframe containing coordiantes(Latitude,Longitude) of each reference station
            locations_monitored (np.array, optional): indices of monitored locations. Defaults to [].
            roi_idx (dict): dictionary indicating indices that belong to each region of interest (ROI) in case of heterogeneous design. The keys correspond to parameter used for separating ROIs.
            show_legend (bool, optional): Show legend indicating monitored and unmonitored locations. Defaults to False.
            save_fig (bool, optional): save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with map and stations 
        """
        
        if len(locations_monitored)!=0:
            df_coords_monitored = df_coordinates.iloc[locations_monitored]
            df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i not in locations_monitored]]
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
            gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)

        else:
            df_coords_monitored = df_coordinates.copy()
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
        
        spain = gpd.read_file(f'{map_path}ll_autonomicas_inspire_peninbal_etrs89.shp')
        catalonia = spain.loc[spain.NAME_BOUND.str.contains('Catalunya')]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        geo_map = catalonia.plot(ax=ax,color='#117a65')
        
        try:
            if len(roi_idx)!=0:
                markers = ['^','o','s','P','D']
                colors = ['k','#943126']
                if show_deployed_sensors:
                    print('Map showing monitored and unmonitored locations for each ROI')
                    for i,idx,m in zip(range(len(roi_idx)),roi_idx.values(),markers):
                        #locations_monitored_roi = np.array(locations_monitored)[np.isin(locations_monitored,idx)]
                        locations_monitored_roi = np.array([i for i in locations_monitored if i in idx])
                        locations_unmonitored_roi = np.array([i for i in range(df_coordinates.shape[0]) if i not in locations_monitored and i in idx])
                        print(f'locations monitored for ROI {i}: {len(locations_monitored_roi)}\nlocations unmonitored for ROI {i}: {len(locations_unmonitored_roi)}')
                        # monitored locations in ROI
                        df_coords_monitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_monitored_roi]]
                        geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=colors[1], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}$')
                        
                        # unmonitored locations in ROI
                        df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_unmonitored_roi]]
                        print(f'Shape of unmonitored dataframe coordinates: {df_coords_unmonitored.shape}')
                        geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
                        gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)
                        gdf_unmonitored.plot(ax=geo_map, marker=m, color=colors[0], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}^{{c}}$') 

                else: # show icons belonging to each ROI
                    for i,idx,m,c in zip(range(len(roi_idx)),roi_idx.values(),markers,colors):
                        
                        df_coords_idx = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in idx]]
                        geometry_idx = [Point(xy) for xy in zip(df_coords_idx['Longitude'], df_coords_idx['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_idx, geometry=geometry_idx)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=c, markersize=6,label=f'$\mathcal{{R}}_{i+1}$')
                
            else:
                gdf_monitored.plot(ax=geo_map, marker='o', color='#943126', markersize=6,label=f'Monitoring node')
                gdf_unmonitored.plot(ax=geo_map, marker='o', color='k', markersize=6,label=f'Unmonitored locations')
        except:
            warnings.warn('No unmonitored locations or unexpected error in dataframe')
        ax.set_xlim(0.0,4.0)
        ax.set_ylim(40.5,43)
        
        ax.set_ylabel('Latitude (degrees)')
        ax.set_xlabel('Longitude (degrees)')

        # set legend location
        if show_legend:
            if show_deployed_sensors:
                if len(roi_idx) == 2:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=5e-4,labelspacing=0.1,bbox_to_anchor=(0.73,0.1))
                elif len(roi_idx)==3:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=1e-6,labelspacing=0.05,bbox_to_anchor=(0.6,0.1))
            else:
                ax.legend(loc='lower right',ncol=1,framealpha=0.1,handletextpad=-0.1,columnspacing=0.5)
        ax.tick_params(axis='both', which='major')
        fig.tight_layout()
        
        # save generated figure
        if save_fig:
            if show_deployed_sensors:
                fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
            else:
                if len(roi_idx)!=0:
                    fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
                else:
                    fname = self.save_path+f'Map_PotentialLocations.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')
        return fig
        

    # Low-rank plots
    def singular_values_cumulative_energy(self,sing_vals,n,synthetic_dataset=False,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig1 = plt.figure()
        ax = fig1.add_subplot(111)
        ax.plot(xrange,cumulative_energy,color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()])
        ax.set_xlabel('$i$th singular value')
        
        #yrange = np.arange(0.5,1.05,0.05)
        yrange = np.arange(0.,1.2,0.2)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('Cumulative energy')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig1.tight_layout()
        
        fig2 = plt.figure()
        ax = fig2.add_subplot(111)
        ax.plot(xrange, sing_vals / np.max(sing_vals),color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')

        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig2.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_sparsity_cumulativeEnergy_N{n}.png'
            fig1.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')

            fname = self.save_path+f'Curve_sparsity_singularValues_N{n}.png'
            fig2.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')
    
    def singular_values_cumulative_energy_sameFigure(self,sing_vals,n,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy in the same figure

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig = plt.figure(constrained_layout=True)
        ax = fig.add_subplot(111)

        l1 = ax.plot(xrange, sing_vals / np.max(sing_vals),color='#ba4a00',marker='o',label='Normalized singular values')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')
        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')

        ax2 = ax.twinx()
        l2 = ax2.plot(xrange,cumulative_energy,color='#1f618d',marker='o',label='Cumulative energy')
        ax2.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax2.set_xticklabels([int(i+1) for i in ax2.get_xticks()])
        
        yrange = np.arange(0.,1.2,0.2)
        ax2.set_yticks(yrange)
        ax2.set_yticklabels([np.round(i,2) for i in ax2.get_yticks()])
        #ax2.set_ylabel('Cumulative energy')
        ax2.set_ylim(0,1)
        
        lines = l1+l2
        labels = [l.get_label() for l in lines]
        #ax.legend(lines,labels,loc='center',ncol=1,framealpha=1.,bbox_to_anchor=(0.5,1.15),handlelength=0.5,handletextpad=0.1)
        #fig.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_singVals_cumulativeEnergy_N{n}.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at: {fname}')


    def boxplot_validation_rmse_svd(self,rmse_sparsity,n,max_sparsity_show=10,synthetic_dataset=False,save_fig=False) -> plt.figure:
        yrange = np.arange(0.0,35,5)
        xrange = rmse_sparsity.columns[:max_sparsity_show]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=rmse_sparsity.iloc[:,:max_sparsity_show],notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        if synthetic_dataset:
            ax.set_yscale('log')
            ax.set_ylim(1e-2,1e1)
        else:
            ax.set_ylim(0,30)
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        xrange = np.array([i-1 for i in xrange if i%5==0])
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Sparsity level')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+f'boxplot_RMSE_SVDreconstruction_validationSet_Smin{xrange.min()}_Smax{xrange.max()}_N{n}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved in {fname}')
    
        return fig
    
    def boxplot_rmse_comparison(self,rmse_method1:pd.DataFrame,rmse_method2:pd.DataFrame,maxerror:bool=False,save_fig:bool=False)->plt.figure:
        """
        Boxplot comparing validation set RMSE using 2 different numbers of deployed senors.
        E.g: compare fully monitored vs reduced

        Args:
            rmse_method1 (pd.DataFrame): rmse for certain number of sensors
            rmse_method2 (pd.DataFrame): rmse for different number of sensors (for example fully monitored)
            maxerror (bool, optional): dataframes contain maximum reconstruction error instead of RMSE. Defaults to False.
            save_fig (bool, optional): Save generqated figure. Defaults to False.

        Returns:
            plt.figure: Figure
        """
        n_sensors_1 = rmse_method1.columns[0]
        n_sensors_2 = rmse_method2.columns[0]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp1 = ax.boxplot(x=rmse_method1,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[n_sensors_1],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        bp2 = ax.boxplot(x=rmse_method2,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[1],widths=0.5,labels=[n_sensors_2],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        bp1['boxes'][0].set_facecolor('lightgreen')
        bp2['boxes'][0].set_facecolor('#1a5276')
        
        if maxerror:
            yrange = np.arange(0.,55.,5)
            ax.set_ylim(0,50)
        else:
            yrange = np.arange(0.,22.,2)
            ax.set_ylim(0,20)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        if maxerror:
            ax.set_ylabel('Max error ($\mu$g/$m^3$)')        
        else:
            ax.set_ylabel('RMSE ($\mu$g/$m^3$)')        
        ax.set_xlabel('Number of deployed sensors')
        fig.tight_layout()

        if save_fig:
            if maxerror:
                fname = f'{self.save_path}Maxerrorcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            else:
                fname = f'{self.save_path}RMSEcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def boxplot_errorratio(self,df_error1:pd.DataFrame,df_error2:pd.DataFrame,save_fig:bool=False)->plt.figure:
        n_sensors1 = df_error1.columns[0]
        n_sensors2 = df_error2.columns[0]
        df_ratio = df_error1.to_numpy() / df_error2.to_numpy()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=df_ratio,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[f'{n_sensors1} sensors vs {n_sensors2} senors'],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        
        bp['boxes'][0].set_facecolor('#1a5276')
        
        yrange = np.arange(0.,3.5,0.5)
        ax.set_ylim(0,3)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        ax.set_ylabel('Reconstruction errors ratio')        
        ax.set_xlabel('')
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}ErrorRatio_NsensorsTotal_N1{n_sensors1}_N2{n_sensors2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def hist_worsterror(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_fullymonitored,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='#1a5276',label='Fully monitored network')
        ax.vlines(x=errormax_fullymonitored.mean(),ymin=0.0,ymax=1.0,colors='#1a5276',linestyles='--')
        ax.hist(x=errormax_reconstruction,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='orange',label=f'Reconstruction with {n_sensors} sensors',alpha=0.5)
        ax.vlines(x=errormax_reconstruction.mean(),ymin=0.0,ymax=1.0,colors='orange',linestyles='--')
        ax.set_xlabel('Maximum reconstruction error')
        ax.set_ylabel('Probability density')
        ax.legend(loc='upper left',ncol=1,framealpha=0.5)
        ax.set_xlim(0,5)
        ax.set_ylim(0,1)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_error_fullymonitored_vs_reconstruction_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')

    def hist_errorratio(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_reconstruction.to_numpy()/errormax_fullymonitored.to_numpy(),bins=np.arange(0,3.1,0.1),density=True,cumulative=False,color='#1a5276')
        ax.set_xlabel('Maximum error ratio')
        ax.set_ylabel('Probability density')
        ax.set_xlim(0,3)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_errorRatio_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')
    
    def curve_errorvariance_comparison(self,errorvar_fullymonitored:list,errorvar_reconstruction:list,variance_threshold_ratio:float,worst_coordinate_variance_fullymonitored:float,n:int,n_sensors:int,errorvar_reconstruction_Dopt:list=[],roi_idx:dict={},n_sensors_Dopt:int=0,method:str='random_based',random_seed:int=0,save_fig:bool=False) -> plt.figure:
        """
        Show error variance over a testing set at each network location. 
        The error variance is obtained after reconstructing the signal from p measurements.
        The p measurement locations are obtained from network design algorithm or D-optimality criteria.
        It also shows the threshold line which the network design algorithm used.
        Another algorithm can be shown for comparison.

        Args:
            errorvar_fullymonitored (list): error variance at each network location obtained with a fully monitored network. This corresponds to the lowest error variance possible.
            errorvar_reconstruction (list): error variance at each network locations obtained with a network with a reduced number of deployed sensors.
            variance_threshold_ratio (float): variance threshold ratio used for design algorithm. It is a multiple of the worst_coordinate_variance_fullymonitored.
            worst_coordinate_variance_fullymonitored (float): fully-monitored network worst coordinate error variance
            n (int): total number of network points
            n_sensors (int): number of deployed sensors
            errorvar_reconstruction_Dopt (list): error variance at each network location obtained by D-optimality (or other) criteria. Defaults to [].
            roi_idx (dict): dictionary containing indices of locations that belong to each ROI. The keys indicate the threshold used to separate the network.
            save_fig (bool, optional): Save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with error variance curves
        """
        if type(variance_threshold_ratio) is float:
            variance_threshold = variance_threshold_ratio*worst_coordinate_variance_fullymonitored
        
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(errorvar_fullymonitored,color='#1d8348',label='Fully monitored network')
            if len(errorvar_reconstruction_Dopt) !=0:
                ax.plot(errorvar_reconstruction_Dopt,color='orange',label=f'Joshi-Boyd solution',alpha=0.8)
            ax.plot(errorvar_reconstruction,color='#1a5276',label=f'Network design solution')
            ax.hlines(y=variance_threshold,xmin=0,xmax=n+1,color='k',linestyles='--',label=rf'Design threshold $\rho$={variance_threshold_ratio:.2f}$\rho_n$')
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(0,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,1.75,0.25)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,1.5)
            ax.set_ylabel('Error variance')
            ax.legend(loc='center',ncol=2,framealpha=0.5,bbox_to_anchor=(0.5,1.1))
            fig.tight_layout()
            if save_fig:
                fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio:.2f}_Nsensors{n_sensors}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


        else: # heterogeneous thresholds over multiple ROIs
            variance_threshold = [t*w for t,w in zip(variance_threshold_ratio,worst_coordinate_variance_fullymonitored)]
            # sort coordinate error variance such that the ROIs are shown in order
            coordinate_error_variance_fully_monitored_sorted = np.concatenate([errorvar_fullymonitored[i] for i in roi_idx.values()])
            coordinate_error_variance_design_sorted = np.concatenate([errorvar_reconstruction[i] for i in roi_idx.values()])

            fig = plt.figure(constrained_layout=True)
            ax = fig.add_subplot(111)
            # coordinate error variance at each location
            ax.plot(coordinate_error_variance_fully_monitored_sorted,color='#943126',label='Fully monitored case')
            # horizontal lines showing threshold design
            n_roi = np.concatenate([[0],[len(i) for i in roi_idx.values()]])
            n_roi_cumsum = np.cumsum(n_roi)
            for v,l in zip(variance_threshold,range(len(n_roi_cumsum))):
                if l==0:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l]-1,xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--',label='Design threshold')
                else:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l],xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--')
            
            # Joshi Boyd and IRNet results
            if len(errorvar_reconstruction_Dopt) !=0:
                coordinate_error_variance_Dopt_sorted = np.concatenate([errorvar_reconstruction_Dopt[i] for i in roi_idx.values()])
                ax.plot(coordinate_error_variance_Dopt_sorted,color='orange',label=f'JB {n_sensors_Dopt} sensors',alpha=0.8)
            ax.plot(coordinate_error_variance_design_sorted,color='#1a5276',label=f'IRWNet {n_sensors} sensors')
            
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(-0.5,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,3.5,0.5)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,3.0+0.1)
            ax.set_ylabel('Per-coordinate error variance')
            ax.legend(loc='center',ncol=2,framealpha=1,
                      handlelength=0.5,handletextpad=0.1,columnspacing=0.2,
                      bbox_to_anchor=(0.5,0.88))
            #fig.tight_layout()
            if save_fig:
                #fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_NsensorsROIDopt_{n_sensors_roi}.png'
                if method == 'random_based':
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_randomSeed{random_seed}.png'
                else:
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


    def curve_rmse_hourly(self,rmse_time,month=0,save_fig=False):
        hours = [i for i in rmse_time.keys()]
        median = [rmse_time[i].median().to_numpy()[0] for i in hours]
        q1,q3 = [rmse_time[i].quantile(q=0.25).to_numpy()[0] for i in hours], [rmse_time[i].quantile(q=0.75).to_numpy()[0] for i in hours]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(median,color='#1a5276')
        ax.fill_between(x=hours,y1=q1,y2=q3,color='#1a5276',alpha=0.5)
        ax.set_xticks(hours[::4])
        ax.set_xticklabels([i for i in ax.get_xticks()])
        ax.set_xlabel('Hour')
        yrange = np.arange(0,12.,2.)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        ax.set_ylim(yrange[0],yrange[-1])
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}deploy_sensors_hourly_month{month}.png'
            fig.savefig(fname,dpi=300,format='png')
        return fig

In [None]:
pollutant = 'O3'
start_date = '2011-01-01'
end_date = '2022-12-31'
N=48
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 17 17:21:04 2023

@author: jparedes
"""
import os
import time
import pandas as pd
import geopy.distance
from sklearn.model_selection import train_test_split
from abc import ABC,abstractmethod
import numpy as np
import sys
import warnings
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame

import sensor_placement as sp


""" Obtain signal sparsity and reconstruct signal at different temporal regimes"""

# perturbate measurements
def add_noise_signal(X:pd.DataFrame,seed:int=92,var:float=1.)->pd.DataFrame:
    """
    Add noise to measurements dataset. The noise ~N(0,var).
    The noise is the same for all sensors during all the time.

    Args:
        X (pd.DataFrame): dataset with measurements
        seed (int): random number generator seed
        var (float): noise variance

    Returns:
        pd.DataFrame: _description_
    """
    rng = np.random.default_rng(seed=seed)
    noise = rng.normal(loc=0.0,scale=var,size=X.shape)
    X_noisy = X + noise
    #X_noisy[X_noisy<0] = 0.
    return X_noisy

# ROI classes
class roi_generator(ABC):
    @abstractmethod
    def generate_rois(self,**kwargs):
        raise NotImplementedError
    
class RandomRoi(roi_generator):
    """ Regions of Interest randomly generated from rng seed"""
    def generate_rois(self,**kwargs)->dict:
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions = kwargs['n_regions']
        rng = np.random.default_rng(seed=seed)    
        indices = np.arange(0,n,1)
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions)}
        indices_split = np.array_split(indices_perm,n_regions)
        for i in np.arange(n_regions):
            roi_idx[i] = indices_split[i]
        return roi_idx
    
class SubSplitRandomRoi(roi_generator):
    """
    Regions of Interest randomly generated. 
    The indices are randomly generated and then some of them are splitted into new sub regions.
    """
    def generate_rois(self,**kwargs):
        seed = kwargs['seed']
        n = kwargs['n']
        n_regions_original = kwargs['n_regions_original']
        rois_split = kwargs['rois_split']
        n_regions_subsplit = kwargs['n_regions_subsplit']
        seed_subsplit = kwargs['seed_subsplit']
        rng = np.random.default_rng(seed=seed)
        indices = np.arange(0,n,1)
        # first split. Original ROIs
        indices_perm = rng.permutation(indices)
        roi_idx = {el:[] for el in np.arange(n_regions_original)}
        indices_split = np.array_split(indices_perm,n_regions_original)
        for i in np.arange(n_regions_original):
            roi_idx[i] = indices_split[i]
        # second split. Maintain some ROIs and split others
        new_roi_idx = {}
        rng_subsplit = np.random.default_rng(seed=seed_subsplit)
        for i in roi_idx:
            if i in rois_split:
                indices_roi = roi_idx[i]
                indices_roi_perm = rng_subsplit.permutation(indices_roi)
                indices_roi_split = np.array_split(indices_roi_perm,n_regions_subsplit)
                new_dict = {}
                for j in np.arange(n_regions_subsplit):
                    new_dict[float(f'{i}.{j+1}')] = indices_roi_split[j]
                new_roi_idx.update(new_dict)
            else:
                new_roi_idx[i] = roi_idx[i]
            
        return new_roi_idx
            
    
class VarianceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        coordinate_error_variance_fullymonitored = kwargs['coordinate_error_variance_fullymonitored']
        variance_thresholds = kwargs['variance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {variance_thresholds}')
        if type(variance_thresholds) is not list:
            variance_thresholds = [variance_thresholds]
        if len(variance_thresholds) != n_regions:
            raise ValueError(f'Number of variance thresholds: {variance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in variance_thresholds}
        for i in range(len(variance_thresholds[:-1])):
            print(f'Variance threshold between {variance_thresholds[i]} and {variance_thresholds[i+1]}')
            stations = [j for j in coordinate_error_variance_fullymonitored[np.logical_and(coordinate_error_variance_fullymonitored>=variance_thresholds[i],coordinate_error_variance_fullymonitored<variance_thresholds[i+1])]]
            print(f'{len(stations)} stations')
            idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
            roi_idx[variance_thresholds[i]] = idx_stations
        stations = [j for j in coordinate_error_variance_fullymonitored[coordinate_error_variance_fullymonitored>=variance_thresholds[-1]]]
        print(f'{len(stations)} stations with a distance larger than {variance_thresholds[-1]}')
        idx_stations = np.where(np.isin(coordinate_error_variance_fullymonitored,stations))[0]
        roi_idx[variance_thresholds[-1]] = idx_stations
        return roi_idx
    
class DistanceRoi(roi_generator):
    def generate_rois(self,**kwargs)->dict:
        """
        Generates Regions of Interest (ROIs) based on distance from certain station

        Args:        
            distances (pd.Series): distance of each location from origin station
            distance_thresholds (list): thresholds for each ROI
            n_regions (int): number of ROIs

        Raises:
            ValueError: Check if number of specified distance thresholds matches number of ROIs

        Returns:
            dict: Indices of each ROI. Key specifies the distance threshold
        """
        distances = kwargs['distances']
        distance_thresholds = kwargs['distance_thresholds']
        n_regions = kwargs['n_regions']
        print(f'Determining indices that belong to each ROI. {n_regions} regions with thresholds: {distance_thresholds}')
        if type(distance_thresholds) is not list:
            distance_thresholds = [distance_thresholds]
        if len(distance_thresholds) != n_regions:
            raise ValueError(f'Number of distance thresholds: {distance_thresholds} mismatch specified number of regions: {n_regions}')
        roi_idx = {el:[] for el in distance_thresholds}
        #distance_thresholds = np.insert(distance_thresholds,0,0)
        for i in range(len(distance_thresholds[:-1])):
            print(f'Distance threshold between {distance_thresholds[i]} and {distance_thresholds[i+1]}')
            stations = [j for j in distances[np.logical_and(distances>=distance_thresholds[i],distances<distance_thresholds[i+1])].index]
            print(f'Stations ({len(stations)}): {stations}')
            idx_stations = np.where(np.isin(distances.index,stations))[0]
            roi_idx[distance_thresholds[i]] = idx_stations
        stations = [j for j in distances[distances>=distance_thresholds[-1]].index]
        print(f'Stations with a distance larger than {distance_thresholds[-1]} ({len(stations)}): {stations}')
        idx_stations = np.where(np.isin(distances.index,stations))[0]
        roi_idx[distance_thresholds[-1]] = idx_stations
        
        return roi_idx


class ROI():
    """
    Region of interest (ROI) class. Select a generator from different roigenerator classes.
    Use as:
        roi = ROI(generator())
        roi.deine_ROIs(**kwargs)
    """
    def __init__(self,generator):
        self._generator = generator
    def define_rois(self,**kwargs)->dict:
        self.roi_idx = self._generator.generate_rois(**kwargs)

# file writer classes
class FileWriter(ABC):
    @abstractmethod
    def save(self,**kwargs):
        raise NotImplementedError

class WriteRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['random_seed']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class WriteSplitRandomFile(FileWriter):
    def save(self,results_path,locations,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_locations_monitored = kwargs['n_locations_monitored']
        random_seed = kwargs['seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        
        fname = f'{results_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_locations_monitored}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'wb') as f:
            pickle.dump(locations,f,protocol=pickle.HIGHEST_PROTOCOL)
        print(f'File saved in {fname}')

class SaveLocations():
    def __init__(self,writer):
        self._writer = writer
    def save_locations(self,results_path,locations,**kwargs):
        self._writer.save(results_path,locations,**kwargs)

# file reader class
class FileReader(ABC):
    @abstractmethod
    def load(self,**kwargs):
        raise NotImplementedError

class ReadRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['signal_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
class ReadSplitRandomFile(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors = kwargs['n_sensors']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']

        fname = f'{file_path}SensorsLocations_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        random_seed = kwargs['random_seed']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}.pkl'
        with open(fname,'rb') as f:
            locations_monitored = np.sort(pickle.load(f))
        return locations_monitored
    
class ReadSplitRandomFileBoyd(FileReader):
    def load(self,file_path,**kwargs):
        n = kwargs['n']
        signal_sparsity = kwargs['signal_sparsity']
        variance_threshold_ratio = kwargs['variance_threshold_ratio']
        n_sensors_Dopt = kwargs['n_sensors_Dopt']
        random_seed = kwargs['random_seed']
        seed_subsplit = kwargs['seed_subsplit']
        rois_split = kwargs['rois_split']
        fname = f'{file_path}SensorsLocations_Boyd_N{n}_S{signal_sparsity}_VarThreshold{variance_threshold_ratio}_nSensors{n_sensors_Dopt}_randomSeed{random_seed}_split{rois_split}_subsplitSeed{seed_subsplit}.pkl'
        try:
            with open(fname,'rb') as f:
                locations_monitored = np.sort(pickle.load(f))
            print(f'Loaded file {fname}')
        except:
            warnings.warn(f'No file {fname}')
            return 
        return locations_monitored
    
class ReadLocations():
    def __init__(self,reader):
        self._reader = reader
    def load_locations(self,file_path,**kwargs):
        locations_monitored = self._reader.load(file_path,**kwargs)
        return locations_monitored


# signal reconstruction functions
def singular_value_hard_threshold(snapshots_matrix:np.ndarray,sing_vals:np.array,noise:float=-1)->float:
    """
    Compute singular value hard threshold from Gavish-Donoho approximation

    Args:
        snapshots_matrix (np.ndarray): snapshots matrix used for computing SVD
        sing_vals (np.array): corresponding array of singular values
        noise (float,optional): noise () deviation from signal

    Returns:
        float: cut-off index
    """
    beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]
    if noise == -1:#unknown noise
        c1,c2,c3,c4 = 0.56,0.95,1.82,1.43
        omega = c1*beta**3 - c2*beta**2 + c3*beta + c4
        sing_val_threshold = omega*np.median(sing_vals)
        
    else:#known noise
        t1 = 2*(beta+1)
        t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
        lambda_beta = np.sqrt(t1+t2)
        sing_val_threshold = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))
    
    sparsity_gd = np.argwhere(sing_vals>=sing_val_threshold)[-1][0] + 1
    return sparsity_gd
def signal_reconstruction_svd(U:np.ndarray,snapshots_matrix:np.ndarray,s_range:np.ndarray) -> pd.DataFrame:
    """
    Decompose signal keeping s-first singular vectors using training set data
    and reconstruct validation set.

    Args:
        U (numpy array): left singular vectors matrix
        snapshots_matrix (numpy array): snaphots matrix data.
        s_range (numpy array): list of sparsity values to test

    Returns:
        rmse_sparsity: dataframe containing reconstruction errors at different times for each sparsity threshold in the range
    """
    print(f'Determining signal sparsity by decomposing training set and reconstructing validation set.\nRange of sparsity levels: {s_range}')
    mse_sparsity = pd.DataFrame()
    error_variance_sparsity = pd.DataFrame()
    for s in s_range:
        # projection
        Psi = U[:,:s]
        #snapshots_matrix_pred_svd = (Psi@Psi.T@snapshots_matrix_centered) + snapshots_matrix_train.mean(axis=1)[:,None]
        snapshots_matrix_pred_svd = Psi@Psi.T@snapshots_matrix
        
        #RMSE across different signal measurements
        # estimated covariance
        error = snapshots_matrix - snapshots_matrix_pred_svd
        error_variance = error.var(axis=1,ddof=0)# estimated coordiante error variance
        mse = pd.DataFrame((error**2).mean(axis=0),columns=[s])
        error_variance_max = pd.DataFrame([error_variance.max()],index=[s])

        mse_sparsity = pd.concat((mse_sparsity,mse),axis=1)
        error_variance_sparsity = pd.concat((error_variance_sparsity,error_variance_max),axis=0)

    return mse_sparsity,error_variance_sparsity

def signal_reconstruction_regression(Psi:np.ndarray,locations_measured:np.ndarray,X_test:pd.DataFrame,X_test_measurements:pd.DataFrame=[],snapshots_matrix_train:np.ndarray=[],snapshots_matrix_test_centered:np.ndarray=[],projected_signal:bool=False,sample_covariance:bool=True)->pd.DataFrame:
    """
    Signal reconstyruction from reduced basis measurement.
    The basis Psi and the measurements are sampled at indices in locations_measured.
    Compute reconstruction error


    Args:
        Psi (np.ndarray): low-rank basis
        locations_measured (np.ndarray): indices of locations measured
        X_test (pd.DataFrame): testing dataset which is measured and used for error estimation
        X_test_measurements (pd.DataFrame): testing dataset measurements projected onto subspace spanned by Psi
        snapshots_matrix_train (np.ndarray): training set snapshots matrix used for computing average
        snapshots_matrix_val_centered (np.ndarray): testing set centered snapshots matrix used for signal reconstruction
        

    Returns:
        rmse (pd.DataFrame): mean reconstruction error between validation data set and reconstructed data
        error_max (pd.DataFrame): max reconstruction error when comparing validation data with reconstructed data
    """
    # basis measurement
    n_sensors_reconstruction = len(locations_measured)
    C = np.identity(Psi.shape[0])[locations_measured]
    Psi_measured = C@Psi
    # regression
    if projected_signal:
        beta_hat = np.linalg.pinv(Psi_measured)@X_test_measurements.iloc[:,locations_measured].T
        snapshots_matrix_predicted = Psi@beta_hat
    else:
        beta_hat = np.linalg.pinv(Psi_measured)@snapshots_matrix_test_centered[locations_measured,:]
        snapshots_matrix_predicted_centered = Psi@beta_hat
        snapshots_matrix_predicted = snapshots_matrix_predicted_centered + snapshots_matrix_train.mean(axis=1)[:,None]
    # compute prediction
    X_pred = pd.DataFrame(snapshots_matrix_predicted.T)
    X_pred.columns = X_test.columns
    X_pred.index = X_test.index
    # compute error metrics
    error = X_test - X_pred
    rmse = pd.DataFrame(np.sqrt(((error)**2).mean(axis=1)),columns=[n_sensors_reconstruction],index=X_test.index)
    error_variance = error.var(axis=0,ddof=0)
    """
    error_max = pd.DataFrame(np.abs(error).max(axis=1),columns=[n_sensors_reconstruction],index=X_test.index)
    error_var = np.zeros(shape = error.shape)
    for i in range(error.shape[0]):
        error_var[i,:] = np.diag(error.iloc[i,:].to_numpy()[:,None]@error.iloc[i,:].to_numpy()[:,None].T)
    error_var = pd.DataFrame(error_var,index=X_test.index,columns=X_test.columns)
    """
    return rmse, error_variance

def hourly_signal_reconstruction(Psi:np.ndarray,X_train:pd.DataFrame,X_val:pd.DataFrame,signal_sparsity:int=1,locations_measured:np.ndarray=[])->dict:
    """
    Compute reconstruction error at different times using low-rank basis
    Args:
        Psi (np.ndarray): monitored low-rank basis
        X_train (pd.DataFrame): training set measurements 
        X_val (pd.DataFrame): validation set measurements
        signal_sparsity (int): sparsity threshold
        locations_measured (np.ndarray): indices of monitored locations

    Returns:
        dict: rmse for multiple measurements at different times
    """
    hours_range = np.sort(X_train.index.hour.unique())
    rmse_time = {el:[] for el in hours_range}
    for h in hours_range:
        # get measurements at certain hour and rearrange as snapshots matrix
        X_train_hour = X_train.loc[X_train.index.hour == h]
        X_val_hour = X_val.loc[X_val.index.hour==h]
        snapshots_matrix_train_hour = X_train_hour.to_numpy().T
        snapshots_matrix_train_hour_centered = snapshots_matrix_train_hour - snapshots_matrix_train_hour.mean(axis=1)[:,None]
        snapshots_matrix_val_hour = X_val_hour.to_numpy().T
        snapshots_matrix_val_hour_centered = snapshots_matrix_val_hour - snapshots_matrix_val_hour.mean(axis=1)[:,None]
        if len(locations_measured) != 0:
            rmse_hour = signal_reconstruction_regression(Psi,locations_measured,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour)
        else:# not using sensor placement procedure. Use simple svd reconstruction
            rmse_hour = signal_reconstruction_svd(Psi,snapshots_matrix_train_hour,snapshots_matrix_val_hour_centered,X_val_hour,[signal_sparsity])
        rmse_time[h] = rmse_hour
    return rmse_time

def networkPlanning_iterative(sensor_placement:sp.SensorPlacement,N:int,Psi:np.ndarray,deployed_network_variance_threshold:float,epsilon:float,h_prev:np.ndarray,weights:np.ndarray,n_it:int,locations_monitored:list=[],locations_unmonitored:list=[])->list:
    """
    IRL1 network planning algorithm
    Args:
        sensor_placement (sp.SensorPlacement): sensor placement object containing network information
        N (int): total number of network locations
        deployed_network_variance_threshold (float): error variance threshold for network design
        epsilon (float): IRL1 weights update constant
        h_prev (np.ndarray): network locations initialization
        weights (np.ndarray): IRL1 weights initialization
        n_it (int): IRL1 max iterations
        locations_monitored (list, optional): initialization of set of monitored lcoations. Defaults to [].
        locations_unmonitored (list, optional): initialization of set of unmonitored locaitons. Defaults to [].

    Returns:
        locations (list): indices of monitored and unmonitored locations [S,Sc]
    """
    # iterative method
    it = 0
    time_init = time.time()
    new_monitored = []
    new_unmonitored = []
    while len(locations_monitored) + len(locations_unmonitored) != N:
        # solve sensor placement with constraints
        
        sensor_placement.initialize_problem(Psi,rho=deployed_network_variance_threshold,
                                            w=weights,locations_monitored=locations_monitored,locations_unmonitored=locations_unmonitored)
        sensor_placement.solve()
        print(f'Problem status: {sensor_placement.problem.status}')
        if sensor_placement.problem.status == 'optimal':
            # update sets with new monitored locations
            new_monitored = [i[0] for i in np.argwhere(sensor_placement.h.value >= 1-epsilon) if i[0] not in locations_monitored]
            new_unmonitored = [i[0] for i in np.argwhere(sensor_placement.h.value <= epsilon) if i[0] not in locations_unmonitored]

            locations_monitored += new_monitored
            locations_unmonitored += new_unmonitored
            # check convergence
            if np.linalg.norm(sensor_placement.h.value - h_prev)<=epsilon or it==n_it:
                locations_monitored += [[i for i in np.argsort(sensor_placement.h.value)[::-1] if i not in locations_monitored][0]]
                it = 0
            h_prev = sensor_placement.h.value
            weights_old = weights.copy()
            weights = 1/(h_prev + epsilon)
            it +=1
        else:
            # solver fails at iteration
            #locations_monitored = locations_monitored[:-len(new_monitored)]
            if len(new_unmonitored) != 0:
                locations_unmonitored = locations_unmonitored[:-len(new_unmonitored)]
                weights = weights_old
            it+=1

        print(f'{len(locations_monitored)} Locations monitored: {locations_monitored}\n{len(locations_unmonitored)} Locations unmonitored: {locations_unmonitored}\n')
    time_end = time.time()
    locations = [locations_monitored,locations_unmonitored]
    print(f'IRL1 algorithm finished in {time_end-time_init:.2f}s.')
    return locations

#%% dataset
class Dataset():
    def __init__(self,pollutant:str='O3',N:int=44,start_date:str='2011-01-01',end_date:str='2022-12-31',files_path:str='',synthetic_dataset:bool=False):
        self.pollutant = pollutant
        self.N = N
        self.start_date = start_date
        self.end_date = end_date
        self.files_path = files_path
        self.synthetic_dataset = synthetic_dataset
    
    def load_dataset(self):
        if self.synthetic_dataset:
            fname = f'{self.files_path}SyntheticData_{self.start_date}_{self.end_date}.csv'
        else:
            fname = f'{self.files_path}{self.pollutant}_catalonia_clean_N{self.N}_{self.start_date}_{self.end_date}.csv'
            self.stations_types = pd.read_csv(f'{self.files_path}stations_types.csv',index_col=0)
            self.coordinates = pd.read_csv(f'{self.files_path}coordinates.csv',index_col=0)
            self.coordinates_distances = pd.DataFrame([],index=self.coordinates.index,columns=self.coordinates.index)
            for i in range(self.coordinates.shape[0]):
                for j in range(self.coordinates.shape[0]):
                    self.coordinates_distances.iloc[i,j] = geopy.distance.geodesic(self.coordinates.iloc[i,:],self.coordinates.iloc[j,:]).km
            
        print(f'Loading dataset from {fname}')
        self.ds = pd.read_csv(fname,sep=',',index_col=0)
        self.ds.index = pd.to_datetime(self.ds.index)
        

    def check_dataset(self):
        print(f'Checking missing values in dataset')
        print(f'Percentage of missing values per location:\n{100*self.ds.isna().sum()/self.ds.shape[0]}')
        print(f'Dataset has {self.ds.shape[0]} measurements for {self.ds.shape[1]} locations.\n{self.ds.head()}')

    def sort_stations(self,station_center='Ciutadella'):
        """ Sort order of stations based on distance to one of them"""
        if station_center not in [i for i in self.coordinates_distances.columns]:
            raise ValueError(f'Station used for center is not present in dataset')

        self.distances = dataset.coordinates_distances.loc[station_center]
        self.distances.sort_values(ascending=True,inplace=True)
        self.ds = self.ds.loc[:,[f'O3_{i}' for i in self.distances.index if f'O3_{i}' in self.ds.columns]]
        print(f'Order of dataset locations: {self.ds.columns}')

In [None]:
# figures
class Figures():
    def __init__(self,save_path,figx=2.5,figy=2.5,fs_title=10,fs_label=10,fs_ticks=10,fs_legend=10,marker_size=3,dpi=300,use_grid=False,show_plots=False):
        self.figx = figx
        self.figy = figy
        self.fs_title = fs_title
        self.fs_label = fs_label
        self.fs_ticks = fs_ticks
        self.fs_legend = fs_legend
        self.marker_size = marker_size
        self.dpi = dpi
        self.save_path = save_path
        if show_plots:
            self.backend = 'Qt5Agg'
        else:
            self.backend = 'Agg'
        
        print('Setting mpl rcparams')
        
        font = {'weight':'normal',
                'size':str(self.fs_label),
                }
        
        lines = {'markersize':self.marker_size}
        
        fig = {'figsize':[self.figx,self.figy],
               'dpi':self.dpi
               }
        
        ticks={'labelsize':self.fs_ticks
            }
        axes={'labelsize':self.fs_ticks,
              'grid':False,
              'titlesize':self.fs_title
            }
        if use_grid:
            grid = {'alpha':0.5}
            mpl.rc('grid',**grid)
        
        mathtext={'default':'regular'}
        legend = {'fontsize':self.fs_legend}
        
        mpl.rc('font',**font)
        mpl.rc('figure',**fig)
        mpl.rc('xtick',**ticks)
        mpl.rc('ytick',**ticks)
        mpl.rc('axes',**axes)
        mpl.rc('legend',**legend)
        mpl.rc('mathtext',**mathtext)
        mpl.rc('lines',**lines)        
        mpl.use(self.backend)

    def curve_timeseries_singlestation(self,X:pd.DataFrame,station_name:str,date_init:str='2020-01-20',date_end:str='2021-10-27'):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx,[station_name]]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.set_xlabel('date')
        ax.set_ylabel('Concentration ($\mu$g/$m^3$)')
        fig.tight_layout()

    def curve_timeseries_allstations(self,X:pd.DataFrame,date_init:str='2020-01-20',date_end:str='2021-10-27',save_fig=False):
        date_range = pd.date_range(start=date_init,end=date_end,freq='H')
        date_idx = [i for i in date_range if i in X.index]
        data = X.loc[date_idx]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.fill_between(x=data.index,y1=np.percentile(X,axis=1,q=25),y2=np.percentile(X,axis=1,q=75))
        ax.set_xlabel('date')
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+'timeseries_Allstations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    
    def curve_timeseries_dailypattern_singlestation(self,X:pd.DataFrame,station_name:str):
        X_ = X.loc[:,station_name].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()
    
    def curve_timeseries_dailypattern_multiplestations(self,X:pd.DataFrame,stations_locs:list=[0,1,2,3],save_fig:bool=False):
        stations_names = [i for i in X.columns[stations_locs]]
        colors = ['#1a5276','orange','#117864','#943126']
        X_ = X.iloc[:,stations_locs].copy()
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)

        
        fig = plt.figure()
        curves = {}
        for i in range(len(stations_locs)):
            ax = fig.add_subplot(221+i)
            curves[i] = ax.plot(data.iloc[:,i],label=stations_names[i],color=colors[i])
            ax.fill_between(x=data.index,y1=q1.iloc[:,i],y2=q3.iloc[:,i],alpha=0.5,color=colors[i])
            yrange = np.arange(0,110,10)
            ax.set_yticks(yrange)
            ax.set_yticklabels([i for i in ax.get_yticks()])    
            if (221+i)%2 == 1:
                ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
            ax.set_ylim(0,100)
            if i in [2,3]:
                ax.set_xlabel('hour')

        handles = [curves[i][0] for i in curves.keys()]
        fig.legend(handles=[i for i in handles],ncol=2,bbox_to_anchor=(0.95,1.15),framealpha=1)
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}Curve_TimeSeriesHourly_ManyStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved into {fname}')
        
    def curve_timeseries_dailypattern_allstations(self,X:pd.DataFrame):
        X_ = pd.DataFrame()
        for c in X.columns:
            X_ = pd.concat((X_,X.loc[:,c]),axis=0)
        X_ = X_.loc[:,0]
        data = X_.groupby(X_.index.hour).median()
        q1,q3 = X_.groupby(X_.index.hour).quantile(q=0.25),X_.groupby(X_.index.hour).quantile(q=0.75)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(data)
        ax.fill_between(x=data.index,y1=q1,y2=q3,alpha=0.5)
        ax.set_xlabel('hour')
        yrange = np.arange(0,110,10)
        ax.set_yticks(yrange)
        ax.set_yticklabels([i for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        ax.set_ylim(0,100)
        fig.tight_layout()

    def boxplot_measurements(self,X,save_fig):
        n = X.shape[1]
        yrange = np.arange(0.0,300,50)
        xrange = np.arange(1,n+1,1)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=X,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('O$_3$ ($\mu$g/$m^3$)')
        
        xrange = [i-1 for i in xrange if i%5==0]
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Location index')
        fig.tight_layout()
        if save_fig:
            fname = self.save_path+'boxplot_concentration_allStations.png'
            fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')

    def geographical_network_visualization(self,map_path:str,df_coordinates:pd.DataFrame,locations_monitored:np.array=[],roi_idx:dict={},show_legend:bool=False,show_deployed_sensors:bool=True,save_fig:bool=False)->plt.figure:
        """
        Figure showing the geographical area where sensors are deployed along with coordinates of reference stations

        Args:
            map_path (str): path to map file
            df_coordinates (pd.DataFrame): dataframe containing coordiantes(Latitude,Longitude) of each reference station
            locations_monitored (np.array, optional): indices of monitored locations. Defaults to [].
            roi_idx (dict): dictionary indicating indices that belong to each region of interest (ROI) in case of heterogeneous design. The keys correspond to parameter used for separating ROIs.
            show_legend (bool, optional): Show legend indicating monitored and unmonitored locations. Defaults to False.
            save_fig (bool, optional): save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with map and stations 
        """
        
        if len(locations_monitored)!=0:
            df_coords_monitored = df_coordinates.iloc[locations_monitored]
            df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i not in locations_monitored]]
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
            gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)

        else:
            df_coords_monitored = df_coordinates.copy()
            geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
            gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
        
        spain = gpd.read_file(f'{map_path}ll_autonomicas_inspire_peninbal_etrs89.shp')
        catalonia = spain.loc[spain.NAME_BOUND.str.contains('Catalunya')]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        geo_map = catalonia.plot(ax=ax,color='#117a65')
        
        try:
            if len(roi_idx)!=0:
                markers = ['^','o','s','P','D']
                colors = ['k','#943126']
                if show_deployed_sensors:
                    print('Map showing monitored and unmonitored locations for each ROI')
                    for i,idx,m in zip(range(len(roi_idx)),roi_idx.values(),markers):
                        #locations_monitored_roi = np.array(locations_monitored)[np.isin(locations_monitored,idx)]
                        locations_monitored_roi = np.array([i for i in locations_monitored if i in idx])
                        locations_unmonitored_roi = np.array([i for i in range(df_coordinates.shape[0]) if i not in locations_monitored and i in idx])
                        print(f'locations monitored for ROI {i}: {len(locations_monitored_roi)}\nlocations unmonitored for ROI {i}: {len(locations_unmonitored_roi)}')
                        # monitored locations in ROI
                        df_coords_monitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_monitored_roi]]
                        geometry_monitored = [Point(xy) for xy in zip(df_coords_monitored['Longitude'], df_coords_monitored['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_monitored, geometry=geometry_monitored)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=colors[1], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}$')
                        
                        # unmonitored locations in ROI
                        df_coords_unmonitored = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in locations_unmonitored_roi]]
                        print(f'Shape of unmonitored dataframe coordinates: {df_coords_unmonitored.shape}')
                        geometry_unmonitored = [Point(xy) for xy in zip(df_coords_unmonitored['Longitude'], df_coords_unmonitored['Latitude'])]
                        gdf_unmonitored = GeoDataFrame(df_coords_unmonitored, geometry=geometry_unmonitored)
                        gdf_unmonitored.plot(ax=geo_map, marker=m, color=colors[0], markersize=6,label=f'$\mathcal{{R}}_{i+1}{{\cap}}\mathcal{{S}}^{{c}}$') 

                else: # show icons belonging to each ROI
                    for i,idx,m,c in zip(range(len(roi_idx)),roi_idx.values(),markers,colors):
                        
                        df_coords_idx = df_coordinates.iloc[[i for i in range(df_coordinates.shape[0]) if i in idx]]
                        geometry_idx = [Point(xy) for xy in zip(df_coords_idx['Longitude'], df_coords_idx['Latitude'])]
                        gdf_monitored = GeoDataFrame(df_coords_idx, geometry=geometry_idx)
                        gdf_monitored.plot(ax=geo_map, marker=m, color=c, markersize=6,label=f'$\mathcal{{R}}_{i+1}$')
                
            else:
                gdf_monitored.plot(ax=geo_map, marker='o', color='#943126', markersize=6,label=f'Monitoring node')
                gdf_unmonitored.plot(ax=geo_map, marker='o', color='k', markersize=6,label=f'Unmonitored locations')
        except:
            warnings.warn('No unmonitored locations or unexpected error in dataframe')
        ax.set_xlim(0.0,4.0)
        ax.set_ylim(40.5,43)
        
        ax.set_ylabel('Latitude (degrees)')
        ax.set_xlabel('Longitude (degrees)')

        # set legend location
        if show_legend:
            if show_deployed_sensors:
                if len(roi_idx) == 2:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=5e-4,labelspacing=0.1,bbox_to_anchor=(0.73,0.1))
                elif len(roi_idx)==3:
                    ax.legend(loc='center',ncol=len(roi_idx),framealpha=0,
                              handletextpad=-0.8,columnspacing=1e-6,labelspacing=0.05,bbox_to_anchor=(0.6,0.1))
            else:
                ax.legend(loc='lower right',ncol=1,framealpha=0.1,handletextpad=-0.1,columnspacing=0.5)
        ax.tick_params(axis='both', which='major')
        fig.tight_layout()
        
        # save generated figure
        if save_fig:
            if show_deployed_sensors:
                fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
            else:
                if len(roi_idx)!=0:
                    fname = self.save_path+f'Map_PotentialLocations_{len(roi_idx)}ROIs.png'
                else:
                    fname = self.save_path+f'Map_PotentialLocations.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at {fname}')
        return fig
        

    # Low-rank plots
    def singular_values_cumulative_energy(self,sing_vals,n,synthetic_dataset=False,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig1 = plt.figure()
        ax = fig1.add_subplot(111)
        ax.plot(xrange,cumulative_energy,color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()])
        ax.set_xlabel('$i$th singular value')
        
        #yrange = np.arange(0.5,1.05,0.05)
        yrange = np.arange(0.,1.2,0.2)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        ax.set_ylabel('Cumulative energy')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig1.tight_layout()
        
        fig2 = plt.figure()
        ax = fig2.add_subplot(111)
        ax.plot(xrange, sing_vals / np.max(sing_vals),color='#1f618d',marker='o')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')

        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')
        if synthetic_dataset:
            ax.set_yscale('log')
        fig2.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_sparsity_cumulativeEnergy_N{n}.png'
            fig1.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')

            fname = self.save_path+f'Curve_sparsity_singularValues_N{n}.png'
            fig2.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at: {fname}')
    
    def singular_values_cumulative_energy_sameFigure(self,sing_vals,n,save_fig=False):
        """
        Plot sorted singular values ratio and cumulative energy in the same figure

        Parameters
        ----------
        sing_vals : numpy array
            singular values
        n : int
            network size
        save_fig : bool, optional
            save generated figures. The default is False.

        Returns
        -------
        None.

        """
        cumulative_energy = np.cumsum(sing_vals)/np.sum(sing_vals)
        xrange = np.arange(0,sing_vals.shape[0],1)
        fig = plt.figure(constrained_layout=True)
        ax = fig.add_subplot(111)

        l1 = ax.plot(xrange, sing_vals / np.max(sing_vals),color='#ba4a00',marker='o',label='Normalized singular values')
        ax.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax.set_xticklabels([int(i+1) for i in ax.get_xticks()],rotation=0)
        ax.set_xlabel('$i$th singular value')
        yrange = np.logspace(-4,0,5)
        ax.set_yticks(yrange)
        ax.set_ylabel('Normalized singular values')
        ax.set_ylim(1e-2,1)
        ax.set_yscale('log')

        ax2 = ax.twinx()
        l2 = ax2.plot(xrange,cumulative_energy,color='#1f618d',marker='o',label='Cumulative energy')
        ax2.set_xticks(np.concatenate(([0.0],np.arange(xrange[9],xrange[-1]+1,10))))
        ax2.set_xticklabels([int(i+1) for i in ax2.get_xticks()])
        
        yrange = np.arange(0.,1.2,0.2)
        ax2.set_yticks(yrange)
        ax2.set_yticklabels([np.round(i,2) for i in ax2.get_yticks()])
        #ax2.set_ylabel('Cumulative energy')
        ax2.set_ylim(0,1)
        
        lines = l1+l2
        labels = [l.get_label() for l in lines]
        #ax.legend(lines,labels,loc='center',ncol=1,framealpha=1.,bbox_to_anchor=(0.5,1.15),handlelength=0.5,handletextpad=0.1)
        #fig.tight_layout()
        
        if save_fig:
            fname = self.save_path+f'Curve_singVals_cumulativeEnergy_N{n}.png'
            fig.savefig(fname,dpi=600,format='png',bbox_inches='tight')
            print(f'Figure saved at: {fname}')


    def boxplot_validation_rmse_svd(self,rmse_sparsity,n,max_sparsity_show=10,synthetic_dataset=False,save_fig=False) -> plt.figure:
        yrange = np.arange(0.0,35,5)
        xrange = rmse_sparsity.columns[:max_sparsity_show]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=rmse_sparsity.iloc[:,:max_sparsity_show],notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[i for i in range(len(xrange))],widths=0.5,labels=[str(i) for i in xrange],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
        if synthetic_dataset:
            ax.set_yscale('log')
            ax.set_ylim(1e-2,1e1)
        else:
            ax.set_ylim(0,30)
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        xrange = np.array([i-1 for i in xrange if i%5==0])
        ax.set_xticks(xrange)
        ax.set_xticklabels([int(i+1) for i in xrange],rotation=0)
        ax.set_xlabel('Sparsity level')
        fig.tight_layout()

        if save_fig:
            fname = self.save_path+f'boxplot_RMSE_SVDreconstruction_validationSet_Smin{xrange.min()}_Smax{xrange.max()}_N{n}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved in {fname}')
    
        return fig
    
    def boxplot_rmse_comparison(self,rmse_method1:pd.DataFrame,rmse_method2:pd.DataFrame,maxerror:bool=False,save_fig:bool=False)->plt.figure:
        """
        Boxplot comparing validation set RMSE using 2 different numbers of deployed senors.
        E.g: compare fully monitored vs reduced

        Args:
            rmse_method1 (pd.DataFrame): rmse for certain number of sensors
            rmse_method2 (pd.DataFrame): rmse for different number of sensors (for example fully monitored)
            maxerror (bool, optional): dataframes contain maximum reconstruction error instead of RMSE. Defaults to False.
            save_fig (bool, optional): Save generqated figure. Defaults to False.

        Returns:
            plt.figure: Figure
        """
        n_sensors_1 = rmse_method1.columns[0]
        n_sensors_2 = rmse_method2.columns[0]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp1 = ax.boxplot(x=rmse_method1,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[n_sensors_1],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        bp2 = ax.boxplot(x=rmse_method2,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[1],widths=0.5,labels=[n_sensors_2],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        bp1['boxes'][0].set_facecolor('lightgreen')
        bp2['boxes'][0].set_facecolor('#1a5276')
        
        if maxerror:
            yrange = np.arange(0.,55.,5)
            ax.set_ylim(0,50)
        else:
            yrange = np.arange(0.,22.,2)
            ax.set_ylim(0,20)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        if maxerror:
            ax.set_ylabel('Max error ($\mu$g/$m^3$)')        
        else:
            ax.set_ylabel('RMSE ($\mu$g/$m^3$)')        
        ax.set_xlabel('Number of deployed sensors')
        fig.tight_layout()

        if save_fig:
            if maxerror:
                fname = f'{self.save_path}Maxerrorcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            else:
                fname = f'{self.save_path}RMSEcomparison_NsensorsTotal_N1{n_sensors_1}_N2{n_sensors_2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def boxplot_errorratio(self,df_error1:pd.DataFrame,df_error2:pd.DataFrame,save_fig:bool=False)->plt.figure:
        n_sensors1 = df_error1.columns[0]
        n_sensors2 = df_error2.columns[0]
        df_ratio = df_error1.to_numpy() / df_error2.to_numpy()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(x=df_ratio,notch=False,vert=True,
                   whis=1.5,bootstrap = None,
                   positions=[0],widths=0.5,labels=[f'{n_sensors1} sensors vs {n_sensors2} senors'],
                   flierprops={'marker':'.','markersize':1},
                   patch_artist=True)
        
        
        bp['boxes'][0].set_facecolor('#1a5276')
        
        yrange = np.arange(0.,3.5,0.5)
        ax.set_ylim(0,3)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])

        ax.set_ylabel('Reconstruction errors ratio')        
        ax.set_xlabel('')
        fig.tight_layout()

        if save_fig:
            fname = f'{self.save_path}ErrorRatio_NsensorsTotal_N1{n_sensors1}_N2{n_sensors2}.png'
            fig.savefig(fname,dpi=300,format='png')
    
        return fig
    
    def hist_worsterror(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_fullymonitored,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='#1a5276',label='Fully monitored network')
        ax.vlines(x=errormax_fullymonitored.mean(),ymin=0.0,ymax=1.0,colors='#1a5276',linestyles='--')
        ax.hist(x=errormax_reconstruction,bins=np.arange(0.,5.1,0.1),density=True,cumulative=False,color='orange',label=f'Reconstruction with {n_sensors} sensors',alpha=0.5)
        ax.vlines(x=errormax_reconstruction.mean(),ymin=0.0,ymax=1.0,colors='orange',linestyles='--')
        ax.set_xlabel('Maximum reconstruction error')
        ax.set_ylabel('Probability density')
        ax.legend(loc='upper left',ncol=1,framealpha=0.5)
        ax.set_xlim(0,5)
        ax.set_ylim(0,1)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_error_fullymonitored_vs_reconstruction_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')

    def hist_errorratio(self,errormax_fullymonitored,errormax_reconstruction,n_sensors,save_fig=False):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(x=errormax_reconstruction.to_numpy()/errormax_fullymonitored.to_numpy(),bins=np.arange(0,3.1,0.1),density=True,cumulative=False,color='#1a5276')
        ax.set_xlabel('Maximum error ratio')
        ax.set_ylabel('Probability density')
        ax.set_xlim(0,3)
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}Histogram_errorRatio_Nsensors{n_sensors}.png'
            fig.savefig(fname,dpi=300,format='png')
            print(f'Figure saved at {fname}')
    
    def curve_errorvariance_comparison(self,errorvar_fullymonitored:list,errorvar_reconstruction:list,variance_threshold_ratio:float,worst_coordinate_variance_fullymonitored:float,n:int,n_sensors:int,errorvar_reconstruction_Dopt:list=[],roi_idx:dict={},n_sensors_Dopt:int=0,method:str='random_based',random_seed:int=0,save_fig:bool=False) -> plt.figure:
        """
        Show error variance over a testing set at each network location. 
        The error variance is obtained after reconstructing the signal from p measurements.
        The p measurement locations are obtained from network design algorithm or D-optimality criteria.
        It also shows the threshold line which the network design algorithm used.
        Another algorithm can be shown for comparison.

        Args:
            errorvar_fullymonitored (list): error variance at each network location obtained with a fully monitored network. This corresponds to the lowest error variance possible.
            errorvar_reconstruction (list): error variance at each network locations obtained with a network with a reduced number of deployed sensors.
            variance_threshold_ratio (float): variance threshold ratio used for design algorithm. It is a multiple of the worst_coordinate_variance_fullymonitored.
            worst_coordinate_variance_fullymonitored (float): fully-monitored network worst coordinate error variance
            n (int): total number of network points
            n_sensors (int): number of deployed sensors
            errorvar_reconstruction_Dopt (list): error variance at each network location obtained by D-optimality (or other) criteria. Defaults to [].
            roi_idx (dict): dictionary containing indices of locations that belong to each ROI. The keys indicate the threshold used to separate the network.
            save_fig (bool, optional): Save generated figure. Defaults to False.

        Returns:
            plt.figure: Figure with error variance curves
        """
        if type(variance_threshold_ratio) is float:
            variance_threshold = variance_threshold_ratio*worst_coordinate_variance_fullymonitored
        
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(errorvar_fullymonitored,color='#1d8348',label='Fully monitored network')
            if len(errorvar_reconstruction_Dopt) !=0:
                ax.plot(errorvar_reconstruction_Dopt,color='orange',label=f'Joshi-Boyd solution',alpha=0.8)
            ax.plot(errorvar_reconstruction,color='#1a5276',label=f'Network design solution')
            ax.hlines(y=variance_threshold,xmin=0,xmax=n+1,color='k',linestyles='--',label=rf'Design threshold $\rho$={variance_threshold_ratio:.2f}$\rho_n$')
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(0,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,1.75,0.25)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,1.5)
            ax.set_ylabel('Error variance')
            ax.legend(loc='center',ncol=2,framealpha=0.5,bbox_to_anchor=(0.5,1.1))
            fig.tight_layout()
            if save_fig:
                fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio:.2f}_Nsensors{n_sensors}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


        else: # heterogeneous thresholds over multiple ROIs
            variance_threshold = [t*w for t,w in zip(variance_threshold_ratio,worst_coordinate_variance_fullymonitored)]
            # sort coordinate error variance such that the ROIs are shown in order
            coordinate_error_variance_fully_monitored_sorted = np.concatenate([errorvar_fullymonitored[i] for i in roi_idx.values()])
            coordinate_error_variance_design_sorted = np.concatenate([errorvar_reconstruction[i] for i in roi_idx.values()])

            fig = plt.figure(constrained_layout=True)
            ax = fig.add_subplot(111)
            # coordinate error variance at each location
            ax.plot(coordinate_error_variance_fully_monitored_sorted,color='#943126',label='Fully monitored case')
            # horizontal lines showing threshold design
            n_roi = np.concatenate([[0],[len(i) for i in roi_idx.values()]])
            n_roi_cumsum = np.cumsum(n_roi)
            for v,l in zip(variance_threshold,range(len(n_roi_cumsum))):
                if l==0:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l]-1,xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--',label='Design threshold')
                else:
                    ax.hlines(y=v,xmin=n_roi_cumsum[l],xmax=n_roi_cumsum[l+1]-1,color='k',linestyles='--')
            
            # Joshi Boyd and IRNet results
            if len(errorvar_reconstruction_Dopt) !=0:
                coordinate_error_variance_Dopt_sorted = np.concatenate([errorvar_reconstruction_Dopt[i] for i in roi_idx.values()])
                ax.plot(coordinate_error_variance_Dopt_sorted,color='orange',label=f'JB {n_sensors_Dopt} sensors',alpha=0.8)
            ax.plot(coordinate_error_variance_design_sorted,color='#1a5276',label=f'IRWNet {n_sensors} sensors')
            
            xrange = np.arange(-1,n,10)
            xrange[0] = 0
            ax.set_xticks(xrange)
            ax.set_xticklabels([i+1 for i in ax.get_xticks()])
            ax.set_xlim(-0.5,n)
            ax.set_xlabel('Location index')
            yrange = np.arange(0,3.5,0.5)
            ax.set_yticks(yrange)
            ax.set_yticklabels([np.round(i,2) for i in ax.get_yticks()])
            ax.set_ylim(0,3.0+0.1)
            ax.set_ylabel('Per-coordinate error variance')
            ax.legend(loc='center',ncol=2,framealpha=1,
                      handlelength=0.5,handletextpad=0.1,columnspacing=0.2,
                      bbox_to_anchor=(0.5,0.88))
            #fig.tight_layout()
            if save_fig:
                #fname = f'{self.save_path}Curve_errorVariance_Threshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_NsensorsROIDopt_{n_sensors_roi}.png'
                if method == 'random_based':
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}_randomSeed{random_seed}.png'
                else:
                    fname = f'{self.save_path}Curve_errorVariance_VarThreshold{variance_threshold_ratio}_Nsensors{n_sensors}_NsensorsDopt{n_sensors_Dopt}.png'
                fig.savefig(fname,dpi=300,format='png',bbox_inches='tight')
                print(f'Figure saved at {fname}')


    def curve_rmse_hourly(self,rmse_time,month=0,save_fig=False):
        hours = [i for i in rmse_time.keys()]
        median = [rmse_time[i].median().to_numpy()[0] for i in hours]
        q1,q3 = [rmse_time[i].quantile(q=0.25).to_numpy()[0] for i in hours], [rmse_time[i].quantile(q=0.75).to_numpy()[0] for i in hours]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(median,color='#1a5276')
        ax.fill_between(x=hours,y1=q1,y2=q3,color='#1a5276',alpha=0.5)
        ax.set_xticks(hours[::4])
        ax.set_xticklabels([i for i in ax.get_xticks()])
        ax.set_xlabel('Hour')
        yrange = np.arange(0,12.,2.)
        ax.set_yticks(yrange)
        ax.set_yticklabels([np.round(i,1) for i in ax.get_yticks()])
        ax.set_ylabel('RMSE ($\mu$g/$m^3$)')
        ax.set_ylim(yrange[0],yrange[-1])
        fig.tight_layout()
        if save_fig:
            fname = f'{self.save_path}deploy_sensors_hourly_month{month}.png'
            fig.savefig(fname,dpi=300,format='png')
        return fig

In [None]:
pollutant = 'O3'
start_date = '2011-01-01'
end_date = '2022-12-31'
N=48
dataset = Dataset(pollutant,N,start_date,end_date,files_path)
dataset.load_dataset()
dataset.check_dataset()
dataset.sort_stations(station_center='Ciutadella')

# train/val/test split
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
X_train, X_test = train_test_split(dataset.ds, test_size= 1 - train_ratio,shuffle=False,random_state=92)
X_val, X_test = train_test_split(X_test, test_size=test_ratio/(test_ratio + validation_ratio),shuffle=False,random_state=92) 
print(f'Dataset matrix summary:\n {train_ratio} of dataset for training set with {X_train.shape[0]} measurements from {X_train.index[0]} until {X_train.index[-1]}\n {validation_ratio} of dataset for validation set with {X_val.shape[0]} measurements from {X_val.index[0]} until {X_val.index[-1]}\n {test_ratio} of measuerements for testing set with {X_test.shape[0]} measurements from {X_test.index[0]} until {X_test.index[-1]}')

Loading dataset from C:\Users\jp_lp\Documents\Scripts\github\IRNet/files/catalonia/O3_catalonia_clean_N48_2011-01-01_2022-12-31.csv
Checking missing values in dataset
Percentage of missing values per location:
O3_Badalona        0.0
O3_Eixample        0.0
O3_Gracia          0.0
O3_Ciutadella      0.0
O3_Vall-Hebron     0.0
O3_Palau-Reial     0.0
O3_Fabra           0.0
O3_Berga           0.0
O3_Gava            0.0
O3_Granollers      0.0
O3_Igualada        0.0
O3_Manlleu         0.0
O3_Manresa         0.0
O3_Mataro          0.0
O3_Montcada        0.0
O3_El-Prat         0.0
O3_Rubi            0.0
O3_Sabadell        0.0
O3_Sant-Adria      0.0
O3_Sant-Celoni     0.0
O3_Sant-Cugat      0.0
O3_Santa-Maria     0.0
O3_Sant-Vicenç     0.0
O3_Terrassa        0.0
O3_Tona            0.0
O3_Vic             0.0
O3_Viladecans      0.0
O3_Vilafranca      0.0
O3_Vilanova        0.0
O3_Agullana        0.0
O3_Begur           0.0
O3_Pardines        0.0
O3_Santa-Pau       0.0
O3_Bellver         0.0
O3_Juned

In [None]:
snapshots_matrix_train = X_train.to_numpy().T
snapshots_matrix_val = X_val.to_numpy().T
snapshots_matrix_test = X_test.to_numpy().T
snapshots_matrix_train_centered = snapshots_matrix_train - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_val_centered = snapshots_matrix_val - snapshots_matrix_train.mean(axis=1)[:,None]
snapshots_matrix_test_centered = snapshots_matrix_test - snapshots_matrix_train.mean(axis=1)[:,None]
U,sing_vals,Vt = np.linalg.svd(snapshots_matrix_train,full_matrices=False)
print(f'Training snapshots matrix has dimensions {snapshots_matrix_train_centered.shape}.\nLeft singular vectors matrix has dimensions {U.shape}\nRight singular vectors matrix has dimensions {Vt.shape}\nNumber of singular values: {sing_vals.shape}')

Training snapshots matrix has dimensions (48, 5646).
Left singular vectors matrix has dimensions (48, 48)
Right singular vectors matrix has dimensions (48, 5646)
Number of singular values: (48,)


In [None]:
print('\nDetermine signal sparsity from SVD decomposition.\nUse singular values ratios, cumulative energy, or reconstruction error for validation set.')
s_range = np.arange(1,sing_vals.shape[0]+1,1)


Determine signal sparsity from SVD decomposition.
Use singular values ratios, cumulative energy, or reconstruction error for validation set.


In [None]:
mse_sparsity_train,error_variance_max = signal_reconstruction_svd(U,snapshots_matrix_train,s_range)

Determining signal sparsity by decomposing training set and reconstructing validation set.
Range of sparsity levels: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]


In [None]:
mse_sparsity_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,39,40,41,42,43,44,45,46,47,48
0,152.452570,48.177076,46.208355,40.859863,30.466115,29.325522,26.979384,24.792176,24.214296,23.507468,...,4.698700,3.169419,3.160594,3.157035,2.913679,2.054501,1.787663,1.035944,1.035841,5.397912e-27
1,329.654018,135.023199,127.233105,83.110628,53.004731,46.515379,45.176329,44.107139,40.754346,40.650503,...,3.371977,3.348761,3.093716,3.093609,3.082439,1.721390,1.547585,0.841980,0.749069,7.963288e-27
2,364.382861,204.782792,198.556657,116.275952,81.011632,67.669639,66.608120,62.613984,62.562810,60.283802,...,5.953998,5.413739,5.389617,5.389238,4.753449,4.527682,3.952643,0.789955,0.755052,7.794998e-27
3,208.839421,110.430369,103.857846,65.849279,54.589058,50.400850,48.015756,47.596570,47.583911,44.419662,...,2.842910,2.703925,2.696411,2.581830,2.446725,2.293342,1.728629,0.659431,0.458572,5.259073e-27
4,152.033515,101.342441,54.644980,45.622997,43.006014,40.268702,39.219320,39.030241,38.189224,35.915540,...,1.809263,1.703414,1.557197,1.501157,1.283072,1.248155,0.587581,0.328774,0.282295,3.542511e-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5641,256.703926,210.112974,210.000066,197.668997,163.366210,77.821573,77.805304,58.193046,58.091298,57.783542,...,7.534519,7.527112,7.526039,7.307804,5.405120,2.142511,1.280199,0.477282,0.031982,5.782745e-27
5642,198.407547,193.675039,182.913303,173.715170,114.001691,59.831210,57.731868,52.213490,48.467532,46.751103,...,10.795831,9.143900,9.013734,7.435303,5.539351,4.765989,4.544599,0.665321,0.026144,6.435001e-27
5643,169.928549,164.929377,122.637460,120.318898,69.418616,57.830289,54.369322,51.686516,51.654730,45.604604,...,11.982196,11.549310,8.117630,5.562382,4.913814,3.419148,3.237676,2.397131,1.604162,6.218328e-27
5644,205.358632,166.823684,110.592104,107.823840,60.974653,56.984230,48.945112,43.315126,42.724557,36.021772,...,7.721408,7.713147,6.808136,4.038411,3.468865,3.450607,3.105073,3.010785,2.893951,7.194411e-27


In [None]:
mse_sparsity_train.mean()

1     3.116586e+02
2     1.867512e+02
3     1.568485e+02
4     1.334547e+02
5     1.197172e+02
6     1.098020e+02
7     1.014485e+02
8     9.337982e+01
9     8.632710e+01
10    7.981078e+01
11    7.354402e+01
12    6.864016e+01
13    6.425138e+01
14    6.012401e+01
15    5.642905e+01
16    5.298573e+01
17    4.961265e+01
18    4.656035e+01
19    4.358396e+01
20    4.089421e+01
21    3.831613e+01
22    3.578867e+01
23    3.343475e+01
24    3.123173e+01
25    2.914421e+01
26    2.717979e+01
27    2.531459e+01
28    2.354440e+01
29    2.183132e+01
30    2.017993e+01
31    1.858543e+01
32    1.707452e+01
33    1.569030e+01
34    1.432686e+01
35    1.303297e+01
36    1.177176e+01
37    1.059270e+01
38    9.443668e+00
39    8.300923e+00
40    7.222704e+00
41    6.158365e+00
42    5.121004e+00
43    4.160532e+00
44    3.264392e+00
45    2.392513e+00
46    1.546561e+00
47    7.441389e-01
48    3.382757e-27
dtype: float64

In [None]:
np.where(mse_sparsity_train.mean()<ppb)

(array([45, 46, 47], dtype=int64),)

In [None]:
np.where(mse_sparsity_train.mean()<ppb**2)

(array([43, 44, 45, 46, 47], dtype=int64),)

In [None]:
s

33

In [None]:
mean_squared_error(snapshots_matrix_pred_svd,snapshots_matrix_train)

15.690298459024168

In [None]:
np.sqrt(mean_squared_error(snapshots_matrix_pred_svd,snapshots_matrix_train))

3.9610981380198305

In [None]:
ppb**2

3.8415999999999997

In [None]:
mse_sparsity_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,39,40,41,42,43,44,45,46,47,48
0,152.452570,48.177076,46.208355,40.859863,30.466115,29.325522,26.979384,24.792176,24.214296,23.507468,...,4.698700,3.169419,3.160594,3.157035,2.913679,2.054501,1.787663,1.035944,1.035841,5.397912e-27
1,329.654018,135.023199,127.233105,83.110628,53.004731,46.515379,45.176329,44.107139,40.754346,40.650503,...,3.371977,3.348761,3.093716,3.093609,3.082439,1.721390,1.547585,0.841980,0.749069,7.963288e-27
2,364.382861,204.782792,198.556657,116.275952,81.011632,67.669639,66.608120,62.613984,62.562810,60.283802,...,5.953998,5.413739,5.389617,5.389238,4.753449,4.527682,3.952643,0.789955,0.755052,7.794998e-27
3,208.839421,110.430369,103.857846,65.849279,54.589058,50.400850,48.015756,47.596570,47.583911,44.419662,...,2.842910,2.703925,2.696411,2.581830,2.446725,2.293342,1.728629,0.659431,0.458572,5.259073e-27
4,152.033515,101.342441,54.644980,45.622997,43.006014,40.268702,39.219320,39.030241,38.189224,35.915540,...,1.809263,1.703414,1.557197,1.501157,1.283072,1.248155,0.587581,0.328774,0.282295,3.542511e-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5641,256.703926,210.112974,210.000066,197.668997,163.366210,77.821573,77.805304,58.193046,58.091298,57.783542,...,7.534519,7.527112,7.526039,7.307804,5.405120,2.142511,1.280199,0.477282,0.031982,5.782745e-27
5642,198.407547,193.675039,182.913303,173.715170,114.001691,59.831210,57.731868,52.213490,48.467532,46.751103,...,10.795831,9.143900,9.013734,7.435303,5.539351,4.765989,4.544599,0.665321,0.026144,6.435001e-27
5643,169.928549,164.929377,122.637460,120.318898,69.418616,57.830289,54.369322,51.686516,51.654730,45.604604,...,11.982196,11.549310,8.117630,5.562382,4.913814,3.419148,3.237676,2.397131,1.604162,6.218328e-27
5644,205.358632,166.823684,110.592104,107.823840,60.974653,56.984230,48.945112,43.315126,42.724557,36.021772,...,7.721408,7.713147,6.808136,4.038411,3.468865,3.450607,3.105073,3.010785,2.893951,7.194411e-27


In [None]:
mse_sparsity_train.mean()

1     3.116586e+02
2     1.867512e+02
3     1.568485e+02
4     1.334547e+02
5     1.197172e+02
6     1.098020e+02
7     1.014485e+02
8     9.337982e+01
9     8.632710e+01
10    7.981078e+01
11    7.354402e+01
12    6.864016e+01
13    6.425138e+01
14    6.012401e+01
15    5.642905e+01
16    5.298573e+01
17    4.961265e+01
18    4.656035e+01
19    4.358396e+01
20    4.089421e+01
21    3.831613e+01
22    3.578867e+01
23    3.343475e+01
24    3.123173e+01
25    2.914421e+01
26    2.717979e+01
27    2.531459e+01
28    2.354440e+01
29    2.183132e+01
30    2.017993e+01
31    1.858543e+01
32    1.707452e+01
33    1.569030e+01
34    1.432686e+01
35    1.303297e+01
36    1.177176e+01
37    1.059270e+01
38    9.443668e+00
39    8.300923e+00
40    7.222704e+00
41    6.158365e+00
42    5.121004e+00
43    4.160532e+00
44    3.264392e+00
45    2.392513e+00
46    1.546561e+00
47    7.441389e-01
48    3.382757e-27
dtype: float64

In [None]:
s

33

In [None]:
mse_sparsity_train.mean()

1     3.116586e+02
2     1.867512e+02
3     1.568485e+02
4     1.334547e+02
5     1.197172e+02
6     1.098020e+02
7     1.014485e+02
8     9.337982e+01
9     8.632710e+01
10    7.981078e+01
11    7.354402e+01
12    6.864016e+01
13    6.425138e+01
14    6.012401e+01
15    5.642905e+01
16    5.298573e+01
17    4.961265e+01
18    4.656035e+01
19    4.358396e+01
20    4.089421e+01
21    3.831613e+01
22    3.578867e+01
23    3.343475e+01
24    3.123173e+01
25    2.914421e+01
26    2.717979e+01
27    2.531459e+01
28    2.354440e+01
29    2.183132e+01
30    2.017993e+01
31    1.858543e+01
32    1.707452e+01
33    1.569030e+01
34    1.432686e+01
35    1.303297e+01
36    1.177176e+01
37    1.059270e+01
38    9.443668e+00
39    8.300923e+00
40    7.222704e+00
41    6.158365e+00
42    5.121004e+00
43    4.160532e+00
44    3.264392e+00
45    2.392513e+00
46    1.546561e+00
47    7.441389e-01
48    3.382757e-27
dtype: float64

In [None]:
mse_sparsity_train.mean().iloc[33]

14.326862001362791

In [None]:
mse_sparsity_train.mean().iloc[34]

13.032968287592013

In [None]:
s

33

In [None]:
mse_sparsity_train.mean().iloc[32]

15.690298459024168

In [None]:
error_variance_max

Unnamed: 0,0
1,935.8645
2,519.0291
3,340.8013
4,281.1129
5,195.3961
6,167.1281
7,160.2439
8,159.5135
9,147.5853
10,141.7807


In [None]:
ppb

1.96

In [None]:
mse_threshold = (1*1.96)**2

In [None]:
signal_sparsity = np.argwhere(mse_sparsity_train.median(axis=0).to_numpy()<=mse_threshold)[0][0] + 1

In [None]:
print(f'Reconstruction error is lower than specified threshold {mse_threshold} in validation set at sparsity of {signal_sparsity}.\nTraining set error of {rmse_sparsity_train.median(axis=0)[signal_sparsity]:.2f}\nValidation set error of {rmse_sparsity_val.median(axis=0)[signal_sparsity]:.2f}\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')

Reconstruction error is lower than specified threshold 3.8415999999999997 in validation set at sparsity of 43.
Training set error of 3.04
Validation set error of 4.79
Singular value ratio: 0.02
Cumulative energy: 0.98


In [None]:
print(f'Reconstruction error is lower than specified threshold {mse_threshold:.2f} in validation set at sparsity of {signal_sparsity}.\nTraining set error of {rmse_sparsity_train.median(axis=0)[signal_sparsity]:.2f}\nValidation set error of {rmse_sparsity_val.median(axis=0)[signal_sparsity]:.2f}\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')

Reconstruction error is lower than specified threshold 3.84 in validation set at sparsity of 43.
Training set error of 3.04
Validation set error of 4.79
Singular value ratio: 0.02
Cumulative energy: 0.98


In [None]:
print(f'Reconstruction error is lower than specified threshold {mse_threshold:.2f} at sparsity of {signal_sparsity}.\nTraining set error of {rmse_sparsity_train.median(axis=0)[signal_sparsity]:.2f}\nValidation set error of {rmse_sparsity_val.median(axis=0)[signal_sparsity]:.2f}\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')

Reconstruction error is lower than specified threshold 3.84 at sparsity of 43.
Training set error of 3.04
Validation set error of 4.79
Singular value ratio: 0.02
Cumulative energy: 0.98


In [None]:
print(f'Reconstruction error is lower than specified threshold {mse_threshold:.2f} at sparsity of {signal_sparsity}.\nSingular value ratio: {sing_vals[signal_sparsity]/sing_vals[0]:.2f}\nCumulative energy: {(sing_vals.cumsum()/sing_vals.sum())[signal_sparsity]:.2f}')

Reconstruction error is lower than specified threshold 3.84 at sparsity of 43.
Singular value ratio: 0.02
Cumulative energy: 0.98


In [None]:
np.where(error_variance_max<ppb)

(array([47], dtype=int64), array([0], dtype=int64))

In [None]:
np.where(error_variance_max<ppb**2)

(array([47], dtype=int64), array([0], dtype=int64))

In [None]:
np.argwhere(error_variance_max<ppb**2)

array([[47,  0]], dtype=int64)

In [None]:
error_variance_max

Unnamed: 0,0
1,935.8645
2,519.0291
3,340.8013
4,281.1129
5,195.3961
6,167.1281
7,160.2439
8,159.5135
9,147.5853
10,141.7807


In [None]:
ppb**2

3.8415999999999997

In [None]:
mean_squared_error(Xclean,Xnoisy)

0.9934791344457543

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
mean_squared_error(Xclean,X)

0.006956486773803818

In [None]:
mean_squared_error(X,Xnoisy)

1.0002968783133455

In [None]:
mean_squared_error(Xclean,Xnoisy)

0.9934791344457543

In [None]:
error = Xclean - Xnoisy

In [None]:
error.vari(axis=1,ddof=0)

AttributeError: 'numpy.ndarray' object has no attribute 'vari'

In [None]:
error.vari(axis=1,dof=0)

AttributeError: 'numpy.ndarray' object has no attribute 'vari'

In [None]:
error.var(axis=1,dof=0)

TypeError: _var() got an unexpected keyword argument 'dof'

In [None]:
error.var(axis=1,ddof=0)

array([0.85256574, 0.9835518 , 1.00573183, 0.99906453, 0.97637889,
       0.98124265, 0.96394822, 0.93068453, 0.94486441, 1.10652822,
       0.91661229, 0.97938319, 1.04363644, 1.00277707, 0.98540478,
       1.02148667, 1.02024547, 0.94738144, 0.97013516, 0.97467067,
       1.0070798 , 0.94299368, 1.04444549, 0.94000763, 0.96914638,
       0.9469055 , 1.01072629, 1.14311906, 1.01635854, 0.87135729,
       1.05038601, 0.98154228, 0.98163313, 0.96472899, 0.9508085 ,
       1.09130772, 1.00172288, 1.01808532, 1.06428442, 0.94273091,
       1.12345509, 1.00442222, 1.07097674, 0.90539398, 0.99897912,
       0.98215644, 0.93184312, 0.94436618, 1.03040298, 1.1031706 ,
       1.01464687, 0.91765139, 1.04492967, 0.9755099 , 0.94884889,
       0.92349068, 1.0255561 , 1.04179329, 1.00624683, 0.96180348,
       1.00101343, 0.96343212, 0.98899215, 1.06823776, 1.02027853,
       0.93883136, 1.04069555, 0.83060972, 1.04152755, 0.95567116,
       0.9388448 , 0.95750646, 1.02897941, 0.99616637, 0.95354

In [None]:
error.var(axis=1,ddof=0).mean()

0.9918412891065664

In [None]:
error_variance_max

Unnamed: 0,0
1,935.8645
2,519.0291
3,340.8013
4,281.1129
5,195.3961
6,167.1281
7,160.2439
8,159.5135
9,147.5853
10,141.7807


In [None]:
np.where(error_variance_max.to_mnumpy()<ppb)

AttributeError: 'DataFrame' object has no attribute 'to_mnumpy'

In [None]:
np.where(error_variance_max.to_numpy()<ppb)

(array([47], dtype=int64), array([0], dtype=int64))

In [None]:
np.where(error_variance_max.to_numpy()<ppb**2)

(array([47], dtype=int64), array([0], dtype=int64))

In [None]:
error_variance_max

Unnamed: 0,0
1,935.8645
2,519.0291
3,340.8013
4,281.1129
5,195.3961
6,167.1281
7,160.2439
8,159.5135
9,147.5853
10,141.7807


In [None]:
ppb**2

3.8415999999999997

In [None]:
error_variance_max.iloc[-2]

0    12.467612
Name: 47, dtype: float64

In [None]:
snapshots_matrix = snapshots_matrix_train

In [None]:
beta = snapshots_matrix.shape[0]/snapshots_matrix.shape[1]

In [None]:
beta

0.008501594048884165

In [None]:
beta**-1

117.625

In [None]:
beta

0.008501594048884165

In [None]:
t1 = 2*(beta+1)
t2 = (8*beta) / ( beta + 1 + np.sqrt((beta**2 + 14*beta + 1)) )
lambda_beta = np.sqrt(t1+t2)

In [None]:
sing_val_threshold = lambda_beta*noise*np.sqrt(max(snapshots_matrix.shape))

In [None]:
sing_val_threshold

215.16349824122585

In [None]:
sing_vals

array([28456.0740296 ,  5818.15193871,  2846.72914491,  2517.9198153 ,
        1929.49808429,  1639.23774115,  1504.61870213,  1478.73816596,
        1382.51342987,  1328.90053859,  1303.20420585,  1152.81564605,
        1090.59444542,  1057.61443352,  1000.68155549,   966.00709694,
         956.10111867,   909.50403505,   898.1246297 ,   853.78099203,
         835.87077567,   827.62429964,   798.70648941,   772.68046481,
         752.15422557,   729.6399529 ,   710.97312123,   692.63046664,
         681.36330102,   668.98542556,   657.36029746,   639.89623964,
         612.48344355,   607.86691596,   592.16175795,   584.63550058,
         565.27382431,   558.0286537 ,   556.50072604,   540.56066461,
         537.07012627,   530.22005481,   510.19166776,   492.80934462,
         486.09294499,   478.81058096,   466.32916673,   449.07416505])