In [None]:
import pandas as pd
import numpy as np
import glob
import sys
from sklearn.decomposition import PCA
from pyscf import scf,gto,lo
from qml.math import cho_solve
from qml.kernels import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from natsort import natsorted
from sklearn.metrics import mean_squared_error
from master_strange_mol_rep.mol_rep import zero_pad_two_ndarrays as pad
from sklearn.model_selection import KFold

In [None]:
# optimize_sigma grid-searches the kernel width sigma for each candidate regularisation strength lambda, scoring by K-fold cross-validated MAE.
def optimize_sigma(data=None,target=None,split=2,kernel='Laplacian',min_sigma=1,step=1000,max_sigma=20000,shuffle=True):
    """Grid-search the kernel width (sigma) for several regularisation strengths.

    For every regulariser in ``np.logspace(-10, 0, 5)`` and every sigma in
    ``range(min_sigma, max_sigma, step)`` a kernel ridge regression model is
    fitted on each K-fold training split and scored by mean absolute error
    (MAE) on the held-out split. The MAE is averaged over folds and, for each
    regulariser, the sigma with the lowest averaged MAE is reported.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix, one sample per row.
    target : array-like, Series or DataFrame
        Property to learn; the last column of ``target`` is used as the label.
    split : int
        Number of cross-validation folds passed to ``KFold``.
    kernel : str
        ``'Laplacian'`` selects ``laplacian_kernel``; any other value selects
        ``gaussian_kernel``.
    min_sigma, step, max_sigma : int
        Define the sigma grid ``range(min_sigma, max_sigma, step)``.
    shuffle : bool
        Whether ``KFold`` shuffles the samples (with a fixed seed of 137).

    Returns
    -------
    tuple
        ``(ert_mae, ert_nr, eta)`` where ``eta`` is the array of regularisers
        and, for each entry of ``eta``, ``ert_mae`` / ``ert_nr`` hold a pandas
        Series with the lowest fold-averaged MAE and the sigma(s) reaching it
        (more than one row only if several sigmas tie).
    """
    kernel_fn = laplacian_kernel if kernel == 'Laplacian' else gaussian_kernel
    eta = np.logspace(-10, 0, 5)
    sigmas = list(range(min_sigma, max_sigma, step))

    Z = pd.concat([pd.DataFrame(data), pd.DataFrame(target)], axis=1)
    # Split features/label by POSITION. Array inputs give both frames a column
    # labelled 0, so dropping the label column by name would also silently
    # discard the first feature.
    features = Z.iloc[:, :-1]
    labels = Z.iloc[:, -1]

    # scikit-learn rejects a random_state when shuffle is False.
    kf = KFold(n_splits=split, shuffle=shuffle, random_state=137 if shuffle else None)

    fold_mae = []  # one (len(eta), len(sigmas)) MAE table per fold
    for train_index, test_index in kf.split(Z):
        X_train = features.iloc[train_index]
        X_test = features.iloc[test_index]
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        errors = np.empty((len(eta), len(sigmas)))
        for s_idx, sigma in enumerate(sigmas):
            # The kernel matrices do not depend on the regulariser, so build
            # them once per (fold, sigma) and reuse them for every eta.
            K_train = kernel_fn(X_train, X_train, sigma)
            K_test = kernel_fn(X_test, X_train, sigma)
            for e_idx, t in enumerate(eta):
                # np.copy keeps the memory layout of the kernel matrix, so the
                # solver receives the same kind of array as a fresh kernel.
                K = np.copy(K_train)
                K[np.diag_indices_from(K)] += t
                alpha = cho_solve(K, y_train)
                errors[e_idx, s_idx] = np.mean(np.abs(np.dot(K_test, alpha) - y_test))
        fold_mae.append(errors)

    mean_mae = sum(fold_mae) / len(fold_mae)

    ert_mae = []
    ert_nr = []
    for e_idx in range(len(eta)):
        avg = pd.DataFrame({'mae': mean_mae[e_idx], 'Nr': np.asarray(sigmas, dtype=float)})
        best = avg.loc[avg['mae'] == avg['mae'].min()]
        ert_mae.append(best['mae'])
        ert_nr.append(best['Nr'])
    return (ert_mae, ert_nr, eta)

In [None]:
#The function kernel_ridge_regression is used to generate all of the learning curves. 
def kernel_ridge_regression(target=None,input_data=None,kernel='Laplacian',step=200,test_size=0.2,sigma=1,lambd=1e-5,start=100,repeats=6,random_state=137):
    """Compute a kernel ridge regression learning curve.

    For each training-set size ``n`` in
    ``range(start, int(n_samples * (1 - test_size)), step)`` the data are split
    ``repeats`` times with ``train_test_split``; a model is fitted on the first
    ``n`` training samples of each split and evaluated on that split's test
    set. The mean and standard deviation of MAE, R^2 and RMSE over the repeats
    are reported per training-set size.

    Parameters
    ----------
    target : array-like
        Property values to learn, one per sample.
    input_data : array-like
        Feature matrix, one sample per row (must expose ``.shape``).
    kernel : str
        ``'Laplacian'`` selects ``laplacian_kernel``; any other value selects
        ``gaussian_kernel``.
    step : int
        Increment between successive training-set sizes.
    test_size : float
        Fraction of the samples held out for testing.
    sigma : float
        Kernel width passed to the kernel function.
    lambd : float
        Regulariser added to the diagonal of the training kernel.
    start : int
        Smallest training-set size.
    repeats : int
        Number of random train/test splits per training-set size.
    random_state : int or None
        Base seed; repeat ``r`` uses ``random_state + r`` so that the repeats
        are distinct yet reproducible. ``None`` leaves the splits unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per training-set size with columns ``Nr``, ``mae``,
        ``std_MAE``, ``r2``, ``std_R^2``, ``rmse``, ``std_RMSE``.
    """
    kernel_fn = laplacian_kernel if kernel == 'Laplacian' else gaussian_kernel
    columns = ['Nr', 'mae', 'std_MAE', 'r2', 'std_R^2', 'rmse', 'std_RMSE']
    results = {name: [] for name in columns}

    max_train = int(float(input_data.shape[0]) * (1.0 - test_size))
    for n_train in range(start, max_train, step):
        mae = []
        r2 = []
        rmse = []
        for rep in range(repeats):
            # A different seed per repeat: re-using one seed would make every
            # repeat identical and all reported standard deviations zero.
            seed = None if random_state is None else random_state + rep
            X, x, Y, y = train_test_split(input_data, target, test_size=test_size, random_state=seed, shuffle=True)
            K = kernel_fn(X[:n_train], X[:n_train], sigma)
            K[np.diag_indices_from(K)] += lambd
            alpha = cho_solve(K, Y[:n_train])
            y_pred = np.dot(kernel_fn(x, X[:n_train], sigma), alpha)
            mae.append(np.mean(np.abs(y_pred - y)))
            r2.append(r2_score(y, y_pred))
            # Take the root explicitly: the `squared=False` keyword has been
            # removed from recent scikit-learn releases.
            rmse.append(np.sqrt(mean_squared_error(y, y_pred)))
        results['Nr'].append(n_train)
        results['mae'].append(np.mean(mae))
        results['std_MAE'].append(np.std(mae))
        results['r2'].append(np.mean(r2))
        results['std_R^2'].append(np.std(r2))
        results['rmse'].append(np.mean(rmse))
        results['std_RMSE'].append(np.std(rmse))
    return pd.DataFrame(results, columns=columns)