In [None]:
# Common imports
import os
import numpy as np
import pandas as pd

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Top-level folder under which every dataset is stored.
DATASET_ROOT_DIR = 'datasets'

# Sub-folders of DATASET_ROOT_DIR, one per approximation method used to generate the datasets:
# NEWTON holds the newton-based ones, GENETIC holds the genetic-based ones.
NEWTON, GENETIC = 'newtonParams', 'geneticParams'

# Destination folder for every file produced by the present work.
OUTPUT_DIR = 'models'

In [None]:
# Integer label encoding for the 'geometry' column: every geometry name in the datasets is mapped
# to a unique integer code (a single ordinal value per row, not a one-hot vector).
GEOMETRY = {'degenerate': 1, 'disconnected': 2, 'linear': 3, 'concave': 4, 'mixed': 5, 'convex': 6}

def get_datasets_csv_reader(dir_path, training_size = 0.8, shuffle=True, norm=None):
    '''
    Reads a .csv file and creates the training and testing sets.

    The ' alpha' column (note the leading space in its name) is used as the target; every other
    column is used as a feature. Values of the 'geometry' column are replaced by their integer
    code from GEOMETRY before the features are cast to float32.

    Args:
        dir_path (str): path of the source .csv file.
        training_size (float): Fraction (between 0 and 1) of instances that will belong to the
            training set. Default: 0.8.
        shuffle (bool): Whether to shuffle the rows (with a fixed seed) before splitting. Default: True.
        norm (str): Normalization to apply to the features. Options: 'min_max' or 'std'.
            The scaler is fitted on the training set only and then applied to the testing set.
            Default: None (no normalization).

    Returns:
        train_data (numpy ndarray, float32): Matrix containing instances for training (training set).
        train_targets (numpy ndarray): Array containing all true target values of the training set.
        test_data (numpy ndarray, float32): Matrix containing instances for testing (testing set).
        test_targets (numpy ndarray): Array containing all true target values of the testing set.

    Raises:
        ValueError: If norm is not one of None, 'min_max' or 'std', if training_size is outside
            [0, 1], or if normalization is requested while the training set is empty.
    '''
    # Validate up front so a misspelled option fails with a clear message instead of an
    # unbound `scaler` variable further down.
    if norm not in (None, 'min_max', 'std'):
        raise ValueError("norm must be None, 'min_max' or 'std', got {!r}".format(norm))
    if not 0.0 <= training_size <= 1.0:
        raise ValueError("training_size must be between 0 and 1, got {!r}".format(training_size))

    df = pd.read_csv(dir_path)

    # Encode geometry names as integers; values without an entry in GEOMETRY are left untouched.
    if 'geometry' in df.columns:
        df['geometry'] = df['geometry'].map(lambda name: GEOMETRY.get(name, name))

    if shuffle:
        # Fixed seed keeps the split reproducible between runs.
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    targets = df[' alpha'].to_numpy()
    data = np.asarray(df.drop(columns=' alpha').values).astype('float32')

    train_size = int(len(data) * training_size)

    train_data, test_data = data[:train_size], data[train_size:]
    train_targets, test_targets = targets[:train_size], targets[train_size:]

    if norm is not None:
        if train_size == 0:
            raise ValueError('cannot normalize the data: the training set is empty')
        scaler = MinMaxScaler() if norm == 'min_max' else StandardScaler()
        # Learn the scaling statistics from the training set only, then reuse them for the
        # testing set, so nothing about the test data leaks into the preprocessing.
        train_data = scaler.fit_transform(train_data)
        if len(test_data):
            test_data = scaler.transform(test_data)

    return train_data, train_targets, test_data, test_targets

# Newton-based approximation method

## **Gaussian $\alpha$-energy (GAE)**

In [None]:
# Newton-based GAE dataset.
gae_dir = os.path.join(DATASET_ROOT_DIR, NEWTON, 'GAE.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(gae_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

## **Modified Pöschl-Teller Potential (MPT)**

In [None]:
# Newton-based MPT dataset.
mpt_dir = os.path.join(DATASET_ROOT_DIR, NEWTON, 'MPT.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(mpt_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

# Genetic-based approximation method

## **Gaussian $\alpha$-energy (GAE)**

In [None]:
# Genetic-based GAE dataset.
gae_dir = os.path.join(DATASET_ROOT_DIR, GENETIC, 'GAE.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(gae_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

## **Kratzer Potential (KRA)**

In [None]:
# Genetic-based KRA dataset.
kra_dir = os.path.join(DATASET_ROOT_DIR, GENETIC, 'KRA.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(kra_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

## **Modified Pöschl-Teller Potential (MPT)**

In [None]:
# Genetic-based MPT dataset.
mpt_dir = os.path.join(DATASET_ROOT_DIR, GENETIC, 'MPT.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(mpt_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

## **Pöschl-Teller Potential (PTP)**

In [None]:
# Genetic-based PTP dataset.
ptp_dir = os.path.join(DATASET_ROOT_DIR, GENETIC, 'PTP.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(ptp_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

## **Riesz $s$-energy (RSE)**

In [None]:
# Genetic-based RSE dataset.
rse_dir = os.path.join(DATASET_ROOT_DIR, GENETIC, 'RSE.csv')

# Split it with the reader's default arguments.
(train_data,
 train_targets,
 test_data,
 test_targets) = get_datasets_csv_reader(rse_dir)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

### **Normalization**

#### **Standardization**

In [None]:
# Fit the standardization statistics on the training set only, then reuse the same
# fitted scaler for the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = StandardScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required

#### **Min-Max**

In [None]:
# Learn the per-feature min/max from the training set only, then apply that same
# fitted scaler to the test set so both are scaled consistently and no test-set
# information leaks into the preprocessing.
scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Add the regressor method here

In [None]:
# Add here the k-fold if required