In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold, RepeatedKFold, train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from scipy.stats import pearsonr
import time
import os
import matplotlib.pyplot as plt

from operon import RSquared, Dataset
from operon.sklearn import SymbolicRegressor

from pmlb import fetch_data, dataset_names, classification_dataset_names, regression_dataset_names
from itertools import product

# Benchmark SymbolicRegressor on the PMLB regression problems.
# For each dataset: tune hyper-parameters with a grid search on one training
# split, then refit the best estimator on `reps` fresh random train/test
# splits and record the train/test scores of every repetition.

dataframes = []  # one results frame per benchmarked dataset

# --- loop-invariant configuration (defined once, not once per dataset) ---
MAX_ROWS = 3000  # datasets with more rows than this are skipped
reps = 50        # evaluation repetitions per dataset
result_columns = ['problem', 'rows', 'features','train_score', 'test_score', 'elapsed', 'stats', 'best_params']

# Grid explored by GridSearchCV for every dataset.
hyper_params = {
    'population_size' : (100, 500, 1000,),
    'allowed_symbols' : ('add,sub,mul,div,constant,variable', 'add,sub,mul,div,exp,log,sqrt,square,constant,variable',),
    'max_length' : (10, 25, 50,),
}

n_datasets = len(regression_dataset_names)
for idx, name in enumerate(regression_dataset_names):
    # Make sure the per-dataset cache directory exists before fetching.
    os.makedirs('./data/{}'.format(name), exist_ok=True)
    X, y = fetch_data(name, return_X_y=True, local_cache_dir='./data/')

    # The size is only known after loading, so the filter comes after the fetch.
    if X.shape[0] > MAX_ROWS:
        continue

    reg = SymbolicRegressor(
        local_iterations=10,
        n_threads=24,
        max_evaluations=int(1e6),
        random_state=None,
        offspring_generator='os',
        female_selector='random',
        male_selector='random'
    )

    # Hyper-parameter tuning only sees the training part of this one split.
    # NOTE(review): the evaluation loop below re-splits the full X, y, so its
    # test sets can contain rows that were used here for tuning.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, shuffle=True)

    cv = RepeatedKFold(n_repeats=5, n_splits=5, random_state=None)
    # error_score=0.0: a parameter combination whose fit fails scores 0 instead of aborting the search.
    grid_est = GridSearchCV(reg, cv=cv, param_grid=hyper_params, verbose=1, n_jobs=1, scoring='r2', error_score=0.0)
    grid_est.fit(X_train, y_train)

    reg = grid_est.best_estimator_
    best_params = grid_est.best_params_

    # Collect the rows in a plain list and build the frame once afterwards;
    # enlarging a DataFrame with df.loc[i] = ... copies it on every insertion.
    records = []
    for i in range(reps):
        start = time.time()

        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, shuffle=True)

        reg.fit(X_train, y_train, show_model=False)
        y_pred_train = reg.predict(X_train)
        y_pred_test = reg.predict(X_test)

        score_train = RSquared(y_pred_train, y_train)
        score_test = RSquared(y_pred_test, y_test)

        # 'elapsed' covers splitting, fitting, prediction and scoring.
        end = time.time()
        records.append([name, X.shape[0], X.shape[1], score_train, score_test, end - start, str(reg._stats), str(best_params)])

    df = pd.DataFrame(records, columns=result_columns)

    print(f'{idx+1}/{n_datasets}', name, best_params, df['train_score'].median(), df['test_score'].median())

    dataframes.append(df)
    

In [None]:
# Stack the per-dataset result frames and reduce them to one summary row per
# problem: dataset size, tuned hyper-parameters and median scores / run time.
df_all = pd.concat(dataframes)

stats = df_all.groupby(['problem']).agg({
    'rows' : 'first',
    'features' : 'first',
    # The result frames store the tuned hyper-parameters under 'best_params';
    # aggregating a column that does not exist raises KeyError.
    'best_params' : 'first',
    'train_score' : 'median',
    'test_score' : 'median',
    'elapsed' : 'median'
}).sort_values(by=['rows'])  # smallest datasets first
stats.to_csv('pmbl-regression-results.csv')