In [1]:
import numpy as np
import pandas as pd
import os
from glob import glob

from sklearn.feature_selection import VarianceThreshold, mutual_info_regression, f_regression, SelectKBest, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from scipy.optimize import root_scalar
from scipy.stats import gmean

from expon_mixture import ExponMixture 

In [2]:
fits = pd.read_csv('./fits/expon_mix_2comp_fits.txt', index_col=0)

In [3]:
fits[fits['p1']<1.0].describe()

Unnamed: 0,p1,p2,lambda1,lambda2
count,158.0,158.0,158.0,158.0
mean,0.145675,0.854325,36134210.0,2132482000.0
std,0.299654,0.299654,418870200.0,12397640000.0
min,0.003333,0.003333,18.0,459.9383
25%,0.013333,0.933333,111.75,308195.9
50%,0.023333,0.976667,276.7045,3456719.0
75%,0.066667,0.986667,706.9048,52703120.0
max,0.996667,0.996667,5248641000.0,130546600000.0


In [4]:
fits

Unnamed: 0_level_0,p1,p2,lambda1,lambda2
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
barthel/n210/gen_n210_m903_k3SAT_seed2473397791.cnf,1.000000,,3.214107e+03,
barthel/n210/gen_n210_m903_k3SAT_seed862748622.cnf,1.000000,,1.342518e+04,
barthel/n210/gen_n210_m903_k3SAT_seed4006075830.cnf,1.000000,,1.295157e+03,
barthel/n210/gen_n210_m903_k3SAT_seed1547818438.cnf,1.000000,,1.365623e+03,
barthel/n210/gen_n210_m903_k3SAT_seed3919912883.cnf,1.000000,,1.511253e+03,
...,...,...,...,...
qhid/n70/gen_n70_m385_k3SAT_seed2030441879.cnf,1.000000,,3.993567e+02,
qhid/n70/gen_n70_m385_k3SAT_seed3717411169.cnf,1.000000,,2.097433e+02,
qhid/n70/gen_n70_m385_k3SAT_seed684617509.cnf,1.000000,,2.370867e+02,
qhid/n70/gen_n70_m385_k3SAT_seed1293934752.cnf,1.000000,,1.008646e+11,


In [5]:
df = pd.read_csv('./calculate_features/features_train.csv')
df.set_index('instance', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,nvarsOrig,nclausesOrig,nvars,nclauses,reducedVars,reducedClauses,Pre-featuretime,vars-clauses-ratio,POSNEG-RATIO-CLAUSE-mean,POSNEG-RATIO-CLAUSE-coeff-variation,...,gsat_FirstLocalMinStep_Q.10,gsat_FirstLocalMinStep_Q.90,gsat_BestAvgImprovement_Mean,gsat_BestAvgImprovement_CoeffVariance,gsat_FirstLocalMinRatio_Mean,gsat_FirstLocalMinRatio_CoeffVariance,ls-gsat-featuretime,lobjois-mean-depth-over-vars,lobjois-log-num-nodes-over-vars,lobjois-featuretime
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
barthel/n210/gen_n210_m903_k3SAT_seed3555821415.cnf,210.0,903.0,199.0,892.0,0.055276,0.012332,0.0,0.223094,0.488565,0.604264,...,39.0,52.0,0.566566,0.41586,0.857284,0.050368,2.72,0.388437,0.765035,1.14
barthel/n210/gen_n210_m903_k3SAT_seed862748622.cnf,210.0,903.0,203.0,897.0,0.034483,0.006689,0.0,0.22631,0.493683,0.586009,...,40.0,53.0,0.550169,0.417447,0.849422,0.054019,2.72,0.365887,0.719841,1.1
barthel/n210/gen_n210_m903_k3SAT_seed3576518490.cnf,210.0,903.0,204.0,892.0,0.029412,0.012332,0.0,0.2287,0.501121,0.583115,...,40.0,53.0,0.554184,0.416429,0.858623,0.049907,2.62,0.365476,0.700129,1.12
barthel/n210/gen_n210_m903_k3SAT_seed2527888016.cnf,210.0,903.0,201.0,894.0,0.044776,0.010067,0.0,0.224832,0.495153,0.609906,...,40.0,53.0,0.579225,0.412855,0.86,0.051025,2.66,0.386035,0.933892,1.16
barthel/n210/gen_n210_m903_k3SAT_seed3919912883.cnf,210.0,903.0,205.0,897.0,0.02439,0.006689,0.0,0.22854,0.487737,0.589016,...,40.0,53.0,0.5637,0.410805,0.86364,0.048454,2.7,0.383312,0.786043,1.16


In [7]:
merged_data = pd.concat([df, fits], join='inner', axis=1)

In [8]:
df.describe()

Unnamed: 0,nvarsOrig,nclausesOrig,nvars,nclauses,reducedVars,reducedClauses,Pre-featuretime,vars-clauses-ratio,POSNEG-RATIO-CLAUSE-mean,POSNEG-RATIO-CLAUSE-coeff-variation,...,gsat_FirstLocalMinStep_Q.10,gsat_FirstLocalMinStep_Q.90,gsat_BestAvgImprovement_Mean,gsat_BestAvgImprovement_CoeffVariance,gsat_FirstLocalMinRatio_Mean,gsat_FirstLocalMinRatio_CoeffVariance,ls-gsat-featuretime,lobjois-mean-depth-over-vars,lobjois-log-num-nodes-over-vars,lobjois-featuretime
count,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,...,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0
mean,112.815249,541.422287,110.900293,539.140762,0.013008,0.003314,0.0,0.197721,0.498235,0.581916,...,22.325513,31.953079,0.926597,0.414158,0.865316,0.07814,1.537595,0.421316,0.872441,0.642991
std,60.99415,236.932698,58.957727,234.682409,0.014942,0.00463,0.0,0.020111,0.014546,0.0083,...,11.267192,13.411502,0.293956,0.013655,0.014012,0.022472,0.839043,0.061129,0.071821,0.29698
min,30.0,165.0,28.0,162.0,0.0,0.0,0.0,0.17284,0.442424,0.557452,...,5.0,10.0,0.507199,0.343572,0.828435,0.045102,0.24,0.351538,0.644515,0.22
25%,70.0,385.0,69.0,384.0,0.0,0.0,0.0,0.181818,0.489067,0.576825,...,15.0,23.0,0.597536,0.409025,0.855741,0.054939,0.88,0.380861,0.885559,0.44
50%,90.0,495.0,90.0,494.0,0.010101,0.001927,0.0,0.190559,0.497151,0.578959,...,19.0,29.0,0.918244,0.416518,0.861658,0.077603,1.28,0.396648,0.903745,0.54
75%,180.0,774.0,177.0,771.0,0.021505,0.005195,0.0,0.225617,0.506494,0.585587,...,34.0,46.0,1.12509,0.423058,0.873974,0.091178,2.64,0.44353,0.912372,0.96
max,220.0,946.0,219.0,945.0,0.071429,0.032432,0.0,0.232334,0.545455,0.620093,...,44.0,57.0,1.60989,0.437284,0.914436,0.163919,2.86,0.64324,0.937382,1.24


In [9]:
y = merged_data[['p1', 'lambda1', 'lambda2']]

In [10]:
y = y.fillna(-1000)

In [11]:
X = merged_data.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
regr_rf = RandomForestRegressor()
regr_rf.fit(X_train, y_train)

RandomForestRegressor()

In [14]:
y_rf = regr_rf.predict(X_test)
pd.concat([y_test, pd.DataFrame(y_rf, index=y_test.index)], axis=1)

Unnamed: 0_level_0,p1,lambda1,lambda2,0,1,2
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
qhid/n40/gen_n40_m220_k3SAT_seed3464545457.cnf,0.03,82.11111,368383.8,0.332367,1020338000.0,85803130.0
qhid/n50/gen_n50_m275_k3SAT_seed1113916997.cnf,0.033333,189.8,874855.8,0.4965,13265090000.0,675481100.0
komb/n110/gen_n110_m573_k3SAT_seed2530876845.cnf,0.04,596.8333,5576362.0,0.136933,505244.5,245269500.0
barthel/n210/gen_n210_m903_k3SAT_seed2814938700.cnf,1.0,7983.777,-1000.0,0.799367,10685.37,9105135.0
qhid/n30/gen_n30_m165_k3SAT_seed443094728.cnf,1.0,94.64333,-1000.0,0.513133,1054884000.0,326371800.0
qhid/n30/gen_n30_m165_k3SAT_seed3097603022.cnf,0.036667,18.0,294874.7,0.499267,555147200.0,35271110000.0
komb/n100/gen_n100_m520_k3SAT_seed2585650757.cnf,0.026667,346.25,62169960.0,0.2433,1305862000.0,754016200.0
komb/n120/gen_n120_m625_k3SAT_seed4111198820.cnf,1.0,24986160000.0,-1000.0,0.090167,500824.8,5235579000.0
barthel/n210/gen_n210_m903_k3SAT_seed1250623881.cnf,1.0,245565.9,-1000.0,0.222567,745028.7,341192300.0
komb/n80/gen_n80_m416_k3SAT_seed599310826.cnf,0.026667,817.875,715141.0,0.2013,2000490.0,73473390.0


In [15]:
y_rf

array([[ 3.32366667e-01,  1.02033844e+09,  8.58031323e+07],
       [ 4.96500000e-01,  1.32650859e+10,  6.75481077e+08],
       [ 1.36933333e-01,  5.05244487e+05,  2.45269482e+08],
       [ 7.99366667e-01,  1.06853721e+04,  9.10513535e+06],
       [ 5.13133333e-01,  1.05488438e+09,  3.26371843e+08],
       [ 4.99266667e-01,  5.55147217e+08,  3.52711059e+10],
       [ 2.43300000e-01,  1.30586155e+09,  7.54016231e+08],
       [ 9.01666667e-02,  5.00824753e+05,  5.23557930e+09],
       [ 2.22566667e-01,  7.45028746e+05,  3.41192325e+08],
       [ 2.01300000e-01,  2.00049010e+06,  7.34733909e+07],
       [ 7.53233333e-01,  1.13743363e+02,  3.16701903e+06],
       [ 7.13800000e-01,  1.01140045e+06,  2.15167919e+08],
       [ 4.97600000e-01,  4.52684248e+06,  3.53337420e+10],
       [ 9.79366667e-01,  5.03766051e+05,  5.48803164e+05],
       [ 3.58633333e-01,  8.07843100e+09,  8.30088959e+07],
       [ 5.65700000e-01,  4.51674482e+06,  3.14062769e+10],
       [ 5.88200000e-01,  1.07512633e+09

In [16]:
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
regr_rf = RandomForestRegressor()
regr_rf.fit(X_train, y_train)

RandomForestRegressor()

In [18]:
y_rf = regr_rf.predict(X_test)
results = pd.concat([y_test, pd.DataFrame(y_rf, index=y_test.index)], axis=1)
results.drop(['p1', 'lambda1', 'lambda2'], inplace=True, axis=1)
results.rename(columns={0:'p1', 1:'lambda1', 2:'lambda2'}, inplace=True)
results['n']=list(map(lambda s: int(s.split("/")[1].replace("n","")), list(results.index)))

In [19]:
def condition(t, rv, b):
    if t <= 0:
        return -np.inf
    F = rv.cdf(t)
    result = (F - 1.0)*t
    result += F*(1-F)/(rv.pdf(t))
    result -= rv.partial_exp(t)
    return result - b

In [20]:
def find_restart_time(p1, lambda1, lambda2, n):
    ps = [p1, 1.0-p1]
    scales = [lambda1, lambda2]
    rv = ExponMixture(ps, scales)
    b = 1.5*n
    solution = root_scalar(condition, args=(rv, b), x0=10.0*b, x1=b, method='secant', xtol=1.0)
    return solution.root + b

In [21]:
def estimate_restart_expectation(t, data):
    step = 1.0/len(data)
    p = np.searchsorted(data, t, side='right')
    if p == 0:
        return np.inf
    p = float(p)*step
    result = 0.0
    i = 0
    for r in data:
        if r > t:
            break
        result += float(r)
        i += 1
    if i > 0:
        result /= i
    result += (1-p)/p * t
    return result

In [22]:
def evaluate_restart_time(instance, t):
    base_path = '../outputs/output_train'
    path = os.path.join(base_path, instance)
    path = glob(path + '*')[0]
    data = np.loadtxt(path, usecols=0)
    data = np.sort(data)
    speedup = np.mean(data)/estimate_restart_expectation(t, data)
    return speedup

In [23]:
restart_times = []
speedups = []
for index, row in results.iterrows():
    time = find_restart_time(row['p1'], row['lambda1'], row['lambda2'], row['n'])
    time = np.ceil(time)
    restart_times.append(time)
    speedup = evaluate_restart_time(index, time)
    speedups.append(speedup)

In [24]:
np.exp(np.mean(np.log(list(filter(lambda x: x> 0.0, speedups)))))

7.066987129505279

In [25]:
results['restart'] = restart_times
results['speedup'] = speedups

In [26]:
p = 0.047233
l1 = 4.917930e+02
l2 = 1.050007e+09
ExponMixture([p, 1-p], [l1, l2]).find_restart_time(n=90)

467.0

In [28]:
results

Unnamed: 0_level_0,p1,lambda1,lambda2,n,restart,speedup
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
komb/n110/gen_n110_m573_k3SAT_seed4018851091.cnf,0.101833,319436600.0,3121542000.0,110,526648.0,1.173166
barthel/n180/gen_n180_m774_k3SAT_seed2083207871.cnf,0.472367,52498280.0,1781612000.0,180,242843.0,1.0
qhid/n60/gen_n60_m330_k3SAT_seed4276262007.cnf,0.1992,136640200.0,5185286000.0,60,189351.0,20.651745
komb/n110/gen_n110_m573_k3SAT_seed3259052812.cnf,0.076533,3013.03,199665100.0,110,1147.0,3.979924
qhid/n70/gen_n70_m385_k3SAT_seed1664766191.cnf,0.0639,9033800.0,176771900.0,70,62865.0,25.735225
komb/n80/gen_n80_m416_k3SAT_seed3163119786.cnf,0.0482,13550770.0,528915200.0,80,73741.0,1.086651
komb/n90/gen_n90_m468_k3SAT_seed2536146026.cnf,0.072733,4099.746,14218200.0,90,1184.0,25.080672
qhid/n30/gen_n30_m165_k3SAT_seed665600859.cnf,0.106433,165.1769,7937603000.0,30,160.0,87.859049
komb/n110/gen_n110_m573_k3SAT_seed1194819985.cnf,0.0372,1182.383,135679900.0,110,750.0,80.828487
komb/n100/gen_n100_m520_k3SAT_seed3451733972.cnf,0.058133,793.6706,7931292.0,100,606.0,2.821363


In [29]:
log_speedup = np.sum(np.log(speedups))

  log_speedup = np.sum(np.log(speedups))


In [30]:
sel = VarianceThreshold()

In [31]:
a = sel.fit_transform(df)

In [32]:
a.shape

(341, 75)

In [33]:
df = pd.read_csv('./calculate_features/features.csv')
df.set_index('instance', inplace=True)

In [34]:
y_rf = regr_rf.predict(df)

In [35]:
y_rf

array([[2.18800000e-01, 1.04990902e+08, 5.76973463e+09],
       [1.58000000e-01, 5.25024610e+07, 1.68297138e+10],
       [1.76566667e-01, 1.04992187e+08, 1.14157254e+10],
       [1.66800000e-01, 1.04995683e+08, 1.69791036e+10],
       [1.76700000e-01, 1.04998636e+08, 1.67973637e+10],
       [1.39066667e-01, 5.25022652e+07, 1.69712466e+10],
       [1.65966667e-01, 1.04993300e+08, 1.69100843e+10],
       [1.75366667e-01, 1.04989002e+08, 1.08668464e+10],
       [1.47300000e-01, 5.25023426e+07, 1.35377735e+10],
       [1.65900000e-01, 1.04985624e+08, 1.14942400e+10],
       [1.39000000e-01, 5.24990561e+07, 1.68963204e+10],
       [1.74100000e-01, 1.04985029e+08, 1.72199557e+10],
       [1.65566667e-01, 1.04983904e+08, 1.74965471e+10],
       [1.28933333e-01, 5.24957399e+07, 1.69806859e+10],
       [1.75866667e-01, 1.04993732e+08, 1.69101663e+10],
       [1.64866667e-01, 1.04999728e+08, 1.68925847e+10],
       [2.02000000e-01, 1.04998908e+08, 5.13575406e+09],
       [1.83366667e-01, 5.25190

In [36]:
for y in y_rf:
    print(find_restart_time(y[0], y[1], y[2], 750))

578712.8063727173
379540.7521809987
552771.691328064
544707.7005053195
547538.0327588739
375818.39260994707
544526.4743022914
553294.6383753115
378531.0322475098
549982.4881223975
375814.99582812516
546503.7999775728
544003.5964473033
373929.6470849377
547218.3022669315
544264.2465666509
578190.4450973955
394074.4384551974
596365.795378682
579136.1532651266
577605.7784137382
544380.274311692
383221.9433310152
573967.1806520014
563104.1317737063
553558.5249699886
578346.2938096684
537463.9444145153
394502.3441558703
584068.9256928668
564230.4605910729
585808.807697013
563781.3889432107
537372.362371453
555387.8276053075
552853.4808642027
578602.0101159107
544090.7160675225
374017.560984898
580924.495872278
574214.5814280501
550308.3411176818
584896.9517487778
543367.733220232
539806.8876209963
549632.4377870844
549728.5955311661
560346.412179339
538944.1144990118
542175.746781127
578029.064869512
553471.4133597354
560427.3377349422
547299.1925991873
542318.9668076674
547179.2346962374
5

In [37]:
from sklearn.model_selection import RepeatedKFold

In [38]:
def eval_score(X, y, generator, y_scale=None, **kwargs):
    i = 0
    n_repeat = 5
    n_splits = 5
    rkf = RepeatedKFold(n_repeats=n_repeat, n_splits=5, random_state=i)
    underestimate = 0
    overall_speedup = 0.0
    for train_index, test_index in rkf.split(X):
        i += 1
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        regressor = generator(**kwargs, random_state=i)
        regressor.fit(X_train, y_train)
        
        y_pred = regressor.predict(X_test)
        if y_scale is not None:
            y_pred = y_scale.inverse_transform(y_pred)
        results = pd.DataFrame(y_pred, index=y_test.index, columns=["p1", "lambda1", "lambda2"])
        results['n']=list(map(lambda s: int(s.split("/")[1].replace("n","")), list(results.index)))
        
        speedups = []
        for index, row in results.iterrows():
            time = find_restart_time(row['p1'], row['lambda1'], row['lambda2'], row['n'])
            time = np.ceil(time)
            speedup = evaluate_restart_time(index, time)
            if speedup > 0.0:
                speedups.append(np.log(speedup))
            else:
                underestimate += 1
#         print(np.mean(speedups))
        overall_speedup += np.mean(speedups)
    
    print(f"Average speedup: {np.exp(overall_speedup/(n_repeat*n_splits))}, average underestimates {underestimate/(n_repeat*n_splits)}")

# Calculate average Speeup using Random Forests

In [39]:
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)
eval_score(X,y, RandomForestRegressor)

Average speedup: 24.97262873666981, average underestimates 1.04


In [40]:
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)

X_new = pd.DataFrame(VarianceThreshold(0.05).fit_transform(X), index=X.index)

eval_score(X_new,y, RandomForestRegressor)

Average speedup: 54.8598490920272, average underestimates 1.56


In [41]:
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)

eps = 0.08
kwargs = {'max_features':0.1, 'n_estimators':5}
X_new = pd.DataFrame(VarianceThreshold(eps).fit_transform(X), index=X.index)
scaler = MinMaxScaler().fit(y)
y_new = pd.DataFrame(scaler.transform(y), index=y.index)

eval_score(X_new,y_new, RandomForestRegressor, y_scale=scaler, **kwargs)

Average speedup: 106.57578046851634, average underestimates 1.8


eps = 0.08 seems to be a good choice

max_args = 0.1 also seems to be a good choice

n_estimators = 5

## How many features are left after the Variance Threshold?

In [59]:
eps = 0.08
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)
sel = VarianceThreshold(eps)
sel.fit(X)
remaining_features = X.columns[sel.get_support()]
print(len(remaining_features))
remaining_features

16


Index(['nvarsOrig', 'nclausesOrig', 'nvars', 'nclauses',
       'saps_BestSolution_Mean', 'saps_FirstLocalMinStep_Mean',
       'saps_FirstLocalMinStep_Median', 'saps_FirstLocalMinStep_Q.10',
       'saps_FirstLocalMinStep_Q.90', 'ls-saps-featuretime',
       'gsat_BestSolution_Mean', 'gsat_FirstLocalMinStep_Mean',
       'gsat_FirstLocalMinStep_Median', 'gsat_FirstLocalMinStep_Q.10',
       'gsat_FirstLocalMinStep_Q.90', 'ls-gsat-featuretime'],
      dtype='object')

In [54]:
X_new

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
barthel/n210/gen_n210_m903_k3SAT_seed2090369458.cnf,210.0,903.0,208.0,902.0,7.9249,47.2111,47.0,41.0,54.0,1.80,6.41753,47.1117,47.0,41.0,54.0,2.62
barthel/n210/gen_n210_m903_k3SAT_seed3877340932.cnf,210.0,903.0,201.0,892.0,6.9429,45.2543,45.0,39.0,52.0,1.72,5.93602,45.1390,45.0,39.0,51.0,2.68
barthel/n220/gen_n220_m946_k3SAT_seed122071483.cnf,220.0,946.0,215.0,940.0,8.3834,48.4996,48.0,42.0,55.0,1.84,7.08014,48.5424,48.0,42.0,55.0,2.78
barthel/n220/gen_n220_m946_k3SAT_seed1552099922.cnf,220.0,946.0,216.0,941.0,8.1394,48.7850,49.0,42.0,55.0,1.86,6.85880,48.8475,49.0,42.0,56.0,2.56
barthel/n220/gen_n220_m946_k3SAT_seed3935451372.cnf,220.0,946.0,216.0,942.0,6.9321,49.1149,49.0,43.0,56.0,1.88,5.97122,49.0979,49.0,43.0,56.0,2.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
qhid/n70/gen_n70_m385_k3SAT_seed2554429720.cnf,70.0,385.0,70.0,385.0,4.3237,18.8542,19.0,15.0,23.0,0.56,3.62680,18.9246,19.0,15.0,23.0,0.92
qhid/n70/gen_n70_m385_k3SAT_seed3952042576.cnf,70.0,385.0,70.0,385.0,4.8200,18.7701,19.0,15.0,23.0,0.58,3.68440,18.8377,19.0,15.0,23.0,0.98
qhid/n70/gen_n70_m385_k3SAT_seed1664766191.cnf,70.0,385.0,70.0,385.0,5.0918,18.5804,19.0,14.0,23.0,0.56,4.18990,18.5598,18.0,14.0,23.0,0.96
qhid/n70/gen_n70_m385_k3SAT_seed2538753387.cnf,70.0,385.0,69.0,384.0,5.4645,18.3046,18.0,14.0,22.0,0.54,4.24510,18.3985,18.0,14.0,22.0,0.92


In [None]:
v2 = merged_data[merged_data['p1']<1.0]
y = v2[['p1', 'lambda1', 'lambda2']]
X = v2.drop(['p1', 'p2', 'lambda1', 'lambda2'], axis=1)

X_new = MinMaxScaler().fit_transform(X)
X_new = pd.DataFrame(VarianceThreshold(0.05).fit_transform(X_new), index=X.index)
scaler = MinMaxScaler().fit(y)
y_new = pd.DataFrame(scaler.transform(y), index=y.index)

eval_score(X_new,y_new, RandomForestRegressor, y_scale=scaler)

In [None]:
MinMaxScaler().fit_transform(y)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

estimators = [('variance_removal', VarianceThreshold(0.08)), ('random_forest', RandomForestRegressor(max_features=0.1, n_estimators=5))]
pipe = Pipeline(estimators)
transformed_regressor = TransformedTargetRegressor(regressor=pipe, transformer=MinMaxScaler())

In [None]:
transformed_regressor.fit(X_train, y_train)

In [None]:
y_pred = transformed_regressor.predict(X_test)

In [None]:
speedups = []
for z in zip(y_test.iterrows(), y_pred):
    name = z[0][0]
    p = z[1][0]
    l1 = z[1][1]
    l2 = z[1][2]
    n = name.split("/")[1]
    n = int(n.replace("n", ""))
    t = find_restart_time(p, l1, l2, n)
    speedup = evaluate_restart_time(name, t)
    speedups.append(speedup)
    print(name, t, speedup)
print(gmean(speedups))