## Process CSV file into a CSV file that can be turned into an ASlib scenario: SCIP Features

In [2]:
import pandas as pd
df = pd.read_csv('C:/Users/Jasmin/Documents/MSc-Thesis/Feature_Extraction/feature_data_rp/RP_scip_dataclean_12.csv')
df.drop(columns='Unnamed: 0',inplace=True)
df.head()

Unnamed: 0,INSTANCE_NAME,VARS_presolve_default,BIN_VARS_presolve_default,CONT_VARS_presolve_default,UPPERBOUNDS_MEAN_presolve_default,UPPERBOUNDS_MIN_presolve_default,UPPERBOUNDS_MAX_presolve_default,UPPERBOUNDS_STD_presolve_default,UPPERBOUNDS_MEDIAN_presolve_default,UPPERBOUNDS_DENSITY_presolve_default,...,LP_strongbranching_2_Iterations,LP_strongbranching_2_Iter/call,LP_strongbranching_2_Iter/sec,LP_strongbranching_2_ItLimit,LP_conflictanalysis_Time,LP_conflictanalysis_Calls,LP_conflictanalysis_Iter/call,FirstLPIters_persecond,Integrals_primal-dual_Total,Integrals_primal-dual_Avg%
0,1,1437.0,330.0,1107.0,0.452455,8.326673e-17,4.014668,0.208556,0.1,0.999304,...,9068.0,503.78,3943.55,18.0,0.0,1.0,0.0,10024.94,2257.16,100.0
1,10,1403.0,313.0,1090.0,0.459198,8.326673e-17,4.514678,0.241107,0.1,0.996436,...,34576.0,501.1,3416.29,69.0,0.0,0.0,0.0,6630.76,1954.25,100.0
2,100,1399.0,311.0,1088.0,0.432643,8.326673e-17,4.140987,0.204747,0.1,0.99857,...,15531.0,501.0,3656.85,31.0,0.0,0.0,0.0,6675.08,3152.15,100.0
3,1000,1443.0,333.0,1110.0,0.466921,8.326673e-17,4.464831,0.221774,0.1,0.999307,...,7733.0,483.31,3250.74,14.0,0.0,1.0,0.0,6334.76,3848.17,100.0
4,10000,1329.0,276.0,1053.0,0.423002,0.003921569,3.856785,0.220129,0.1,1.0,...,18120.0,503.33,3253.58,36.0,0.0,0.0,0.0,5678.14,1671.32,100.0


In [3]:
df['INSTANCE_NAME'] = 'mip' + df['INSTANCE_NAME'].astype('str')
df.head()

Unnamed: 0,INSTANCE_NAME,VARS_presolve_default,BIN_VARS_presolve_default,CONT_VARS_presolve_default,UPPERBOUNDS_MEAN_presolve_default,UPPERBOUNDS_MIN_presolve_default,UPPERBOUNDS_MAX_presolve_default,UPPERBOUNDS_STD_presolve_default,UPPERBOUNDS_MEDIAN_presolve_default,UPPERBOUNDS_DENSITY_presolve_default,...,LP_strongbranching_2_Iterations,LP_strongbranching_2_Iter/call,LP_strongbranching_2_Iter/sec,LP_strongbranching_2_ItLimit,LP_conflictanalysis_Time,LP_conflictanalysis_Calls,LP_conflictanalysis_Iter/call,FirstLPIters_persecond,Integrals_primal-dual_Total,Integrals_primal-dual_Avg%
0,mip1,1437.0,330.0,1107.0,0.452455,8.326673e-17,4.014668,0.208556,0.1,0.999304,...,9068.0,503.78,3943.55,18.0,0.0,1.0,0.0,10024.94,2257.16,100.0
1,mip10,1403.0,313.0,1090.0,0.459198,8.326673e-17,4.514678,0.241107,0.1,0.996436,...,34576.0,501.1,3416.29,69.0,0.0,0.0,0.0,6630.76,1954.25,100.0
2,mip100,1399.0,311.0,1088.0,0.432643,8.326673e-17,4.140987,0.204747,0.1,0.99857,...,15531.0,501.0,3656.85,31.0,0.0,0.0,0.0,6675.08,3152.15,100.0
3,mip1000,1443.0,333.0,1110.0,0.466921,8.326673e-17,4.464831,0.221774,0.1,0.999307,...,7733.0,483.31,3250.74,14.0,0.0,1.0,0.0,6334.76,3848.17,100.0
4,mip10000,1329.0,276.0,1053.0,0.423002,0.003921569,3.856785,0.220129,0.1,1.0,...,18120.0,503.33,3253.58,36.0,0.0,0.0,0.0,5678.14,1671.32,100.0


## What timing features shall we use? 

- Dynamic feature group: df['TotalTime'] - df['presolvingtime'] (presolving group is a requirement)
- Static presolve on (default) feature group: df['presolvingtime']
- Static presolve off feature group: assume they are free because they are simply statistics like 'number of X' or 'mean of X'.

### Add rows as comma seperated data to feature_values.arff file

In [4]:
# first make sure all column names are correct and isolate timing features
timing_df = df[['presolvingtime', 'TotalTime']]
timing_df

Unnamed: 0,presolvingtime,TotalTime
0,1.47,22.74
1,1.31,19.67
2,1.40,31.65
3,2.53,38.64
4,1.94,16.86
...,...,...
9995,1.18,11.67
9996,1.08,10.84
9997,1.27,12.71
9998,1.44,19.74


In [5]:
# first make sure all column names are correct and isolate timing features
features_df = df.drop(columns=timing_df.columns)
features_df

Unnamed: 0,INSTANCE_NAME,VARS_presolve_default,BIN_VARS_presolve_default,CONT_VARS_presolve_default,UPPERBOUNDS_MEAN_presolve_default,UPPERBOUNDS_MIN_presolve_default,UPPERBOUNDS_MAX_presolve_default,UPPERBOUNDS_STD_presolve_default,UPPERBOUNDS_MEDIAN_presolve_default,UPPERBOUNDS_DENSITY_presolve_default,...,LP_strongbranching_2_Iterations,LP_strongbranching_2_Iter/call,LP_strongbranching_2_Iter/sec,LP_strongbranching_2_ItLimit,LP_conflictanalysis_Time,LP_conflictanalysis_Calls,LP_conflictanalysis_Iter/call,FirstLPIters_persecond,Integrals_primal-dual_Total,Integrals_primal-dual_Avg%
0,mip1,1437.0,330.0,1107.0,0.452455,8.326673e-17,4.014668,0.208556,0.1,0.999304,...,9068.0,503.78,3943.55,18.0,0.0,1.0,0.0,10024.94,2257.16,100.0
1,mip10,1403.0,313.0,1090.0,0.459198,8.326673e-17,4.514678,0.241107,0.1,0.996436,...,34576.0,501.10,3416.29,69.0,0.0,0.0,0.0,6630.76,1954.25,100.0
2,mip100,1399.0,311.0,1088.0,0.432643,8.326673e-17,4.140987,0.204747,0.1,0.998570,...,15531.0,501.00,3656.85,31.0,0.0,0.0,0.0,6675.08,3152.15,100.0
3,mip1000,1443.0,333.0,1110.0,0.466921,8.326673e-17,4.464831,0.221774,0.1,0.999307,...,7733.0,483.31,3250.74,14.0,0.0,1.0,0.0,6334.76,3848.17,100.0
4,mip10000,1329.0,276.0,1053.0,0.423002,3.921569e-03,3.856785,0.220129,0.1,1.000000,...,18120.0,503.33,3253.58,36.0,0.0,0.0,0.0,5678.14,1671.32,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,mip9995,1375.0,299.0,1076.0,0.441397,8.326673e-17,3.593675,0.207536,0.1,0.999273,...,11526.0,501.13,4042.59,23.0,0.0,0.0,0.0,7945.45,1155.12,100.0
9996,mip9996,1275.0,249.0,1026.0,0.380774,8.326673e-17,3.215775,0.189279,0.1,0.904314,...,9689.0,509.95,3498.07,19.0,0.0,0.0,0.0,12400.55,1059.17,100.0
9997,mip9997,1379.0,301.0,1078.0,0.418347,1.347221e-04,3.886279,0.207325,0.1,1.000000,...,10107.0,505.35,2445.63,20.0,0.0,0.0,0.0,7913.33,1258.05,100.0
9998,mip9998,1365.0,294.0,1071.0,0.430596,8.326673e-17,4.260940,0.212202,0.1,0.999267,...,11689.0,508.22,4380.38,22.0,0.0,0.0,0.0,6605.43,1954.18,100.0


In [6]:
# first make sure all column names are correct --> instance id and repetitions
features_df.rename(columns={"INSTANCE_NAME":"instance_id"}, inplace=True)
features_df["repetition"] = 1
features_df

Unnamed: 0,instance_id,VARS_presolve_default,BIN_VARS_presolve_default,CONT_VARS_presolve_default,UPPERBOUNDS_MEAN_presolve_default,UPPERBOUNDS_MIN_presolve_default,UPPERBOUNDS_MAX_presolve_default,UPPERBOUNDS_STD_presolve_default,UPPERBOUNDS_MEDIAN_presolve_default,UPPERBOUNDS_DENSITY_presolve_default,...,LP_strongbranching_2_Iter/call,LP_strongbranching_2_Iter/sec,LP_strongbranching_2_ItLimit,LP_conflictanalysis_Time,LP_conflictanalysis_Calls,LP_conflictanalysis_Iter/call,FirstLPIters_persecond,Integrals_primal-dual_Total,Integrals_primal-dual_Avg%,repetition
0,mip1,1437.0,330.0,1107.0,0.452455,8.326673e-17,4.014668,0.208556,0.1,0.999304,...,503.78,3943.55,18.0,0.0,1.0,0.0,10024.94,2257.16,100.0,1
1,mip10,1403.0,313.0,1090.0,0.459198,8.326673e-17,4.514678,0.241107,0.1,0.996436,...,501.10,3416.29,69.0,0.0,0.0,0.0,6630.76,1954.25,100.0,1
2,mip100,1399.0,311.0,1088.0,0.432643,8.326673e-17,4.140987,0.204747,0.1,0.998570,...,501.00,3656.85,31.0,0.0,0.0,0.0,6675.08,3152.15,100.0,1
3,mip1000,1443.0,333.0,1110.0,0.466921,8.326673e-17,4.464831,0.221774,0.1,0.999307,...,483.31,3250.74,14.0,0.0,1.0,0.0,6334.76,3848.17,100.0,1
4,mip10000,1329.0,276.0,1053.0,0.423002,3.921569e-03,3.856785,0.220129,0.1,1.000000,...,503.33,3253.58,36.0,0.0,0.0,0.0,5678.14,1671.32,100.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,mip9995,1375.0,299.0,1076.0,0.441397,8.326673e-17,3.593675,0.207536,0.1,0.999273,...,501.13,4042.59,23.0,0.0,0.0,0.0,7945.45,1155.12,100.0,1
9996,mip9996,1275.0,249.0,1026.0,0.380774,8.326673e-17,3.215775,0.189279,0.1,0.904314,...,509.95,3498.07,19.0,0.0,0.0,0.0,12400.55,1059.17,100.0,1
9997,mip9997,1379.0,301.0,1078.0,0.418347,1.347221e-04,3.886279,0.207325,0.1,1.000000,...,505.35,2445.63,20.0,0.0,0.0,0.0,7913.33,1258.05,100.0,1
9998,mip9998,1365.0,294.0,1071.0,0.430596,8.326673e-17,4.260940,0.212202,0.1,0.999267,...,508.22,4380.38,22.0,0.0,0.0,0.0,6605.43,1954.18,100.0,1


In [7]:
# Change position of columns --> instance_id and repetition go first
cols = features_df.columns.tolist()
cols

['instance_id',
 'VARS_presolve_default',
 'BIN_VARS_presolve_default',
 'CONT_VARS_presolve_default',
 'UPPERBOUNDS_MEAN_presolve_default',
 'UPPERBOUNDS_MIN_presolve_default',
 'UPPERBOUNDS_MAX_presolve_default',
 'UPPERBOUNDS_STD_presolve_default',
 'UPPERBOUNDS_MEDIAN_presolve_default',
 'UPPERBOUNDS_DENSITY_presolve_default',
 'LOWERBOUNDS_MEAN_presolve_default',
 'LOWERBOUNDS_MIN_presolve_default',
 'LOWERBOUNDS_MAX_presolve_default',
 'LOWERBOUNDS_STD_presolve_default',
 'LOWERBOUNDS_DENSITY_presolve_default',
 'BOUNDRANGE_MEAN_presolve_default',
 'BOUNDRANGE_MIN_presolve_default',
 'BOUNDRANGE_MAX_presolve_default',
 'BOUNDRANGE_STD_presolve_default',
 'BOUNDRANGE_MEDIAN_presolve_default',
 'OBJCOEFF_ALL_MEAN_presolve_default',
 'OBJCOEFF_ALL_STD_presolve_default',
 'OBJCOEFF_CONT_MEAN_presolve_default',
 'OBJCOEFF_CONT_STD_presolve_default',
 'CONSTR_presolve_default',
 'LINSING_CONSTR_presolve_default',
 'LINPREC_CONSTR_presolve_default',
 'LINVARBD_CONSTR_presolve_default',


In [8]:
# Change position of columns --> instance_id and repetition go first
cols = cols[-1:] + cols[1:-1]
new_cols = ['instance_id']
new_cols = new_cols + cols
new_cols

['instance_id',
 'repetition',
 'VARS_presolve_default',
 'BIN_VARS_presolve_default',
 'CONT_VARS_presolve_default',
 'UPPERBOUNDS_MEAN_presolve_default',
 'UPPERBOUNDS_MIN_presolve_default',
 'UPPERBOUNDS_MAX_presolve_default',
 'UPPERBOUNDS_STD_presolve_default',
 'UPPERBOUNDS_MEDIAN_presolve_default',
 'UPPERBOUNDS_DENSITY_presolve_default',
 'LOWERBOUNDS_MEAN_presolve_default',
 'LOWERBOUNDS_MIN_presolve_default',
 'LOWERBOUNDS_MAX_presolve_default',
 'LOWERBOUNDS_STD_presolve_default',
 'LOWERBOUNDS_DENSITY_presolve_default',
 'BOUNDRANGE_MEAN_presolve_default',
 'BOUNDRANGE_MIN_presolve_default',
 'BOUNDRANGE_MAX_presolve_default',
 'BOUNDRANGE_STD_presolve_default',
 'BOUNDRANGE_MEDIAN_presolve_default',
 'OBJCOEFF_ALL_MEAN_presolve_default',
 'OBJCOEFF_ALL_STD_presolve_default',
 'OBJCOEFF_CONT_MEAN_presolve_default',
 'OBJCOEFF_CONT_STD_presolve_default',
 'CONSTR_presolve_default',
 'LINSING_CONSTR_presolve_default',
 'LINPREC_CONSTR_presolve_default',
 'LINVARBD_CONSTR_pres

In [9]:
features_df = features_df[new_cols]
features_df

Unnamed: 0,instance_id,repetition,VARS_presolve_default,BIN_VARS_presolve_default,CONT_VARS_presolve_default,UPPERBOUNDS_MEAN_presolve_default,UPPERBOUNDS_MIN_presolve_default,UPPERBOUNDS_MAX_presolve_default,UPPERBOUNDS_STD_presolve_default,UPPERBOUNDS_MEDIAN_presolve_default,...,LP_strongbranching_2_Iterations,LP_strongbranching_2_Iter/call,LP_strongbranching_2_Iter/sec,LP_strongbranching_2_ItLimit,LP_conflictanalysis_Time,LP_conflictanalysis_Calls,LP_conflictanalysis_Iter/call,FirstLPIters_persecond,Integrals_primal-dual_Total,Integrals_primal-dual_Avg%
0,mip1,1,1437.0,330.0,1107.0,0.452455,8.326673e-17,4.014668,0.208556,0.1,...,9068.0,503.78,3943.55,18.0,0.0,1.0,0.0,10024.94,2257.16,100.0
1,mip10,1,1403.0,313.0,1090.0,0.459198,8.326673e-17,4.514678,0.241107,0.1,...,34576.0,501.10,3416.29,69.0,0.0,0.0,0.0,6630.76,1954.25,100.0
2,mip100,1,1399.0,311.0,1088.0,0.432643,8.326673e-17,4.140987,0.204747,0.1,...,15531.0,501.00,3656.85,31.0,0.0,0.0,0.0,6675.08,3152.15,100.0
3,mip1000,1,1443.0,333.0,1110.0,0.466921,8.326673e-17,4.464831,0.221774,0.1,...,7733.0,483.31,3250.74,14.0,0.0,1.0,0.0,6334.76,3848.17,100.0
4,mip10000,1,1329.0,276.0,1053.0,0.423002,3.921569e-03,3.856785,0.220129,0.1,...,18120.0,503.33,3253.58,36.0,0.0,0.0,0.0,5678.14,1671.32,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,mip9995,1,1375.0,299.0,1076.0,0.441397,8.326673e-17,3.593675,0.207536,0.1,...,11526.0,501.13,4042.59,23.0,0.0,0.0,0.0,7945.45,1155.12,100.0
9996,mip9996,1,1275.0,249.0,1026.0,0.380774,8.326673e-17,3.215775,0.189279,0.1,...,9689.0,509.95,3498.07,19.0,0.0,0.0,0.0,12400.55,1059.17,100.0
9997,mip9997,1,1379.0,301.0,1078.0,0.418347,1.347221e-04,3.886279,0.207325,0.1,...,10107.0,505.35,2445.63,20.0,0.0,0.0,0.0,7913.33,1258.05,100.0
9998,mip9998,1,1365.0,294.0,1071.0,0.430596,8.326673e-17,4.260940,0.212202,0.1,...,11689.0,508.22,4380.38,22.0,0.0,0.0,0.0,6605.43,1954.18,100.0


In [10]:
id_order = list(features_df.instance_id)

In [11]:
test_data = {'test1':[1,2,3],'test2':[4,5,6],'test3':['a','b','c']}
example_test_df = pd.DataFrame(test_data)
example_test_df

Unnamed: 0,test1,test2,test3
0,1,4,a
1,2,5,b
2,3,6,c


In [12]:
for index,row in example_test_df.iterrows():
    print(str(list(row)).strip("[]").replace("'", ""))

1, 4, a
2, 5, b
3, 6, c


In [31]:
# @ATTRIBUTE instance_id STRING
# Add all attributes in correct order

with open("feature_values.arff","a") as file:
    for col in features_df.columns:
        file.write("\n")
        if col == 'instance_id':
            newrow = '@ATTRIBUTE ' + col + ' STRING'
        else:
            newrow = '@ATTRIBUTE ' + col + ' NUMERIC'
        file.write(newrow)

    file.close()


In [60]:
# Get comma seperated rows and iteratively add to feature_values.arff
with open("MIPVerifySDP_SCIPfeatureset/feature_values.arff","a") as file:
    for index,row in features_df.iterrows():
        file.write("\n")
        newrow = str(list(row)).strip("[]").replace("'", "")
        file.write(newrow)

    file.close()

In [13]:
# double check the column order
for i in features_df.columns:
    print(i)

instance_id
repetition
VARS_presolve_default
BIN_VARS_presolve_default
CONT_VARS_presolve_default
UPPERBOUNDS_MEAN_presolve_default
UPPERBOUNDS_MIN_presolve_default
UPPERBOUNDS_MAX_presolve_default
UPPERBOUNDS_STD_presolve_default
UPPERBOUNDS_MEDIAN_presolve_default
UPPERBOUNDS_DENSITY_presolve_default
LOWERBOUNDS_MEAN_presolve_default
LOWERBOUNDS_MIN_presolve_default
LOWERBOUNDS_MAX_presolve_default
LOWERBOUNDS_STD_presolve_default
LOWERBOUNDS_DENSITY_presolve_default
BOUNDRANGE_MEAN_presolve_default
BOUNDRANGE_MIN_presolve_default
BOUNDRANGE_MAX_presolve_default
BOUNDRANGE_STD_presolve_default
BOUNDRANGE_MEDIAN_presolve_default
OBJCOEFF_ALL_MEAN_presolve_default
OBJCOEFF_ALL_STD_presolve_default
OBJCOEFF_CONT_MEAN_presolve_default
OBJCOEFF_CONT_STD_presolve_default
CONSTR_presolve_default
LINSING_CONSTR_presolve_default
LINPREC_CONSTR_presolve_default
LINVARBD_CONSTR_presolve_default
LINMIXBIN_CONSTR_presolve_default
CONSTR_MEAN_presolve_default
CONSTR_MAX_presolve_default
CONSTR_STD

In [24]:
import re

_RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE)

for col in features_df.columns:
    if col == 'instance_id':
        s = '@ATTRIBUTE ' + col + ' STRING'
    else:
        s = '@ATTRIBUTE ' + col + ' NUMERIC'
    _, v = s.split(' ', 1)
    v = v.strip()
    m = _RE_ATTRIBUTE.match(v)
    if not m:
        print('fail')
        print(v)
    else:
        print('pass')
        print(v)




pass
instance_id STRING
pass
repetition NUMERIC
pass
VARS_presolve_default NUMERIC
pass
BIN_VARS_presolve_default NUMERIC
pass
CONT_VARS_presolve_default NUMERIC
pass
UPPERBOUNDS_MEAN_presolve_default NUMERIC
pass
UPPERBOUNDS_MIN_presolve_default NUMERIC
pass
UPPERBOUNDS_MAX_presolve_default NUMERIC
pass
UPPERBOUNDS_STD_presolve_default NUMERIC
pass
UPPERBOUNDS_MEDIAN_presolve_default NUMERIC
pass
UPPERBOUNDS_DENSITY_presolve_default NUMERIC
pass
LOWERBOUNDS_MEAN_presolve_default NUMERIC
pass
LOWERBOUNDS_MIN_presolve_default NUMERIC
pass
LOWERBOUNDS_MAX_presolve_default NUMERIC
pass
LOWERBOUNDS_STD_presolve_default NUMERIC
pass
LOWERBOUNDS_DENSITY_presolve_default NUMERIC
pass
BOUNDRANGE_MEAN_presolve_default NUMERIC
pass
BOUNDRANGE_MIN_presolve_default NUMERIC
pass
BOUNDRANGE_MAX_presolve_default NUMERIC
pass
BOUNDRANGE_STD_presolve_default NUMERIC
pass
BOUNDRANGE_MEDIAN_presolve_default NUMERIC
pass
OBJCOEFF_ALL_MEAN_presolve_default NUMERIC
pass
OBJCOEFF_ALL_STD_presolve_default NUM

### Create description.txt

In [14]:
for i in features_df.columns:
    if 'presolve_off' in i:
        print('- ' + i)

    else:
        pass

- VARS_presolve_off
- BIN_VARS_presolve_off
- CONT_VARS_presolve_off
- PER_FINITE_UB_presolve_off
- UPPERBOUNDS_MEAN_presolve_off
- UPPERBOUNDS_MIN_presolve_off
- UPPERBOUNDS_STD_presolve_off
- LOWERBOUNDS_MEAN_presolve_off
- LOWERBOUNDS_MIN_presolve_off
- LOWERBOUNDS_MAX_presolve_off
- LOWERBOUNDS_STD_presolve_off
- LOWERBOUNDS_DENSITY_presolve_off
- BOUNDRANGE_MEAN_presolve_off
- BOUNDRANGE_MAX_presolve_off
- BOUNDRANGE_STD_presolve_off
- BOUNDRANGE_MEDIAN_presolve_off
- OBJCOEFF_ALL_MEAN_presolve_off
- OBJCOEFF_ALL_STD_presolve_off
- OBJCOEFF_CONT_MEAN_presolve_off
- CONSTR_presolve_off
- LINSING_CONSTR_presolve_off
- LINVARBD_CONSTR_presolve_off
- LINMIXBIN_CONSTR_presolve_off
- CONSTR_MEAN_presolve_off
- CONSTR_MAX_presolve_off
- CONSTR_STD_presolve_off
- RH_CONSTR_MEAN_presolve_off
- RH_CONSTR_MIN_presolve_off
- RH_CONSTR_MAX_presolve_off
- RH_CONSTR_STD_presolve_off
- RH_CONSTR_DENSITY_presolve_off
- LH_CONSTR_MEAN_presolve_off
- LH_CONSTR_MIN_presolve_off
- LH_CONSTR_STD_presol

In [15]:
for i in features_df.columns:
    if 'presolve_default' in i:
        print('- ' + i)

    else:
        pass

- VARS_presolve_default
- BIN_VARS_presolve_default
- CONT_VARS_presolve_default
- UPPERBOUNDS_MEAN_presolve_default
- UPPERBOUNDS_MIN_presolve_default
- UPPERBOUNDS_MAX_presolve_default
- UPPERBOUNDS_STD_presolve_default
- UPPERBOUNDS_MEDIAN_presolve_default
- UPPERBOUNDS_DENSITY_presolve_default
- LOWERBOUNDS_MEAN_presolve_default
- LOWERBOUNDS_MIN_presolve_default
- LOWERBOUNDS_MAX_presolve_default
- LOWERBOUNDS_STD_presolve_default
- LOWERBOUNDS_DENSITY_presolve_default
- BOUNDRANGE_MEAN_presolve_default
- BOUNDRANGE_MIN_presolve_default
- BOUNDRANGE_MAX_presolve_default
- BOUNDRANGE_STD_presolve_default
- BOUNDRANGE_MEDIAN_presolve_default
- OBJCOEFF_ALL_MEAN_presolve_default
- OBJCOEFF_ALL_STD_presolve_default
- OBJCOEFF_CONT_MEAN_presolve_default
- OBJCOEFF_CONT_STD_presolve_default
- CONSTR_presolve_default
- LINSING_CONSTR_presolve_default
- LINPREC_CONSTR_presolve_default
- LINVARBD_CONSTR_presolve_default
- LINMIXBIN_CONSTR_presolve_default
- CONSTR_MEAN_presolve_default
- C

In [16]:
for i in features_df.columns:
    print('- ' + i)

- repetition
- instance_id
- VARS_presolve_default
- BIN_VARS_presolve_default
- CONT_VARS_presolve_default
- UPPERBOUNDS_MEAN_presolve_default
- UPPERBOUNDS_MIN_presolve_default
- UPPERBOUNDS_MAX_presolve_default
- UPPERBOUNDS_STD_presolve_default
- UPPERBOUNDS_MEDIAN_presolve_default
- UPPERBOUNDS_DENSITY_presolve_default
- LOWERBOUNDS_MEAN_presolve_default
- LOWERBOUNDS_MIN_presolve_default
- LOWERBOUNDS_MAX_presolve_default
- LOWERBOUNDS_STD_presolve_default
- LOWERBOUNDS_DENSITY_presolve_default
- BOUNDRANGE_MEAN_presolve_default
- BOUNDRANGE_MIN_presolve_default
- BOUNDRANGE_MAX_presolve_default
- BOUNDRANGE_STD_presolve_default
- BOUNDRANGE_MEDIAN_presolve_default
- OBJCOEFF_ALL_MEAN_presolve_default
- OBJCOEFF_ALL_STD_presolve_default
- OBJCOEFF_CONT_MEAN_presolve_default
- OBJCOEFF_CONT_STD_presolve_default
- CONSTR_presolve_default
- LINSING_CONSTR_presolve_default
- LINPREC_CONSTR_presolve_default
- LINVARBD_CONSTR_presolve_default
- LINMIXBIN_CONSTR_presolve_default
- CONST

### Make the feature_runstatus.arff file

In [62]:
runstatus_df = pd.DataFrame({'instance_id': id_order,'repetition':[1]*10000, 'StaticOn':['ok']*10000, 
                             'StaticOff':['ok']*10000,'Dynamic':['ok']*10000})

runstatus_df

Unnamed: 0,instance_id,repetition,StaticOn,StaticOff,Dynamic
0,mip1,1,ok,ok,ok
1,mip10,1,ok,ok,ok
2,mip100,1,ok,ok,ok
3,mip1000,1,ok,ok,ok
4,mip10000,1,ok,ok,ok
...,...,...,...,...,...
9995,mip9995,1,ok,ok,ok
9996,mip9996,1,ok,ok,ok
9997,mip9997,1,ok,ok,ok
9998,mip9998,1,ok,ok,ok


In [63]:
with open("MIPVerifySDP_SCIPfeatureset/feature_runstatus.arff","a") as file:
    for index,row in runstatus_df.iterrows():
        file.write("\n")
        newrow = str(list(row)).strip("[]").replace("'", "")
        file.write(newrow)

    file.close()

### Now make feature_costs.arff file

In [64]:
costs_df = pd.DataFrame({'instance_id': id_order, 'repetition':[1]*10000})

costs_df

Unnamed: 0,instance_id,repetition
0,mip1,1
1,mip10,1
2,mip100,1
3,mip1000,1
4,mip10000,1
...,...,...
9995,mip9995,1
9996,mip9996,1
9997,mip9997,1
9998,mip9998,1


In [65]:
costs_df = pd.concat([costs_df,timing_df['presolvingtime']],axis=1)
costs_df['time_StaticOff'] = 0
costs_df

Unnamed: 0,instance_id,repetition,presolvingtime,time_StaticOff
0,mip1,1,1.47,0
1,mip10,1,1.31,0
2,mip100,1,1.40,0
3,mip1000,1,2.53,0
4,mip10000,1,1.94,0
...,...,...,...,...
9995,mip9995,1,1.18,0
9996,mip9996,1,1.08,0
9997,mip9997,1,1.27,0
9998,mip9998,1,1.44,0


In [66]:
costs_df['time_dynamic'] = timing_df['TotalTime'] - timing_df['presolvingtime']
costs_df

Unnamed: 0,instance_id,repetition,presolvingtime,time_StaticOff,time_dynamic
0,mip1,1,1.47,0,21.27
1,mip10,1,1.31,0,18.36
2,mip100,1,1.40,0,30.25
3,mip1000,1,2.53,0,36.11
4,mip10000,1,1.94,0,14.92
...,...,...,...,...,...
9995,mip9995,1,1.18,0,10.49
9996,mip9996,1,1.08,0,9.76
9997,mip9997,1,1.27,0,11.44
9998,mip9998,1,1.44,0,18.30


In [67]:
with open("MIPVerifySDP_SCIPfeatureset/feature_costs.arff","a") as file:
    for index,row in costs_df.iterrows():
        file.write("\n")
        newrow = str(list(row)).strip("[]").replace("'", "")
        file.write(newrow)

    file.close()

### Next step is creating algorithm_runs.arff

In [68]:
runtimes = pd.read_excel('C:/Users/Jasmin/Documents/MSc-Thesis/Data/Runtime/Results-MIPVerify_SDPdMLPa.xlsx')
runtimes

Unnamed: 0,SampleNumber,SolveTime_default,SolveStatus_default,SolveTime_conf1,SolveStatus_conf1,SolveTime_conf2,SolveStatus_conf2,SolveTime_conf3,SolveStatus_conf3,SolveTime_best,Config_best
0,1,104.393417,InfeasibleOrUnbounded,152.683291,InfeasibleOrUnbounded,44.068493,Infeasible,78.731996,Infeasible,44.068493,SolveTime_conf2
1,2,9600.323414,UserLimit,9600.080699,UserLimit,9600.184597,UserLimit,9600.010288,UserLimit,9600.010288,TIMEOUT
2,3,9600.157577,UserLimit,1258.149140,InfeasibleOrUnbounded,3025.400426,Infeasible,2938.074613,Infeasible,1258.149140,SolveTime_conf1
3,4,83.344657,InfeasibleOrUnbounded,170.613330,InfeasibleOrUnbounded,54.733004,Infeasible,41.134581,Infeasible,41.134581,SolveTime_conf3
4,5,5567.992290,InfeasibleOrUnbounded,220.795008,InfeasibleOrUnbounded,538.053177,Infeasible,300.046997,Infeasible,220.795008,SolveTime_conf1
...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,12.674720,InfeasibleOrUnbounded,18608.097787,UserLimit,25.863838,Infeasible,172.116870,Infeasible,12.674720,SolveTime_default
9996,9997,18.379950,InfeasibleOrUnbounded,110.131027,InfeasibleOrUnbounded,50.209194,Infeasible,110.876559,Infeasible,18.379950,SolveTime_default
9997,9998,75.684641,InfeasibleOrUnbounded,105.670172,InfeasibleOrUnbounded,106.346014,Infeasible,102.811690,Infeasible,75.684641,SolveTime_default
9998,9999,1669.967257,InfeasibleOrUnbounded,826.113838,InfeasibleOrUnbounded,1401.266455,Infeasible,1643.462584,Infeasible,826.113838,SolveTime_conf1


In [69]:
runtimes.rename(columns={'SampleNumber':'instance_id'},inplace=True)

In [70]:
runtimes['instance_id'] = 'mip' + runtimes['instance_id'].astype('str')
runtimes.head()

Unnamed: 0,instance_id,SolveTime_default,SolveStatus_default,SolveTime_conf1,SolveStatus_conf1,SolveTime_conf2,SolveStatus_conf2,SolveTime_conf3,SolveStatus_conf3,SolveTime_best,Config_best
0,mip1,104.393417,InfeasibleOrUnbounded,152.683291,InfeasibleOrUnbounded,44.068493,Infeasible,78.731996,Infeasible,44.068493,SolveTime_conf2
1,mip2,9600.323414,UserLimit,9600.080699,UserLimit,9600.184597,UserLimit,9600.010288,UserLimit,9600.010288,TIMEOUT
2,mip3,9600.157577,UserLimit,1258.14914,InfeasibleOrUnbounded,3025.400426,Infeasible,2938.074613,Infeasible,1258.14914,SolveTime_conf1
3,mip4,83.344657,InfeasibleOrUnbounded,170.61333,InfeasibleOrUnbounded,54.733004,Infeasible,41.134581,Infeasible,41.134581,SolveTime_conf3
4,mip5,5567.99229,InfeasibleOrUnbounded,220.795008,InfeasibleOrUnbounded,538.053177,Infeasible,300.046997,Infeasible,220.795008,SolveTime_conf1


In [71]:
runtimes = runtimes.drop(columns=['Config_best','SolveTime_best','SolveStatus_default','SolveStatus_conf1','SolveStatus_conf2','SolveStatus_conf3'])
runtimes.head()

Unnamed: 0,instance_id,SolveTime_default,SolveTime_conf1,SolveTime_conf2,SolveTime_conf3
0,mip1,104.393417,152.683291,44.068493,78.731996
1,mip2,9600.323414,9600.080699,9600.184597,9600.010288
2,mip3,9600.157577,1258.14914,3025.400426,2938.074613
3,mip4,83.344657,170.61333,54.733004,41.134581
4,mip5,5567.99229,220.795008,538.053177,300.046997


In [72]:
runtimes['SolveTime_default'][runtimes['SolveTime_default'] >= 9600] = 9600
runtimes['SolveTime_conf1'][runtimes['SolveTime_conf1'] >= 9600] = 9600
runtimes['SolveTime_conf2'][runtimes['SolveTime_conf2'] >= 9600] = 9600
runtimes['SolveTime_conf3'][runtimes['SolveTime_conf3'] >= 9600] = 9600
runtimes.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  runtimes['SolveTime_default'][runtimes['SolveTime_default'] >= 9600] = 9600
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  runtimes['SolveTime_conf1'][runtimes['SolveTime_conf1'] >= 9600] = 9600
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  runtimes['SolveTime_conf2'][runtimes['SolveTime_conf2'] >= 9600] = 9600
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

Unnamed: 0,instance_id,SolveTime_default,SolveTime_conf1,SolveTime_conf2,SolveTime_conf3
0,mip1,104.393417,152.683291,44.068493,78.731996
1,mip2,9600.0,9600.0,9600.0,9600.0
2,mip3,9600.0,1258.14914,3025.400426,2938.074613
3,mip4,83.344657,170.61333,54.733004,41.134581
4,mip5,5567.99229,220.795008,538.053177,300.046997


In [73]:
runtimes.sort_values(by='instance_id', inplace=True)
runtimes.head()

Unnamed: 0,instance_id,SolveTime_default,SolveTime_conf1,SolveTime_conf2,SolveTime_conf3
0,mip1,104.393417,152.683291,44.068493,78.731996
9,mip10,9600.0,9600.0,9600.0,9600.0
99,mip100,89.865928,779.582903,80.580781,215.399513
999,mip1000,9600.0,9600.0,9600.0,9600.0
9999,mip10000,22.112561,378.362607,78.919991,18.838446


In [74]:
new_runtime = runtimes.stack().reset_index()
new_runtime

Unnamed: 0,level_0,level_1,0
0,0,instance_id,mip1
1,0,SolveTime_default,104.393417
2,0,SolveTime_conf1,152.683291
3,0,SolveTime_conf2,44.068493
4,0,SolveTime_conf3,78.731996
...,...,...,...
49995,9998,instance_id,mip9999
49996,9998,SolveTime_default,1669.967257
49997,9998,SolveTime_conf1,826.113838
49998,9998,SolveTime_conf2,1401.266455


In [75]:
temp = {'instance_id':[], 'repetition':[1]*10000*4, 'algorithm':[],'par10':[],'runstatus':[]}
for index,row in new_runtime.iterrows():
    if row['level_1'] == 'instance_id':
        temp['instance_id'].append(row[0])
        temp['instance_id'].append(row[0])
        temp['instance_id'].append(row[0])
        temp['instance_id'].append(row[0])

    else:
        temp['algorithm'].append(row['level_1'])
        temp['par10'].append(row[0])
        if row[0] == 9600:
            temp['runstatus'].append('timeout')
        else:
            temp['runstatus'].append('ok')

In [76]:
algo_runs_df = pd.DataFrame(temp)
algo_runs_df

Unnamed: 0,instance_id,repetition,algorithm,par10,runstatus
0,mip1,1,SolveTime_default,104.393417,ok
1,mip1,1,SolveTime_conf1,152.683291,ok
2,mip1,1,SolveTime_conf2,44.068493,ok
3,mip1,1,SolveTime_conf3,78.731996,ok
4,mip10,1,SolveTime_default,9600.000000,timeout
...,...,...,...,...,...
39995,mip9998,1,SolveTime_conf3,102.811690,ok
39996,mip9999,1,SolveTime_default,1669.967257,ok
39997,mip9999,1,SolveTime_conf1,826.113838,ok
39998,mip9999,1,SolveTime_conf2,1401.266455,ok


In [77]:
algo_runs_df.loc[algo_runs_df["algorithm"] == "SolveTime_default", "algorithm"] = 'ConfigDefault'

In [78]:
algo_runs_df.loc[algo_runs_df["algorithm"] == "SolveTime_conf1", "algorithm"] = 'Config1'
algo_runs_df.loc[algo_runs_df["algorithm"] == "SolveTime_conf2", "algorithm"] = 'Config2'
algo_runs_df.loc[algo_runs_df["algorithm"] == "SolveTime_conf3", "algorithm"] = 'Config3'
algo_runs_df.head()

Unnamed: 0,instance_id,repetition,algorithm,par10,runstatus
0,mip1,1,ConfigDefault,104.393417,ok
1,mip1,1,Config1,152.683291,ok
2,mip1,1,Config2,44.068493,ok
3,mip1,1,Config3,78.731996,ok
4,mip10,1,ConfigDefault,9600.0,timeout


In [79]:
with open("MIPVerifySDP_SCIPfeatureset/algorithm_runs.arff","a") as file:
    for index,row in algo_runs_df.iterrows():
        file.write("\n")
        newrow = str(list(row)).strip("[]").replace("'", "")
        file.write(newrow)

    file.close()

### Last one.... make cv.arff

In [80]:
import random

In [81]:
# This script specifies which instances are in which of the 10 folds for 10-fold cross validation
folds_df = pd.DataFrame({'instance_id': id_order, 'repetition':[1]*10000})
folds_df

Unnamed: 0,instance_id,repetition
0,mip1,1
1,mip10,1
2,mip100,1
3,mip1000,1
4,mip10000,1
...,...,...
9995,mip9995,1
9996,mip9996,1
9997,mip9997,1
9998,mip9998,1


In [82]:
# Make a list of length 10000 and randomly replace 0's with numbers from 1 to 10 (each number uses up 10% of the list)
fold_list = []
fold_num = 10

for x in range(1,fold_num+1):
    for _ in range(1000):
        fold_list.append(x)


random.seed(1234)
random.shuffle(fold_list)
print(fold_list)
# Check that the lengths are correct
for i in range(1,fold_num+1):
    print(fold_list.count(i))


folds_df['fold'] = fold_list
folds_df.head()

[2, 3, 9, 4, 5, 6, 10, 3, 8, 8, 7, 4, 3, 3, 2, 2, 2, 9, 6, 6, 5, 9, 9, 6, 8, 4, 6, 1, 1, 5, 9, 7, 9, 9, 10, 6, 3, 8, 6, 6, 1, 7, 7, 9, 4, 2, 7, 3, 3, 8, 7, 6, 6, 5, 10, 7, 1, 5, 8, 5, 8, 6, 8, 9, 7, 8, 1, 10, 1, 9, 3, 1, 3, 5, 8, 10, 6, 4, 10, 2, 1, 10, 9, 8, 1, 1, 2, 9, 5, 6, 10, 3, 6, 9, 8, 10, 8, 1, 6, 2, 2, 7, 2, 6, 6, 10, 3, 5, 5, 6, 10, 3, 8, 5, 3, 2, 10, 4, 8, 2, 8, 5, 2, 3, 2, 5, 10, 10, 6, 5, 4, 4, 4, 3, 7, 10, 3, 5, 8, 8, 7, 7, 5, 3, 6, 4, 3, 1, 2, 7, 2, 8, 8, 6, 2, 8, 9, 4, 9, 6, 9, 6, 2, 10, 6, 2, 9, 3, 6, 2, 1, 8, 9, 4, 9, 3, 4, 2, 6, 10, 10, 3, 3, 8, 9, 8, 5, 10, 9, 7, 4, 8, 10, 10, 10, 6, 10, 8, 6, 10, 8, 7, 4, 3, 7, 10, 1, 2, 3, 8, 8, 2, 10, 4, 8, 3, 10, 2, 3, 2, 2, 8, 10, 9, 4, 7, 8, 4, 3, 10, 4, 1, 4, 7, 9, 8, 10, 7, 8, 8, 9, 10, 9, 4, 4, 7, 1, 1, 3, 7, 9, 2, 9, 6, 6, 1, 3, 2, 2, 4, 4, 7, 1, 1, 6, 6, 2, 3, 9, 4, 1, 1, 7, 4, 6, 9, 8, 6, 7, 2, 9, 4, 6, 4, 3, 1, 2, 8, 5, 3, 6, 8, 8, 5, 7, 1, 4, 9, 1, 7, 7, 5, 1, 8, 5, 5, 7, 8, 9, 5, 3, 8, 8, 5, 7, 4, 6, 1, 8, 1, 3, 9, 2,

Unnamed: 0,instance_id,repetition,fold
0,mip1,1,2
1,mip10,1,3
2,mip100,1,9
3,mip1000,1,4
4,mip10000,1,5


In [83]:
with open("MIPVerifySDP_SCIPfeatureset/cv.arff","a") as file:
    for index,row in folds_df.iterrows():
        file.write("\n")
        newrow = str(list(row)).strip("[]").replace("'", "")
        file.write(newrow)

    file.close()

## Check RE

In [1]:
import re

features_df.columns

NameError: name 'features_df' is not defined