In [1]:
from gbd_tool.gbd_api import GBD
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train a random forest that predicts an instance's "family" label from the
# features listed for the base database, then print the hold-out accuracy.

# Fixed seed for both the train/test split and the forest, so that a fresh
# "Restart & Run All" prints the same accuracy every time.
SEED = 42

# Named data_dir (not dir) so the builtin dir() stays usable in the kernel.
data_dir = "/home/iser/git/gbd-data/"
dbs = ["meta.db", "base.db"]
with GBD([data_dir + db for db in dbs]) as gbd:
    # Feature names of the base database plus the label column to resolve.
    features = gbd.get_features(dbname="base_db") + ["family"]
    # The query string restricts the result to instances whose family is not
    # "unknown"; the result is used as a pandas DataFrame below.
    df = gbd.query_search2("family != unknown", resolve=features)
    # "hash" must not be fed to the model as a feature (presumably it is the
    # instance identifier -- confirm). A non-inplace drop yields a new frame,
    # so the object returned by the query is never mutated.
    df = df.drop(columns=["hash"])
    # Encode the family names as integer class codes.
    y = df.pop("family").astype("category").cat.codes.to_numpy()
    # Everything left is treated as a numeric feature; nan_to_num maps NaN to
    # 0.0 and +/-inf to the largest finite float32 values.
    x = np.nan_to_num(df.to_numpy().astype(np.float32))
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=SEED
    )
    model = ensemble.RandomForestClassifier(random_state=SEED)
    model.fit(xtrain, ytrain)
    acc = accuracy_score(ytest, model.predict(xtest))
    print("Accuracy: {}".format(acc))
    

Accuracy: 0.9723829072588169


In [14]:
from gbd_tool.gbd_api import GBD
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train a random forest that predicts, per instance, which of two solvers has
# the smallest recorded runtime, then print the hold-out accuracy.

# Fixed seed for both the train/test split and the forest, so that a fresh
# "Restart & Run All" prints the same accuracy every time.
SEED = 42

# Named data_dir (not dir) so the builtin dir() stays usable in the kernel.
data_dir = "/home/iser/git/gbd-data/"
dbs = ["meta.db", "base.db", "gate.db", "sc2020.db"]
with GBD([data_dir + db for db in dbs]) as gbd:
    solvers = ["kissat_unsat", "relaxed_newtech"]
    features = gbd.get_features(dbname="base_db") + gbd.get_features(dbname="gate_db")
    # The replace list is passed through to GBD; presumably it substitutes the
    # listed placeholder strings with inf / NaN in the result -- confirm
    # against the gbd_tool API.
    df = gbd.query_search2(
        "track = main_2020",
        resolve=features + solvers,
        replace=[("timeout", np.inf), ("memout", np.inf), ("empty", np.nan), ("failed", np.inf)],
    ).copy()

    # Create the label column: the name of the solver with the minimal runtime.
    # The original code assigned to the row yielded by df.iterrows(); pandas
    # documents that such a row may be a copy, in which case the assignment
    # never reaches df. Writing through df.loc always updates the frame.
    runtimes = df[solvers].astype(float)
    # Row-wise minimum that skips NaN, so an instance with one missing runtime
    # is labelled with the solver that does have one. Rows where every runtime
    # is NaN match no solver and keep the "empty" label.
    best = runtimes.min(axis=1)
    df["solver"] = "empty"
    for s in solvers:
        # On ties (including inf == inf when both solvers timed out) the later
        # solver in the list overwrites the earlier one, as in the original loop.
        df.loc[runtimes[s] == best, "solver"] = s
    # Runtimes would leak the label, and "hash" is not a feature.
    df = df.drop(columns=solvers + ["hash"])

    # Encode the solver names (and "empty", if present) as integer class codes.
    y = df.pop("solver").astype("category").cat.codes.to_numpy()
    # Missing feature values become -1; +/-inf become the largest finite
    # float32 values.
    x = np.nan_to_num(df.to_numpy().astype(np.float32), nan=-1)
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=SEED
    )
    model = ensemble.RandomForestClassifier(random_state=SEED)
    model.fit(xtrain, ytrain)
    acc = accuracy_score(ytest, model.predict(xtest))
    print("Accuracy: {}".format(acc))
    
        

Accuracy: 0.775


In [3]:
from gbd_tool.gbd_api import GBD
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train a random forest that predicts an instance's "result" label from the
# features listed for the base and gate databases, then print the hold-out
# accuracy.

# Fixed seed for both the train/test split and the forest, so that a fresh
# "Restart & Run All" prints the same accuracy every time.
SEED = 42

# Named data_dir (not dir) so the builtin dir() stays usable in the kernel.
data_dir = "/home/iser/git/gbd-data/"
dbs = ["meta.db", "base.db", "gate.db"]
with GBD([data_dir + db for db in dbs]) as gbd:
    # Feature names of both databases plus the label column to resolve.
    features = (
        gbd.get_features(dbname="base_db")
        + gbd.get_features(dbname="gate_db")
        + ["result"]
    )
    # The query string restricts the result to instances whose result is not
    # "unknown". The replace list is passed through to GBD; presumably it
    # substitutes the listed placeholder strings with inf / NaN -- confirm
    # against the gbd_tool API.
    df = gbd.query_search2(
        "result != unknown",
        resolve=features,
        replace=[("timeout", np.inf), ("memout", np.inf), ("empty", np.nan), ("failed", np.inf)],
    ).copy()
    # "hash" must not be fed to the model as a feature (presumably it is the
    # instance identifier -- confirm).
    df = df.drop(columns=["hash"])
    # Encode the result values as integer class codes.
    y = df.pop("result").astype("category").cat.codes.to_numpy()
    # Everything left is treated as a numeric feature; nan_to_num maps NaN to
    # 0.0 and +/-inf to the largest finite float32 values.
    x = np.nan_to_num(df.to_numpy().astype(np.float32))
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=SEED
    )
    model = ensemble.RandomForestClassifier(random_state=SEED)
    model.fit(xtrain, ytrain)
    acc = accuracy_score(ytest, model.predict(xtest))
    print("Accuracy: {}".format(acc))

Accuracy: 0.9099462365591398
