In [8]:
from sklearn.datasets import load_diabetes
df = load_diabetes(as_frame=True).data
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [12]:
list(df.to_numpy())

[array([ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613]),
 array([-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405]),
 array([ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034]),
 array([-0.08906294, -0.04464164, -0.01159501, -0.03665608,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02268774, -0.00936191]),
 array([ 0.00538306, -0.04464164, -0.03638469,  0.02187239,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03198764, -0.04664087]),
 array([-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]),
 array([-0.04547248,  0.05068012, -0.04716281, -0.01599898, -0.04009564,
        -0.02480001,  0.00077881, -0.03

In [19]:
df.columns.tolist()

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [None]:
from typing import List, Dict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

class ExprTree:
    def __init__(self):
        self.isLeaf = True

        # for leave node:
        self.col = None
        self.condition = lambda x: True

        # for non leaf node:
        self.children = []
        self.aggr = lambda x: any(x)

    def leaf(self, col, cond):
        self.isLeaf = True
        self.col = col
        self.condition = cond

    def non_leaf(self, children, aggr):
        self.isLeaf = False
        self.children = children
        self.aggr = aggr


    def eval(self, row: Dict[str, float]):
        if self.isLeaf:
            return self.condition(row[self.col])
        else:
            deps = [node.eval(row) for node in self.children]
            return self.aggr(deps)



class Table:
    def __init__(self, df):
        self.data = df.values.tolist()
        self.columns = df.columns.tolist()
        self.col_idx_map = {
            name: idx for idx, name in enumerate(self.columns)
        }

    def get_row_mask(self, node, num_workers = 5):
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = []
            for row in self.data:
                row_dict = {
                    col: row[i] for i, col in enumerate(self.columns)
                }
                futures.append(executor.submit(node.eval, row_dict))

            return [f.result() for f in futures]
    
    def to_df(self):
        return pd.DataFrame(data = self.data, columns=self.columns)


class Query:
    def __init__(self):
        self._select = []
        self._from = None
        self._where = None
        self._orderBy = None

    def From(self, tbl):
        self._from = tbl
        return self
    
    def select(self, cols):
        self._select = cols
        return self

    def where(self, rootNode):
        self._where = rootNode
        return self

    def orderBy(self, col):
        self._orderBy = col
        return self

    def run(self):
        if self._where is None:
            row_mask = None
        else:
            row_mask = self._from.get_row_mask(self._where)

        ret = []
        idx = [self._from.col_idx_map[col] for col in self._select]
        for i, row in enumerate(self._from.data):
            if row_mask is None or row_mask[i]:
                ret.append([row[j] for j in idx])

        if self._orderBy is not None:
            def get_key(row):
                for i in range(len(self._select)):
                    if self._select[i] == self._orderBy:
                        return row[i]
            ret.sort(key=get_key)
        

        return pd.DataFrame(data=ret, columns=self._select)



In [61]:
tbl = Table(df)
query = Query()
filter = ExprTree()
filter.leaf('s3', lambda x: x>0)
query.select(['age', 'sex', 's3']).From(tbl).where(filter).orderBy('sex')
query.run()

Unnamed: 0,age,sex,s3
0,-0.001882,-0.044642,0.074412
1,0.005383,-0.044642,0.008142
2,-0.092695,-0.044642,0.041277
3,0.016281,-0.044642,0.044958
4,0.045341,-0.044642,0.081775
...,...,...,...
194,-0.005515,0.050680,0.048640
195,-0.041840,0.050680,0.011824
196,-0.085430,0.050680,0.019187
197,-0.078165,0.050680,0.026550


In [58]:
ref = df[df['s3'] > 0]
ref = ref[['age', 'sex', 's3']]
ref.sort_values(by='sex')

Unnamed: 0,age,sex,s3
1,-0.001882,-0.044642,0.074412
237,0.056239,-0.044642,0.044958
239,0.023546,-0.044642,0.063367
245,-0.027310,-0.044642,0.030232
246,0.041708,-0.044642,0.056003
...,...,...,...
98,0.001751,0.050680,0.008142
354,-0.023677,0.050680,0.000779
356,-0.005515,0.050680,0.015505
72,0.063504,0.050680,0.056003
