In [1]:
# pip install frozendict 
# pip install pyarrow
# pip install -e ./streamline_package
# pip install ray[default]

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from streamline.delayed import delay_lib, step, Delayed, eval_delay
from streamline import RunEnv, load_runenv, Pipeline, Function, Var
from streamline.utilities import mainify

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator

import statsmodels.api as sm
import statsmodels

from typing import *


def gen_train_pipe(
        pipe: Optional[Pipeline]=None,

        var_y: Optional[str]='y',
        var_X: Optional[str]='X',

        tags: Optional[Set]=None,
        prefix: Optional[str]='',

        model: Optional=sm.GLM,
        kw_def: Optional[Dict]=None,
        kw_fit: Optional[Dict]=None,
    ):
    """Append a "define model" step and a "fit model" step to a pipeline.

    Two ``Function`` steps are added to ``pipe``:

    1. ``model`` itself is wrapped as a step whose output variable is
       ``f'{prefix}model'`` and whose ``arg_cat`` is ``f'{prefix}model_def'``.
    2. A step that calls ``.fit(...)`` on that output, with output variable
       ``f'{prefix}result'`` and ``arg_cat`` ``f'{prefix}model_fit'``.

    Where the data variables are wired in depends on the model family:

    * subclasses of ``statsmodels.base.model.Model`` receive ``(y, X)`` in the
      definition step and nothing extra in the fit step;
    * subclasses of ``sklearn.base.BaseEstimator`` receive nothing in the
      definition step and ``(X, y)`` in the fit step.

    Args:
        pipe: Pipeline to extend. A new ``Pipeline()`` is created when None.
        var_y: Name of the pipeline variable holding the target.
        var_X: Name of the pipeline variable holding the features.
        tags: Extra tags unioned into the tags of both steps. Not mutated.
        prefix: String prepended to the output variable names and ``arg_cat``
            values. None is treated as the empty string.
        model: Model class to wrap. None falls back to ``sm.GLM``, the
            declared default.
        kw_def: Extra keyword arguments forwarded to the ``Function`` of the
            definition step (presumably bound to the model constructor --
            confirm against streamline). Must not contain ``args``,
            ``out_var``, ``tags`` or ``arg_cat``; those would collide with the
            keywords passed explicitly and raise ``TypeError``.
        kw_fit: Same as ``kw_def`` but for the fit step.

    Returns:
        The pipeline that was passed in (or the newly created one), with the
        two steps appended.

    Raises:
        NotImplementedError: ``model`` is a class that is neither a
            statsmodels ``Model`` nor a scikit-learn ``BaseEstimator``.
        TypeError: ``model`` is not a class (raised by ``issubclass``).
    """
    # Normalise every optional argument up front so the rest of the body can
    # assume concrete values.
    if pipe is None:
        pipe = Pipeline()
    if tags is None:
        tags = set()
    if prefix is None:
        # Without this, the f-strings below would produce names like 'Nonemodel'.
        prefix = ''
    if model is None:
        # The annotation allows None; fall back to the documented default
        # instead of failing inside issubclass().
        model = sm.GLM
    if kw_def is None:
        kw_def = dict()
    if kw_fit is None:
        kw_fit = dict()

    if issubclass(model, statsmodels.base.model.Model):
        # statsmodels binds the data at construction time: Model(endog, exog).
        in_vars_def = [Var(var_y), Var(var_X)]
        in_vars_fit = []
    elif issubclass(model, BaseEstimator):
        # scikit-learn binds the data at fit time: estimator.fit(X, y).
        in_vars_def = []
        in_vars_fit = [Var(var_X), Var(var_y)]
    else:
        raise NotImplementedError(
            f'Unsupported model class {model!r}: expected a subclass of '
            'statsmodels.base.model.Model or sklearn.base.BaseEstimator'
        )

    # Step 1: instantiate the model.
    # NOTE: the 'glm' tag is attached regardless of which model class is used.
    pipe.add_step(
        Function(
            model,
            args=in_vars_def,
            out_var=f'{prefix}model',
            **kw_def,
            tags={'glm', 'train_only', 'model_def'}|tags,
            arg_cat=f'{prefix}model_def',
        ),
    )
    # Step 2: fit the instantiated model. The first positional argument is the
    # model produced by step 1; any remaining args/kwargs go straight to .fit().
    pipe.add_step(
        Function(
            lambda model, *a, **kw: model.fit(*a, **kw),
            args=[Var(f'{prefix}model')]+in_vars_fit,
            out_var=f'{prefix}result',
            **kw_fit,
            tags={'glm', 'train_only', 'model_fit'}|tags,
            arg_cat=f'{prefix}model_fit',
        ),
    )
    return pipe

# Build the two-step (define + fit) training pipeline around scikit-learn's
# LinearRegression; all other arguments keep their defaults ('X', 'y', no prefix).
pipe = gen_train_pipe(model=LinearRegression)

In [5]:
from sklearn.datasets import load_diabetes

# return_X_y=True makes load_diabetes return the (features, target) pair
# directly instead of a Bunch object.
X, y = load_diabetes(return_X_y=True)

In [6]:
# Run the pipeline. The first dict supplies the input variables under the
# names the steps reference ('X' and 'y' are the gen_train_pipe defaults).
res_env = pipe.run({
    'X': X,
    'y': y,
}, kw={
    # No per-step keyword overrides are passed in this run.
    # NOTE(review): the disabled example below looks like '<arg_cat>.<param>'
    # addressing (arg_cat 'model_fit', parameter 'cov_type') -- confirm
    # against streamline's Pipeline.run.
    # 'model_fit.cov_type': 'HC1'
})

In [7]:
# statsmodels models take the response (endog) first and the design matrix
# (exog) second, i.e. OLS(y, X) -- the opposite of scikit-learn's fit(X, y).
# NOTE(review): OLS has no documented `offset` parameter; it is kept here as in
# the original cell -- confirm whether passing it is intentional.
sm.OLS(y, X, offset=None)

<statsmodels.regression.linear_model.OLS at 0x1675cfd14d0>