In [1]:
%load_ext autoreload
%autoreload 2
from potok.core import Data, DataUnit, DataLayer, ApplyToDataUnit, Node, Layer, Pipeline
from potok.tabular import TabularData, Folder, Validation, LightGBM

In [2]:
import numpy as np
import pandas as pd


In [3]:
import ray
# Start a local Ray runtime limited to 8 logical CPUs.
# ignore_reinit_error=True keeps this cell re-runnable: without it, executing
# the cell a second time in the same kernel raises because Ray is already up.
ray.init(num_cpus=8, ignore_reinit_error=True)

2021-05-04 18:28:28,353	INFO services.py:1262 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.103',
 'raylet_ip_address': '192.168.0.103',
 'redis_address': '192.168.0.103:6379',
 'object_store_address': '/tmp/ray/session_2021-05-04_18-28-27_292736_518076/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-04_18-28-27_292736_518076/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-04_18-28-27_292736_518076',
 'metrics_export_port': 59599,
 'node_id': '9405a46e3ddacf4322a414bb2958f070f218c4e34d7d38f58e91e2dc'}

In [4]:
import scipy.stats as sst

class SyntheticData:
    """
    Generator of synthetic single-feature regression tables.

    Feature values ``X`` are drawn from ``pdf_train`` (train set) or
    ``pdf_test`` (test set); the target column is ``target_f(X)``,
    optionally perturbed by zero-mean Gaussian noise.

    Parameters:
    pdf_train, pdf_test -- objects exposing ``rvs(size=..., random_state=...)``,
                           e.g. frozen ``scipy.stats`` distributions
    target_f            -- callable applied to the sampled ``X`` array
    seed                -- None (use NumPy's global RNG), a value accepted by
                           ``np.random.RandomState`` (e.g. an int), or an
                           already constructed RandomState/Generator.
                           It drives both the feature draw and the noise draw.

    Note: a non-generator seed (e.g. an int) is re-applied on every sampling
    call, so repeated calls with the same arguments return identical draws,
    and train and test are built from the same underlying random stream.

    Example:
    import scipy.stats as sst
    gene = SyntheticData(sst.norm(loc=0, scale=2), sst.norm(loc=1, scale=3), np.square)
    gene.create_train()
    gene.create_test()
    """
    def __init__(self, pdf_train, pdf_test, target_f, seed=None):
        self.pdf_train = pdf_train
        self.pdf_test = pdf_test
        self.target_f = target_f
        self.seed = seed

    def _resolve_rng_(self):
        """Return the random source for one sampling call (None -> global NumPy RNG)."""
        seed = self.seed
        if seed is None:
            return None
        # RandomState / Generator instances are used as they are.
        if hasattr(seed, 'normal'):
            return seed
        return np.random.RandomState(seed)

    def _create_sample_(self, pdf, size, noize_sigma=None):
        """
        Draw ``size`` feature values from ``pdf`` and compute their targets.

        pdf         -- distribution to sample ``X`` from
        size        -- number of rows (int)
        noize_sigma -- standard deviation of the Gaussian noise added to the
                       target; None disables noise

        Returns a DataFrame with columns 'X' and 'Target' indexed 0..size-1.
        """
        rng = self._resolve_rng_()
        X = pdf.rvs(size=size, random_state=rng)
        y = self.target_f(X)

        if noize_sigma is not None:
            # Draw the noise from the same source as X so that `seed`
            # makes the whole sample reproducible, not just the features.
            draw_normal = np.random.normal if rng is None else rng.normal
            # Out-of-place addition: `target_f` may hand back X itself (or a
            # view of it, or an integer array); `+=` would then corrupt X or fail.
            y = y + draw_normal(0, noize_sigma, size)

        # RangeIndex yields the same 0..size-1 labels without materialising a list.
        return pd.DataFrame({'X': X, 'Target': y}, index=pd.RangeIndex(size))

    def create_train(self, size=1000000, noize_sigma=0.5):
        """Sample the training set from ``pdf_train`` (noisy target by default)."""
        df_train = self._create_sample_(self.pdf_train, size, noize_sigma)
        return TabularData(df_train, target=['Target'])

    def create_test(self, size=100000, noize_sigma=None):
        """Sample the test set from ``pdf_test`` (noise-free target by default)."""
        df_test = self._create_sample_(self.pdf_test, size, noize_sigma)
        return TabularData(df_test, target=['Target'])

In [5]:
# Train features come from a normal with loc=0, scale=2 and test features from
# a normal with loc=1, scale=3; the target is the square of the feature.
# No seed is passed, so the samples differ between runs unless NumPy's global
# RNG has been seeded elsewhere.
gene = SyntheticData(
    pdf_train=sst.norm(loc=0, scale=2),
    pdf_test=sst.norm(loc=1, scale=3),
    target_f=np.square,
)

# Defaults: 1,000,000 noisy train rows and 100,000 noise-free test rows.
train = gene.create_train()
test = gene.create_test()

In [6]:
# Bundle the train and test tables into one DataUnit.
data = DataUnit(train=train, test=test)

In [7]:
# Folder configured with 4 folds (presumably a K-fold splitter; the training log
# below reports 750000 train / 250000 valid rows per model, consistent with that).
f = Folder(n_folds=4)

In [8]:
# Validation step built on the folder above; first stage of the pipeline.
valid = Validation(f)

In [9]:
# LightGBM step configured with target column 'Target' and feature column 'X',
# the two columns produced by SyntheticData.
algo = LightGBM(target='Target', features=['X'])

In [10]:
# Chain the validation step and the LightGBM step into a single pipeline.
model = Pipeline(valid, algo)

In [11]:
# Wrap the X and Y views of the DataUnit in DataLayers; these are the two inputs
# handed to model.fit_predict (presumably features and targets respectively).
x = DataLayer(data.X)
y = DataLayer(data.Y)

In [None]:
%%prun
# Run one full fit + predict pass of the pipeline under the IPython profiler.
yy = model.fit_predict(x, y)

[2m[36m(pid=518260)[0m Training Model LightGBM
[2m[36m(pid=518260)[0m X_train = (750000, 1) y_train = (750000,)
[2m[36m(pid=518260)[0m X_valid = (250000, 1) y_valid = (250000,)
[2m[36m(pid=518260)[0m Training until validation scores don't improve for 50 rounds
[2m[36m(pid=518256)[0m Training Model LightGBM
[2m[36m(pid=518256)[0m X_train = (750000, 1) y_train = (750000,)
[2m[36m(pid=518256)[0m X_valid = (250000, 1) y_valid = (250000,)
[2m[36m(pid=518261)[0m Training Model LightGBM
[2m[36m(pid=518261)[0m X_train = (750000, 1) y_train = (750000,)
[2m[36m(pid=518261)[0m X_valid = (250000, 1) y_valid = (250000,)
[2m[36m(pid=518262)[0m Training Model LightGBM
[2m[36m(pid=518262)[0m X_train = (750000, 1) y_train = (750000,)
[2m[36m(pid=518262)[0m X_valid = (250000, 1) y_valid = (250000,)
[2m[36m(pid=518261)[0m Training until validation scores don't improve for 50 rounds
[2m[36m(pid=518256)[0m Training until validation scores don't improve for 50 ro

In [None]:
77.522 seconds