In [211]:
import pandas as pd

df = pd.read_parquet('./data/')
df.drop('stabilized_amount_max', axis=1, inplace=True)

df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
df.loc[df['wave_direction'] == 'max', 'wave_direction'] = -1
df = df[df['stabilized_spread'] >= 0.2]
df = df[df['stabilized_spread'] < 4]
df

Unnamed: 0_level_0,stabilized_at_ms,stabilized_nr_trades,stabilized_amount_mean,stabilized_spread,wave_direction,last_price_delta_since_stabilized
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,484,20,0.0297,1.01,1,0.25
0,338,40,0.0413,1.70,-1,-0.03
0,13,33,0.0163,0.55,-1,-8.88
0,104,14,0.0572,1.00,-1,0.00
0,329,18,0.0118,0.74,1,0.35
...,...,...,...,...,...,...
0,94,46,0.0507,2.36,1,1.68
0,19,8,0.0381,0.86,1,0.00
0,144,11,0.0463,0.54,1,0.50
0,250,10,0.0488,1.03,1,0.00


In [206]:
df['last_price_delta_since_stabilized'].max()

14.66

In [212]:
from sklearn.model_selection import train_test_split

y = df['last_price_delta_since_stabilized']
X = df.drop('last_price_delta_since_stabilized', axis=1)
X = X.reset_index(drop=True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X

Unnamed: 0,stabilized_at_ms,stabilized_nr_trades,stabilized_amount_mean,stabilized_spread,wave_direction
0,484,20,0.0297,1.01,1
1,338,40,0.0413,1.70,-1
2,13,33,0.0163,0.55,-1
3,104,14,0.0572,1.00,-1
4,329,18,0.0118,0.74,1
...,...,...,...,...,...
11752,94,46,0.0507,2.36,1
11753,19,8,0.0381,0.86,1
11754,144,11,0.0463,0.54,1
11755,250,10,0.0488,1.03,1


In [213]:
from datetime import datetime
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=5)
model.fit(X_train, y_train)

preds = model.predict(X_valid)
preds
y_valid.reset_index(drop=True)

mae = round(mean_absolute_error(y_valid, preds), 5)
print(f'mean absolute error: {mae}')

mean absolute error: 0.68057


In [189]:

rec = len(X_train)
filename = f'models/{datetime.utcnow().replace(microsecond=0).isoformat()}-rec-{rec}-mae-{mae}.sav'
pickle.dump(model, open(filename, 'wb'))
print(f'saved model to {filename}')

mean absolute error: 0.69432
saved model to models/2023-01-19T18:39:36-rec-7323-mae-0.69432.sav


In [136]:
model = pickle.load(open(filename, 'rb'))
wave_frame = [{'stabilized_at_ms': 14, 'stabilized_nr_trades': 13, 'stabilized_amount_mean': 0.0344, 'stabilized_spread': 0.19, 'wave_direction': 'min', 'last_price_delta_since_stabilized': 1.26}]
df = pd.DataFrame(wave_frame)
df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
df.loc[df['wave_direction'] == 'max'] = -1
df.drop('last_price_delta_since_stabilized', axis=1, inplace=True)
result = model.predict(df)
print(result)


[0.81305]


### pipeline approach

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split


class FeatureEngineer(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None):
        df = X.copy()
        df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
        df.loc[df['wave_direction'] == 'max'] = -1
        df.drop('last_price_delta_since_stabilized', axis=1, inplace=True)
        return df


pipeline = Pipeline(steps=[
    ('preprocessor', FeatureEngineer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

df = pd.read_parquet('./data')
y = df['last_price_delta_since_stabilized']
X = df.drop('last_price_delta_since_stabilized', axis=1)
X = X.reset_index(drop=True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.2, test_size=0.2, random_state=0)
pipeline.fit(X_train, y_train)



AttributeError: 'FeatureEngineer' object has no attribute 'fit'