In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        print(f'fitting on {len(X)} records')
        return self

    def transform(self, X, y=None):
        df = X.copy()
        df.loc[df['wave_direction'] == 'min', 'wave_direction'] = 1
        df.loc[df['wave_direction'] == 'max'] = -1
        df = df.astype({"wave_direction": 'float64'})
        return df


pipeline = Pipeline(steps=[
    ('preprocessor', FeatureEngineer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

df = pd.read_parquet('./data')
df = df[abs(df['last_price_delta_since_stabilized']) < 10]
df = df[abs(df['last_price_delta_since_stabilized']) > 0.2]

if '0_price' in df.columns:
    df.drop('0_price', axis=1, inplace=True)
    df.drop('1_price', axis=1, inplace=True)
    df.drop('2_price', axis=1, inplace=True)
    df.drop('3_price', axis=1, inplace=True)
    df.drop('4_price', axis=1, inplace=True)

# df = df[df['stabilized_spread'] < 10] # nem tul nagy spread
# df = df[df['stabilized_amount_mean'] < 0.4]  # nem tul nagy tradek
# df = df[df['stabilized_nr_trades'] < 300]  # nem tul sok trade

# for col in df.columns:
#     if 'nr_trades' in col or 'price_delta' in col and not col.startswith('last'):
#         df.drop(col, axis=1, inplace=True)
# df.drop('wave_direction', axis=1, inplace=True)

df.sort_index(axis=1, inplace=True)

y = df['last_price_delta_since_stabilized']
X = df.drop('last_price_delta_since_stabilized', axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
pipeline.fit(X_train, y_train)
display(pipeline.score(X_valid, y_valid))


fitting on 11661 records


0.5584306348406086

In [None]:
sample = pd.DataFrame([
    {'0_amount_mean': 0.094411, '0_nr_trades': 75.0, '0_price_delta': 0.61, '0_spread': 1.44, '1_amount_mean': 0.093298, '1_nr_trades': 76.0, '1_price_delta': 0.61, '1_spread': 1.44, '2_amount_mean': 0.092106, '2_nr_trades': 77.0,
     '2_price_delta': 0.61, '2_spread': 1.44, '3_amount_mean': 0.084998, '3_nr_trades': 85.0, '3_price_delta': 0.11, '3_spread': 1.94, '4_amount_mean': 0.084199, '4_nr_trades': 86.0, '4_price_delta': 0.04, '4_spread': 2.01,
     'last_price_delta_since_stabilized': 1.97, 'stabilized_amount_mean': 0.08325, 'stabilized_at_ms': 123, 'stabilized_nr_trades': 87.0, 'stabilized_spread': 2.05, 'wave_direction': 'min'}
])
sample.drop('last_price_delta_since_stabilized', axis=1, inplace=True)
# sample.drop('wave_direction', axis=1, inplace=True)
pipeline.predict(sample)
