In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
@dataclass
class PriceRow:
    day: int
    timestamp: int
    product: str
    bid_prices: list[int]
    bid_volumes: list[int]
    ask_prices: list[int]
    ask_volumes: list[int]
    mid_price: float
    profit_loss: float

@dataclass
class TradeRow:
    timestamp: int
    symbol: str
    currency: str
    price: float
    quantity: int

@dataclass
class DayData:
    prices: list[PriceRow]
    trades: list[TradeRow]

def get_column_values(columns: list[str], indices: list[str]) -> list[int]:
    values = []

    for index in indices:
        value = columns[index]
        if value != "":
            values.append(int(value))

    return values

def read_data(directory_name: str) -> list[DayData]:
    days = []

    directory = Path.cwd().parent.parent / "data" / directory_name
    files = sorted(list(directory.iterdir()))

    for prices_file in files[:len(files) // 2]:
        day = prices_file.stem.split("_")[-1]
        trades_file = next(file for file in files if file.name.endswith(f"_{day}_nn.csv"))

        prices = []
        price_lines = prices_file.read_text(encoding="utf-8").splitlines()
        for line in price_lines[1:]:
            columns = line.split(";")

            prices.append(PriceRow(
                day=int(columns[0]),
                timestamp=int(columns[1]),
                product=columns[2],
                bid_prices=get_column_values(columns, [3, 5, 7]),
                bid_volumes=get_column_values(columns, [4, 6, 8]),
                ask_prices=get_column_values(columns, [9, 11, 13]),
                ask_volumes=get_column_values(columns, [10, 12, 14]),
                mid_price=float(columns[15]),
                profit_loss=float(columns[16]),
            ))

        trades = []
        trade_lines = trades_file.read_text(encoding="utf-8").splitlines()
        for line in trade_lines[1:]:
            columns = line.split(";")

            trades.append(TradeRow(
                timestamp=int(columns[0]),
                symbol=columns[3],
                currency=columns[4],
                price=float(columns[5]),
                quantity=int(columns[6]),
            ))

        days.append(DayData(prices, trades))

    return days

data = read_data("round2")

In [3]:
def prepare_data(days: list[DayData]) -> pd.DataFrame:
    X_rows = []
    y_rows = []

    for day in days:
        day_rows = []
        price_history_by_product = defaultdict(list)

        for row in day.prices:
            if row.product != "PINA_COLADAS":
                continue

            price_history_by_product[row.product].append(row)

            if len(price_history_by_product[row.product]) <= 100:
                continue

            bid_ask_features = {}
            for prefix, arr in [
                ("bid_price", "bid_prices"),
                ("bid_volume", "bid_volumes"),
                ("ask_price", "ask_prices"),
                ("ask_volume", "ask_volumes"),
            ]:
                current_values = getattr(row, arr)

                for i in range(1):
                    current_value = current_values[i] if i < len(current_values) else 0
                    bid_ask_features[f"{prefix}_{i}"] = current_value

                    for j in [1, 2, 3, 4, 5, 10, 25, 50, 75, 100]:
                        history_values = getattr(price_history_by_product[row.product][-j - 1], arr)
                        previous_value = history_values[i] if i < len(history_values) else 0
                        bid_ask_features[f"{prefix}_{i}_-{j}"] = previous_value
                        bid_ask_features[f"{prefix}_{i}_-{j}_delta"] = current_value - previous_value
                        bid_ask_features[f"{prefix}_{i}_-{j}_delta_relative"] = (current_value - previous_value) / j

            mid_price_features = {"mid_price": row.mid_price}
            for i in [1, 2, 3, 4, 5, 10, 25, 50, 75, 100]:
                history_row = price_history_by_product[row.product][-j - 1]
                mid_price_features[f"mid_price_-{i}"] = history_row.mid_price
                mid_price_features[f"mid_price_-{i}_delta"] = row.mid_price - history_row.mid_price
                mid_price_features[f"mid_price_-{i}_delta_relative"] = (row.mid_price - history_row.mid_price) / i

            day_rows.append({
                **bid_ask_features,
                **mid_price_features,
            })

        future_days = 50
        for i in range(len(day_rows) - future_days):
            X_rows.append({
                **day_rows[i],
            })

            current_bid = day_rows[i]["bid_price_0"]
            current_ask = day_rows[i]["ask_price_0"]
            future_bid = day_rows[i + future_days]["bid_price_0"]
            future_ask = day_rows[i + future_days]["ask_price_0"]

            long_profit = future_bid - current_ask
            short_profit = future_ask - current_bid

            if long_profit > 5:
                y_rows.append(1)
            else:
                y_rows.append(0)

    return pd.DataFrame(X_rows), np.array(y_rows)

X, y = prepare_data(data)
X

Unnamed: 0,bid_price_0,bid_price_0_-1,bid_price_0_-1_delta,bid_price_0_-1_delta_relative,bid_price_0_-2,bid_price_0_-2_delta,bid_price_0_-2_delta_relative,bid_price_0_-3,bid_price_0_-3_delta,bid_price_0_-3_delta_relative,...,mid_price_-25_delta_relative,mid_price_-50,mid_price_-50_delta,mid_price_-50_delta_relative,mid_price_-75,mid_price_-75_delta,mid_price_-75_delta_relative,mid_price_-100,mid_price_-100_delta,mid_price_-100_delta_relative
0,15021,15020,1,1.0,15020,1,0.5,15019,2,0.666667,...,0.90,15000.0,22.5,0.45,15000.0,22.5,0.300000,15000.0,22.5,0.225
1,15023,15021,2,2.0,15020,3,1.5,15020,3,1.000000,...,0.98,15000.0,24.5,0.49,15000.0,24.5,0.326667,15000.0,24.5,0.245
2,15027,15023,4,4.0,15021,6,3.0,15020,7,2.333333,...,1.16,14999.5,29.0,0.58,14999.5,29.0,0.386667,14999.5,29.0,0.290
3,15027,15027,0,0.0,15023,4,2.0,15021,6,2.000000,...,1.14,15000.0,28.5,0.57,15000.0,28.5,0.380000,15000.0,28.5,0.285
4,15027,15027,0,0.0,15027,0,0.0,15023,4,1.333333,...,1.18,14999.0,29.5,0.59,14999.0,29.5,0.393333,14999.0,29.5,0.295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29545,14855,14857,-2,-2.0,14858,-3,-1.5,14856,-1,-0.333333,...,-0.34,14864.5,-8.5,-0.17,14864.5,-8.5,-0.113333,14864.5,-8.5,-0.085
29546,14853,14855,-2,-2.0,14857,-4,-2.0,14858,-5,-1.666667,...,-0.36,14863.0,-9.0,-0.18,14863.0,-9.0,-0.120000,14863.0,-9.0,-0.090
29547,14854,14853,1,1.0,14855,-1,-0.5,14857,-3,-1.000000,...,-0.28,14862.5,-7.0,-0.14,14862.5,-7.0,-0.093333,14862.5,-7.0,-0.070
29548,14857,14854,3,3.0,14853,4,2.0,14855,2,0.666667,...,-0.16,14862.0,-4.0,-0.08,14862.0,-4.0,-0.053333,14862.0,-4.0,-0.040


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88      5267
           1       0.83      0.45      0.58      2121

    accuracy                           0.82      7388
   macro avg       0.82      0.71      0.73      7388
weighted avg       0.82      0.82      0.80      7388

