# data
> Utilities for processing input data

In [None]:
#| default_exp data

In [None]:
#| hide
from fastcore.all import *
from nbdev import show_doc
from typing import Dict, List

In [None]:
#| export
import pandas as pd
import numpy as np
import sklearn.datasets as ds
import sklearn.model_selection as ms
import sklearn.ensemble as en
from sklearn import linear_model

from pathlib import Path
from fastcore.all import *

# Loading data from CSV

In [None]:
#| export
def load_market_data_file(market_data_file: Path # gzip-compressed csv file to load data from
    ) -> pd.DataFrame: # data frame with transactions, indexed by `date`
    """Loads a single gzip-compressed market data file.

    The `local_timestamp` column is interpreted as microseconds since the Unix
    epoch, converted to datetimes and used as the `date` index. The original
    `local_timestamp` column is kept in the frame.
    """
    raw = pd.read_csv(market_data_file, compression="gzip")
    as_datetime = pd.to_datetime(raw["local_timestamp"], unit="us")
    # Attach the converted timestamps as `date`, then promote that column to the index.
    return raw.assign(date=as_datetime).set_index("date")
    


In [None]:
#| export
def get_symbols(market_data_path: Path # directory with market data
               ) -> List[str]: # Sorted list of symbols that have market data available
    """Returns a list of all symbols available at given market data dump directory.

    Only regular files named `<prefix>.<symbol>.csv.gz` are considered, which is
    the same naming pattern `load_all_market_data_files_for_symbol` globs for.
    Anything else in the directory (sub-directories, READMEs, other file types)
    is ignored instead of raising or being reported as a symbol. The result is
    de-duplicated and sorted, so it is the same on every call.
    """
    symbols = set()
    for file in Path(market_data_path).glob("*.*.csv.gz"):
        if not file.is_file():
            continue
        # The glob guarantees at least three dots, so component 1 always exists.
        symbol = file.name.split('.')[1]
        if symbol:  # skip degenerate names such as "20230101..csv.gz"
            symbols.add(symbol)
    return sorted(symbols)

In [None]:
#| export
def load_all_market_data_files_for_symbol(market_data_path: Path, # directory with market data
                symbol: str # name of the symbol
               ) -> pd.DataFrame : # dataframe with transactions, sorted by index
    """Loads all data for the given symbol.

    Every file in `market_data_path` matching `*.{symbol}.csv.gz` is loaded with
    `load_market_data_file`, the frames are concatenated once, and the result is
    sorted by its index. Files are processed in sorted path order and the sort is
    stable, so rows that share a timestamp keep a deterministic relative order.

    Raises `FileNotFoundError` when no file matches the symbol.
    """
    files = sorted(Path(market_data_path).glob(f"*.{symbol}.csv.gz"))
    if not files:
        raise FileNotFoundError(
            f"no market data files matching '*.{symbol}.csv.gz' in {market_data_path}")
    frames = []
    for file in files:
        print(f'processing file {file}')
        frames.append(load_market_data_file(file))
    # Concatenate once rather than on every iteration, which re-copies all rows each time.
    return pd.concat(frames).sort_index(kind="mergesort")
    

In [None]:
#| export
def make_sequential_stream(data_frames: List[pd.DataFrame] # frames with transactions
                          ) -> pd.DataFrame: # sequential stream
    """Merges a collection of data frames into one frame ordered by timestamp (index).

    Pass the frames themselves, e.g. `loaded.values()` for a dict of frames keyed
    by symbol. The frames are concatenated and sorted by index with a stable
    sort, so rows that share a timestamp keep their order from the input.

    Raises `ValueError` (from `pd.concat`) when `data_frames` is empty.
    """
    merged = pd.concat(data_frames)
    # mergesort is stable; the default quicksort may reorder rows with equal timestamps.
    return merged.sort_index(kind="mergesort")


## Examples of how data is processed

In [None]:
# Directory holding the market data dump, relative to this notebook.
market_data_dir = Path('../marketdata')
symbols = get_symbols(market_data_dir)
symbols

['MATIC', 'OP', 'XRP']

In [None]:
# Load every available file for each symbol, keyed by symbol name.
loaded_data = {
    sym: load_all_market_data_files_for_symbol(market_data_dir, sym)
    for sym in symbols
}

processing file ../marketdata/20230501.MATIC.csv.gz
processing file ../marketdata/20231001.MATIC.csv.gz
processing file ../marketdata/20230201.MATIC.csv.gz
processing file ../marketdata/20230301.MATIC.csv.gz
processing file ../marketdata/20231101.MATIC.csv.gz
processing file ../marketdata/20230401.MATIC.csv.gz
processing file ../marketdata/20230601.MATIC.csv.gz
processing file ../marketdata/20230101.MATIC.csv.gz
processing file ../marketdata/20230901.MATIC.csv.gz
processing file ../marketdata/20230801.MATIC.csv.gz
processing file ../marketdata/20230701.MATIC.csv.gz
processing file ../marketdata/20231201.MATIC.csv.gz
processing file ../marketdata/20230901.OP.csv.gz
processing file ../marketdata/20231001.OP.csv.gz
processing file ../marketdata/20230501.OP.csv.gz
processing file ../marketdata/20230601.OP.csv.gz
processing file ../marketdata/20230301.OP.csv.gz
processing file ../marketdata/20230401.OP.csv.gz
processing file ../marketdata/20231101.OP.csv.gz
processing file ../marketdata/202

In [None]:
# Preview the first rows of the frame loaded for MATIC.
loaded_data['MATIC'].head()

Unnamed: 0_level_0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-01 00:00:02.789350,bybit,MATICUSDT,1672531202718000,1672531202789350,879e0dcd-cee1-5d5d-b031-2c8f18f14d75,buy,0.758,2752
2023-01-01 00:00:06.276571,bybit,MATICUSDT,1672531206141000,1672531206276571,cf0bdde5-10ad-5cd1-acf4-8ced48b3b5fd,buy,0.758,1790
2023-01-01 00:00:07.272223,bybit,MATICUSDT,1672531207221000,1672531207272223,814aa74c-3209-501f-9b49-16ae0a3fc930,sell,0.7579,673
2023-01-01 00:00:07.872568,bybit,MATICUSDT,1672531207791000,1672531207872568,422980a6-3f9e-5f3b-a514-4c48a14d30c1,buy,0.758,21
2023-01-01 00:00:11.472638,bybit,MATICUSDT,1672531211418000,1672531211472638,b27b754d-d32d-533d-9d54-a5ab81559f29,sell,0.7579,279


In [None]:
# Combine the per-symbol frames into a single frame sorted by its timestamp index.
d = make_sequential_stream(loaded_data.values())

Here we have merged all individual symbol data into a single data frame.

The data frame can be used as a "market feed" to the market engine.

In [None]:
# Show the first ten rows of the merged stream.
d.head(10)

Unnamed: 0_level_0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-01 00:00:01.916181,bybit,XRPUSDT,1672531201785000,1672531201916181,fc9b084c-f895-502d-b69d-d0912173d089,buy,0.3391,1.0
2023-01-01 00:00:01.916181,bybit,XRPUSDT,1672531201785000,1672531201916181,f2ee430c-9576-5bcf-aac6-5243292797ac,buy,0.3391,2.0
2023-01-01 00:00:01.916181,bybit,XRPUSDT,1672531201785000,1672531201916181,869f353f-af7d-51f5-ae85-4c006f5322b7,buy,0.3391,1458.0
2023-01-01 00:00:02.215099,bybit,XRPUSDT,1672531202078000,1672531202215099,9da1a13d-de98-570c-a070-ff97a3549058,sell,0.339,1.0
2023-01-01 00:00:02.315516,bybit,XRPUSDT,1672531202240000,1672531202315516,74cc44d2-8625-5530-ba3e-8c1f823c98db,sell,0.339,80.0
2023-01-01 00:00:02.618189,bybit,XRPUSDT,1672531202534000,1672531202618189,0987dc83-a13c-514c-8877-123a2441c2f4,sell,0.339,70.0
2023-01-01 00:00:02.789350,bybit,MATICUSDT,1672531202718000,1672531202789350,879e0dcd-cee1-5d5d-b031-2c8f18f14d75,buy,0.758,2752.0
2023-01-01 00:00:03.417761,bybit,XRPUSDT,1672531203361000,1672531203417761,2383e7c1-c42f-5193-b8b7-a9e53cd197f6,sell,0.339,132.0
2023-01-01 00:00:03.515103,bybit,XRPUSDT,1672531203456000,1672531203515103,6dff80b1-3ba6-5d44-82fa-9be768300661,sell,0.339,110.0
2023-01-01 00:00:04.114629,bybit,XRPUSDT,1672531204030000,1672531204114629,22f32e05-fa32-5dd5-bc55-1fd0ac4a284b,sell,0.339,1179.0


In [None]:
# Total number of rows in the merged stream.
len(d)

6548374

We have about 6.5 million data points

In [None]:
# Symbol column of the merged stream, one entry per row.
d['symbol']

date
2023-01-01 00:00:01.916181    XRPUSDT
2023-01-01 00:00:01.916181    XRPUSDT
2023-01-01 00:00:01.916181    XRPUSDT
2023-01-01 00:00:02.215099    XRPUSDT
2023-01-01 00:00:02.315516    XRPUSDT
                               ...   
2023-12-01 23:59:59.538142    XRPUSDT
2023-12-01 23:59:59.538142    XRPUSDT
2023-12-01 23:59:59.538142    XRPUSDT
2023-12-01 23:59:59.538142    XRPUSDT
2023-12-01 23:59:59.565317    XRPUSDT
Name: symbol, Length: 6548374, dtype: object

... across 3 symbols.

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()