In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
def convert_to_returns(data, keep_high_low=False, keep_volume=True, log_returns=False):
    """
    Convert close prices to period-over-period returns.

    args:
        data: pandas DataFrame with "close", "volume" and "timestamp" columns;
            "high" and "low" are also required when keep_high_low is True
        keep_high_low: bool, if True, the high and low prices are kept
        keep_volume: bool, if True, the volume column is kept
        log_returns: bool, if True, log returns log(close_t / close_{t-1}) are
            computed instead of simple returns close_t / close_{t-1} - 1
    returns:
        pandas DataFrame with a "returns" column, followed by "high" and "low"
        (if keep_high_low), "volume" (if keep_volume) and "timestamp".
        The first input row has no previous close and is dropped, as is any
        other row with a missing value in one of the columns read from data;
        the index of the result is reset to 0..n-1.
    """
    source_columns = ["close", "volume", "timestamp"]
    if keep_high_low:
        # high/low must travel through the intermediate frame so that they get
        # the same row filtering and re-indexing as the returns; reading them
        # from a frame that only holds close/volume/timestamp raises KeyError.
        source_columns += ["high", "low"]
    frame = pd.DataFrame({name: data[name] for name in source_columns})

    price_ratio = frame["close"] / frame["close"].shift(1)
    frame["returns"] = np.log(price_ratio) if log_returns else price_ratio - 1

    # shift(1) leaves the first row without a return; drop it and renumber.
    frame = frame.dropna().reset_index(drop=True)

    output_columns = ["returns"]
    if keep_high_low:
        output_columns += ["high", "low"]
    if keep_volume:
        output_columns.append("volume")
    output_columns.append("timestamp")

    return frame[output_columns].copy()

def convert_back_to_candlesticks(candlesticks, predicted_returns):
    """
    Convert predicted returns back to predicted close prices. This is used to backtest the model.

    args:
        candlesticks: pandas DataFrame with a "close" column
        predicted_returns: pandas DataFrame with a "returns_predicted_1" column
            and, optionally, a "returns_predicted_2" column
    returns:
        copy of candlesticks with one "close_predicted_{i}" column added for
        each "returns_predicted_{i}" column (i = 1, 2) found in
        predicted_returns; candlesticks itself is not modified
    raises:
        ValueError: if "returns_predicted_1" holds no valid value, or if its
            first valid value sits in the first row so that no earlier close
            exists to anchor the predictions
    """
    # Work on a copy so the caller's frame is left untouched
    result = candlesticks.copy()

    first_valid = predicted_returns['returns_predicted_1'].first_valid_index()
    if first_valid is None:
        raise ValueError("'returns_predicted_1' contains no valid predictions")

    # first_valid_index() returns an index label; it is used as a row position
    # in candlesticks, which coincides for a default 0..n-1 index.
    # NOTE(review): confirm this is intended for frames with any other index.
    anchor_position = first_valid - 1
    if anchor_position < 0:
        # A negative position would wrap around and silently pick a close from
        # the END of the data, leaking future prices into the backtest.
        raise ValueError(
            "predictions start at the first row; there is no earlier close price to anchor them"
        )

    # Last known close price before predictions start
    last_close = result.loc[result.index[anchor_position], 'close']

    for step in (1, 2):  # returns_predicted_1 and returns_predicted_2
        source_col = f'returns_predicted_{step}'
        if source_col in predicted_returns.columns:
            # Every predicted close is the anchor close scaled by that row's
            # own predicted return; returns are not compounded across rows.
            result[f'close_predicted_{step}'] = last_close * (1 + predicted_returns[source_col])

    return result

import pandas as pd
def agg_data(data, n_times):
    """
    Aggregate OHLCV data by duplicating n times and adjusting values accordingly.

    Each copy has its prices multiplied by a factor evenly spaced between
    0.999 and 1.001 (exactly 1.0 when n_times is 1) and its volume divided by
    n_times, so the copies of one row together carry the original volume.

    Args:
        data: DataFrame with "timestamp", "open", "high", "low", "close" and
            "volume" columns. Every column whose name contains "predicted"
            is scaled by the same price factor.
            NOTE(review): that also scales predicted *return* columns if any
            are present - confirm this is wanted.
        n_times: Number of times to duplicate the data (at least 1)

    Returns:
        DataFrame with len(data) * n_times rows sorted by timestamp; copies of
        the same timestamp appear in order of increasing price factor. The
        input frame is not modified.

    Raises:
        ValueError: if n_times is smaller than 1
    """
    if n_times < 1:
        raise ValueError("n_times must be at least 1")

    if n_times == 1:
        # np.linspace(-0.001, 0.001, 1) yields [-0.001], which would shift a
        # single copy 0.1% below the original instead of leaving it as is.
        variations = np.zeros(1)
    else:
        # Slight, symmetric variations around the original prices
        variations = np.linspace(-0.001, 0.001, n_times)

    scaled_columns = ['open', 'high', 'low', 'close']
    scaled_columns += [col for col in data.columns if 'predicted' in col]

    copies = []
    for variation in variations:
        factor = 1 + variation
        scaled = data.copy()

        for col in scaled_columns:
            scaled[col] = scaled[col] * factor

        # Divide volume by n_times to distribute it across the copies
        scaled['volume'] = scaled['volume'] / n_times

        copies.append(scaled)

    result = pd.concat(copies, ignore_index=True)

    # A stable sort keeps the copies of each timestamp in the order they were
    # generated; the default sort leaves the order of ties unspecified.
    result = result.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    return result



In [None]:
import yfinance as yf
import pandas as pd

# Download hourly BTC-USD candles for the requested date range.
# auto_adjust is passed explicitly because its default differs between
# yfinance releases; pinning it keeps the price columns reproducible.
# NOTE(review): True matches the recorded output below (no "Adj Close"
# column) - confirm adjusted prices are what the dataset should contain.
df = yf.download(
    'BTC-USD',
    start='2025-01-01',
    end='2025-01-14',
    interval='1h',
    auto_adjust=True,
)

  df = yf.download('BTC-USD', start='2025-01-01', end='2025-01-14', interval='1h')
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-01-01 00:00:00+00:00,94256.054688,94256.054688,93312.703125,93396.03125,27361280
2025-01-01 01:00:00+00:00,93429.90625,94214.007812,93420.578125,94214.007812,0
2025-01-01 02:00:00+00:00,93849.25,93854.3125,93420.054688,93420.054688,0
2025-01-01 03:00:00+00:00,93614.742188,93866.445312,93551.6875,93866.445312,0
2025-01-01 04:00:00+00:00,93465.992188,93632.890625,93430.546875,93632.890625,0


In [None]:
# Convert the datetime index to unix time (float seconds since 1970-01-01 UTC).
# Dividing the offset from the epoch by one second does not depend on the
# index's storage resolution or on the platform's integer width, unlike
# casting to int and assuming nanoseconds.
# NOTE: this cell overwrites `df`; re-run the download cell before re-running it.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
seconds_since_epoch = (pd.to_datetime(df.index, utc=True) - unix_epoch) / pd.Timedelta(seconds=1)
df["Date"] = seconds_since_epoch.to_numpy()
df = df.reset_index(drop=True)

# Rearranges and renames columns to be the same as the data before
df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
df.columns = ["timestamp", "open", "high", "low", "close", "volume"]

Unnamed: 0,timestamp,open,high,low,close,volume
0,1735690000.0,93396.03125,94256.054688,93312.703125,94256.054688,27361280
1,1735693000.0,94214.007812,94214.007812,93420.578125,93429.90625,0
2,1735697000.0,93420.054688,93854.3125,93420.054688,93849.25,0
3,1735700000.0,93866.445312,93866.445312,93551.6875,93614.742188,0
4,1735704000.0,93632.890625,93632.890625,93430.546875,93465.992188,0


In [None]:
# Convert to returns and save to csv

data_name = "hourly/ret_h_14d"

# NOTE: this overwrites `df` with the returns frame; re-run the cells above
# before re-running this one.
df = convert_to_returns(df, keep_volume=False, keep_high_low=False)

# data_name contains a sub-directory; create it first so that to_csv does not
# fail when the output folder does not exist yet.
output_path = Path(f"dataset/cryptex/{data_name}.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)