In [1]:
import os
import random
import pandas as pd
from datetime import datetime

DATA_DIR = "/home/lichenhui/2023/"

def select_tickers(n, data_dir=DATA_DIR):
    """
    Randomly selects n tickers from a folder and returns cleaned ticker names.
    Example: 'SZ.002521.csv' → '002521.csv'
    """
    all_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
    selected_files = random.sample(all_files, n)
    # Convert 'SZ.002521.csv' → '002521.csv'
    tickers = [f for f in selected_files]
    return tickers


def load_target_return(ticker, data_dir=DATA_DIR):
    """
    Loads the ticker file, downsamples to 3-min intervals using last close,
    then computes 3-min close-to-close returns.
    """
    full_path = os.path.join(data_dir, ticker)
    df = pd.read_csv(full_path)
    print(df.head(-10))
    # Parse datetime string to datetime object
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').set_index('datetime')

    t1, t2 = datetime.strptime("09:30:00", "%H:%M:%S").time(), datetime.strptime("11:30:00", "%H:%M:%S").time()
    t3, t4 = datetime.strptime("13:00:00", "%H:%M:%S").time(), datetime.strptime("14:57:00", "%H:%M:%S").time()
    df = df[(df.index.time >= t1) & (df.index.time < t2) | (df.index.time >= t3) & (df.index.time < t4)]

    # Downsample: keep last close in each 3-min bin
    df_resampled = df['close'].resample('3min').last().dropna()
    print(df_resampled.head(10))
    # Compute returns
    returns = df_resampled.pct_change().fillna(0)
    return returns.rename('returns')


In [2]:
tickers = select_tickers(5)
for ticker in tickers:
    
    returns = load_target_return(ticker)
    print(f"{ticker} 3-min returns:\n", returns.head(10))


                  datetime   open   high    low  close  volume   amount
0      2023-01-03 09:31:00   9.88   9.92   9.85   9.91     757   748387
1      2023-01-03 09:32:00   9.90   9.97   9.90   9.96    1377  1369503
2      2023-01-03 09:33:00   9.94   9.98   9.94   9.96     737   734140
3      2023-01-03 09:34:00   9.96   9.96   9.94   9.95     131   130485
4      2023-01-03 09:35:00   9.95   9.96   9.94   9.94     151   150211
...                    ...    ...    ...    ...    ...     ...      ...
58065  2023-12-29 14:46:00  11.23  11.23  11.22  11.23     141   158208
58066  2023-12-29 14:47:00  11.22  11.22  11.22  11.22      56    62829
58067  2023-12-29 14:48:00  11.22  11.23  11.21  11.23      91   102093
58068  2023-12-29 14:49:00  11.22  11.23  11.21  11.22     161   180663
58069  2023-12-29 14:50:00  11.22  11.23  11.21  11.23     129   144720

[58070 rows x 7 columns]
datetime
2023-01-03 09:30:00     9.96
2023-01-03 09:33:00     9.94
2023-01-03 09:36:00     9.97
2023-01-03 09:

KeyError: 'datetime'