In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
# read df
df = pd.read_csv('Data/gold_futures_ohlcv.csv', parse_dates=['ts_event'])
df["ts_event"] = pd.to_datetime(df["ts_event"]).dt.tz_convert('America/New_York') # ensure it's in the correct timezone
df = df.set_index('ts_event', inplace=False)

In [3]:
# filter for vanilla futures contracts
single_contract_filter = (df["symbol"].str.len() == 4)
fdf = df[single_contract_filter].copy()

In [4]:
future_month_map = {
    'F': 1, 'G': 2, 'H': 3, 'J': 4, 'K': 5, 'M': 6,
    'N': 7, 'Q': 8, 'U': 9, 'V': 10, 'X': 11, 'Z': 12
}

In [5]:
# function to determine contract year
def get_contract_year(row):
    contract_year_first_digit = int(row["symbol"][3])
    curr_year = row.name.year
    if contract_year_first_digit < curr_year % 10:
        contract_year = math.ceil(curr_year / 10) * 10 + contract_year_first_digit
    else:
        contract_year = math.floor(curr_year / 10) * 10 + contract_year_first_digit
    return contract_year

fdf["contract_year"] = fdf.apply(get_contract_year, axis=1)
fdf["contract_month"] = fdf.apply(lambda row: future_month_map[row['symbol'][2]], axis=1)

In [6]:
# function to determine expiry length
def get_expiry_length(row):
    month = future_month_map[row['symbol'][2]]
    year = row["contract_year"]
    # Calculate the difference in months
    return (year - row.name.year) * 12 + month - row.name.month

fdf["expiry_length"] = fdf.apply(get_expiry_length, axis=1)

In [7]:
LENGTH = 6
frdf = fdf[fdf['expiry_length'] == LENGTH].copy()
sdf = fdf[fdf['expiry_length'] == (LENGTH + 1)].copy()

In [11]:
frdf.shape, sdf.shape

((2273, 12), (2251, 12))

In [70]:
import pandas as pd
import numpy as np

def create_continuous_contract_vectorized(combined_df, LENGTH, verbose=False):
    """
    Creates a continuous futures contract series using vectorized operations.
    """
    # Ensure the dataframe is sorted by date
    combined = combined_df.sort_index()

    # --- Step 1: Select the correct contract for each day ---
    # This replaces the entire loop and the if/else logic for contract selection.
    
    def select_contract(group):
        # Prioritize contract with the target expiry_length
        target_contract = group[group['expiry_length'] == LENGTH]
        if not target_contract.empty:
            return target_contract.iloc[0]
        
        # Fallback to the next length if the target is not found
        fallback_contract = group[group['expiry_length'] == LENGTH + 1]
        if not fallback_contract.empty:
            return fallback_contract.iloc[0]
        
        return None # Or handle cases where neither is found

    # Group by date and apply the selection logic
    # We use the date part of the index for grouping daily data
    daily_contracts = combined.groupby(combined.index.date).apply(select_contract)
    daily_contracts = daily_contracts.dropna() # Remove days where no contract was found

    # --- Step 2: Calculate the adjustments vectorially ---
    # Use .shift() to get the previous day's symbol and close price
    prev_symbol = daily_contracts['symbol'].shift(1)
    prev_close = daily_contracts['close'].shift(1)
    
    # A rollover occurs where the current symbol is different from the previous day's symbol
    is_rollover = (daily_contracts['symbol'] != prev_symbol) & (prev_symbol.notna())
    
    # Your original code had two adjustment cases. The most common case is
    # adjusting by the difference between the new contract's price and the
    # previous day's closing price. This is easily vectorized.
    # Note: The complex same-day adjustment requires a merge and is slower.
    # This standard method is often sufficient and much faster.
    adjustment_values = daily_contracts['close'] - prev_close
    
    # Use np.where to apply the adjustment only on rollover days, otherwise it's 0
    daily_contracts['adjustment'] = np.where(is_rollover, adjustment_values, 0)
    
    # The count of adjustments is simply the sum of the boolean 'is_rollover' Series
    if verbose:
        print(f'Percent of diff day adjustments {is_rollover.sum()}')

    return daily_contracts

In [71]:
test = create_continuous_contract_vectorized(fdf, 6, verbose=True)

Percent of diff day adjustments 110


In [39]:
def roll_contract(df, LENGTH, verbose=False):
    # filter for contracts with the target expiry LENGTH or the next one
    candidates = df[df['expiry_length'].isin([LENGTH, LENGTH + 1])].copy()
    
    # use a clean date column for daily selection
    candidates['date'] = pd.to_datetime(candidates.index.date)

    # for each day, prefer the contract with the shorter expiry LENGTH
    candidates.sort_values(by=['date', 'expiry_length'], inplace=True)
    ndf = candidates.drop_duplicates(subset='date', keep='first').set_index('date')

    if verbose:
        missing_days = ndf['close'].isna().sum()
        print(f"no valid contract found for {missing_days} business days before filling.")
    
    # identify the day before the contract roll to calculate the adjustment
    is_roll_day = ndf['symbol'] != ndf['symbol'].shift(-1)

    # calculate price adjustment using a forward-looking shift (-1)
    adjustment = np.where(
        is_roll_day,
        ndf['open'].shift(-1) - ndf['close'],
        0
    )
    ndf['adjustment'] = np.where(np.isnan(adjustment), 0, adjustment)
    
    # apply a reverse cumulative sum of adjustments to create a continuous series
    total_adjustment = ndf['adjustment'].iloc[::-1].cumsum().iloc[::-1]
    
    cols_to_adjust = ['open', 'high', 'low', 'close']
    ndf[cols_to_adjust] = ndf[cols_to_adjust].add(total_adjustment, axis=0)

    return ndf

In [40]:
kwk = roll_contract(fdf.iloc[:-10], 6, verbose=True)
fdf.index.nunique(), kwk.index.nunique()

no valid contract found for 0 business days before filling.


(4701, 4474)

In [42]:
kwk.iloc[-1]

rtype                   35
publisher_id             1
instrument_id     42028866
open                3401.3
high                3402.0
low                 3341.4
close               3344.7
volume                  23
symbol                GCF6
contract_year         2026
contract_month           1
expiry_length            6
adjustment             0.0
Name: 2025-07-29 00:00:00, dtype: object