for this file, I am going to try the group-9 methods again and to see if any improvements could be made to improve the replication quality on the yang table 1 results, and to replicate the he-kelly-manela commodities return results.

In [2]:
import pandas as pd
import load_futures_data

In [3]:
wrds_futures = load_futures_data.load_wrds_futures()

In [4]:
wrds_futures.head(5)

Unnamed: 0,futcode,date,settlement,contrdate,product_code
0,36580.0,1983-02-28,95.199997,983,3160
1,36580.0,1983-03-01,95.0,983,3160
2,36580.0,1983-03-02,95.599991,983,3160
3,36580.0,1983-03-03,96.0,983,3160
4,36580.0,1983-03-04,96.899994,983,3160


In [7]:
wrds_futures["contrdate"]

0          0983
1          0983
2          0983
3          0983
4          0983
           ... 
4921746    0627
4921747    0627
4921748    0627
4921749    0627
4921750    0627
Name: contrdate, Length: 4921751, dtype: object

In [None]:
def parse_contrdate(c):
    """
    Parse a contract date string (MMYY or MM/YY) into a monthly pandas.Period.

    Parameters
    ----------
    c : str
        Contract date in the format 'MMYY' or 'MM/YY'.

    Returns
    -------
    pandas.Period
        A monthly Period object corresponding to the parsed year and month.
    """
    try:
        c = c.replace('/', '')
        mm = int(c[:2])
        yy = int(c[2:])
        year = 2000 + yy if yy < 50 else 1900 + yy
        return pd.Period(f'{year}-{mm:02d}', freq='M')
    except:
        return pd.NaT  # or None



def futures_series_to_monthly(df):
    """
    Convert a daily futures DataFrame into a monthly frequency by taking
    the last available daily row for each (futcode, month). Also parses
    contract periods and drops date columns to produce a final monthly dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns ['futcode', 'date_', 'contrdate', 'settlement'].

    Returns
    -------
    pandas.DataFrame
        Monthly data with columns ['futcode', 'contr_period', 'obs_period', 'settlement'].
        Each row corresponds to the last daily entry in that month for the given futcode.
    """

    
    df = df.sort_values(["futcode", "date"])
    monthly_df = df.groupby(["futcode", df["date"].dt.to_period("M")]).tail(1).copy()

    monthly_df["contr_period"] = monthly_df["contrdate"].apply(parse_contrdate)
    monthly_df["obs_period"]   = monthly_df["date"].dt.to_period("M")

    monthly_df = monthly_df.drop(columns=["date", "contrdate"])
    monthly_df = monthly_df.sort_values(by=["obs_period","contr_period"])
    return monthly_df

In [15]:
monthly_df = futures_series_to_monthly(wrds_futures)

In [17]:
monthly_df = monthly_df.drop(columns=["date", "contrdate"])
monthly_df = monthly_df[~monthly_df["settlement"].isna()]

In [18]:
monthly_df[monthly_df["contr_period"].isna()]

Unnamed: 0,futcode,settlement,product_code,contr_period,obs_period


In [None]:
#yang paper started at 01 1973
monthly_df

Unnamed: 0,futcode,settlement,product_code,contr_period,obs_period
4855081,203305.0,1.962,2108,1973-01,1973-01
4856597,205681.0,2.035,2108,1973-02,1973-01
4856539,205680.0,2.047,2108,1973-03,1973-01
530201,281455.0,161.500,3247,1973-03,1973-01
4857114,206007.0,2.069,2108,1973-05,1973-01
...,...,...,...,...,...
3336488,478793.0,3.327,2060,2037-08,2025-06
3336622,478794.0,3.342,2060,2037-09,2025-06
3336220,478791.0,3.388,2060,2037-10,2025-06
3336354,478792.0,3.460,2060,2037-11,2025-06


In [49]:

import numpy as np
def compute_monthly_basis(monthly_df):
    """
    Compute the monthly basis for each product and observation period.

    Parameters
    ----------
    monthly_df : pandas.DataFrame
        DataFrame with columns ['product_code', 'obs_period', 'contr_period', 'settlement'].
        Each row is a futures contract with a settlement price and a delivery date.

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns ['product_code', 'obs_period', 'basis'] containing
        the log-slope basis for each product and observation period, annualized.
    """

    # Ensure correct types
    monthly_df = monthly_df.copy()
    # monthly_df["contr_period"] = pd.to_datetime(monthly_df["contr_period"]).dt.to_period("M")
    # monthly_df["obs_period"] = pd.to_datetime(monthly_df["obs_period"]).dt.to_period("M")

    # Calculate months to maturity
    monthly_df["months_to_maturity"] = (monthly_df["contr_period"] - monthly_df["obs_period"]).apply(lambda x: x.n)

    # Filter only contracts with 1 <= maturity <= 12
    valid_contracts = monthly_df[(monthly_df["months_to_maturity"] >= 1) & (monthly_df["months_to_maturity"] <= 12)]

    basis_list = []

    grouped = valid_contracts.groupby(["product_code", "obs_period"])

    for (product_code, obs_period), group in grouped:
        if group.shape[0] < 2:
            continue  # Skip if not enough contracts to compute slope

        group = group.sort_values("months_to_maturity")

        f1 = group.iloc[0]
        f2 = group.iloc[-1]

        T1 = f1["months_to_maturity"]
        T2 = f2["months_to_maturity"]

        try:
            log_diff = np.log(f1["settlement"]) - np.log(f2["settlement"])
            basis = log_diff / (T2 - T1)
            basis_annualized = basis * 12
            basis_list.append({
                "product_code": product_code,
                "obs_period": obs_period,
                "basis": basis_annualized
            })
        except (ValueError, ZeroDivisionError, FloatingPointError):
            continue

    basis_df = pd.DataFrame(basis_list)
    return basis_df

In [50]:
basis_df = compute_monthly_basis(monthly_df)

In [52]:
basis_df

Unnamed: 0,product_code,obs_period,basis
0,289,2005-09,0.063234
1,289,2005-10,0.031171
2,289,2005-11,-0.083072
3,289,2005-12,-0.105446
4,289,2006-01,-0.097137
...,...,...,...
14240,3847,2025-02,0.105739
14241,3847,2025-03,0.122052
14242,3847,2025-04,0.076983
14243,3847,2025-05,0.081453


In [53]:
def compute_summary_stats(basis_df: pd.DataFrame) -> pd.DataFrame:
    """
    Replicates Table 1 in Yang (2013) using basis data:
    - Basis (mean)
    - Frequency of backwardation
    - E[Re] = mean return
    - σ[Re] = std dev of return
    - Sharpe ratio

    Parameters
    ----------
    basis_df : pd.DataFrame
        Must contain columns ['product_code', 'obs_period', 'basis']

    Returns
    -------
    pd.DataFrame
        Summary statistics for each product
    """
    summary_list = []

    for code, group in basis_df.groupby("product_code"):
        basis = group["basis"].mean()
        n_obs = group["basis"].count()
        freq_bw = (group["basis"] < 0).mean() * 100
        expected_ret = group["basis"].mean()
        std_ret = group["basis"].std()
        sharpe = expected_ret / std_ret if std_ret != 0 else np.nan

        summary_list.append({
            "product_code": code,
            "N": n_obs,
            "Basis": basis,
            "Freq. of bw.": freq_bw,
            "E[Re]": expected_ret,
            "σ[Re]": std_ret,
            "Sharpe ratio": sharpe
        })

    return pd.DataFrame(summary_list)


In [54]:
summary_df = compute_summary_stats(basis_df)

In [55]:
summary_df

Unnamed: 0,product_code,N,Basis,Freq. of bw.,E[Re],σ[Re],Sharpe ratio
0,289,238,0.000449,52.941176,0.000449,0.145376,0.003089
1,361,539,-0.055462,69.758813,-0.055462,0.163281,-0.339673
2,379,348,-0.06898,74.712644,-0.06898,0.123851,-0.556961
3,385,439,-0.060689,69.476082,-0.060689,0.163921,-0.370235
4,396,439,-0.004226,60.136674,-0.004226,0.096733,-0.043692
5,430,439,-0.051907,69.248292,-0.051907,0.120332,-0.431364
6,1980,620,-0.011338,66.612903,-0.011338,0.12734,-0.089035
7,1986,508,0.019885,41.535433,0.019885,0.140639,0.14139
8,1992,558,-0.007023,60.394265,-0.007023,0.14125,-0.049723
9,2020,567,-0.046139,100.0,-0.046139,0.037549,-1.228758


In [38]:
PRODUCT_NAME_MAP = {
    3160: "Barley",               
    289:  "Butter",               
    3161: "Canola",               
    1980: "Cocoa",                
    2038: "Coffee",               
    3247: "Corn",                 
    1992: "Cotton",               
    361:  "Lumber",               
    385:  "Oats",                 
    2036: "Orange Juice",         
    379:  "Rough Rice",           
    3256: "Soybean Meal",         
    396:  "Soybeans",             
    430:  "Wheat",                
    1986: "Crude Oil",            
    2091: "Gasoline",             
    2029: "Heating Oil",          
    2060: "Natural Gas",          
    3847: "Propane",              
    2032: "Unleaded Gas",         
    3250: "Feeder Cattle",        
    2676: "Lean Hogs",            
    2675: "Live Cattle",          
    3126: "Aluminum",             
    2087: "Coal",                 
    2026: "Copper",               
    2020: "Gold",                 
    2065: "Palladium",            
    2074: "Platinum",             
    2108: "Silver"                
}

summary_df["Commodity"] = summary_df["product_code"].map(PRODUCT_NAME_MAP)

In [39]:
summary_df

Unnamed: 0,product_code,N,Basis,Freq. of bw.,E[Re],σ[Re],Sharpe ratio,Commodity
0,289,238,0.020511,42.016807,0.020511,0.113524,0.18068,Butter
1,361,540,-0.092013,69.074074,-0.092013,0.29274,-0.314317,Lumber
2,379,348,-0.110253,72.413793,-0.110253,0.211162,-0.522129,Rough Rice
3,385,439,-0.118217,68.56492,-0.118217,0.296307,-0.398967,Oats
4,396,439,0.005389,51.708428,0.005389,0.129279,0.041685,Soybeans
5,430,439,-0.068735,64.009112,-0.068735,0.217097,-0.316608,Wheat
6,1980,621,-0.02509,64.89533,-0.02509,0.233638,-0.10739,Cocoa
7,1986,508,0.017125,36.417323,0.017125,0.063536,0.269539,Crude Oil
8,1992,561,-0.012851,53.475936,-0.012851,0.187871,-0.068403,Cotton
9,2020,575,-0.104945,100.0,-0.104945,0.056055,-1.872182,Gold
