In this file, I revisit the group-9 methods to see whether the replication quality of the Yang Table 1 results can be improved, and to replicate the He-Kelly-Manela commodity return results.

In [56]:
import pandas as pd
import load_futures_data

In [3]:
# Load the WRDS futures data through the project-local load_futures_data module.
wrds_futures = load_futures_data.load_wrds_futures()

In [4]:
# Preview the first five rows of the loaded data.
wrds_futures.head(5)

Unnamed: 0,futcode,date,settlement,contrdate,product_code
0,36580.0,1983-02-28,95.199997,983,3160
1,36580.0,1983-03-01,95.0,983,3160
2,36580.0,1983-03-02,95.599991,983,3160
3,36580.0,1983-03-03,96.0,983,3160
4,36580.0,1983-03-04,96.899994,983,3160


In [7]:
# Inspect the raw contract-date column before parsing it.
wrds_futures["contrdate"]

0          0983
1          0983
2          0983
3          0983
4          0983
           ... 
4921746    0627
4921747    0627
4921748    0627
4921749    0627
4921750    0627
Name: contrdate, Length: 4921751, dtype: object

In [None]:
def parse_contrdate(c):
    """
    Parse a contract date string (MMYY or MM/YY) into a monthly pandas.Period.

    Two-digit years below 50 are read as 20YY, all others as 19YY.

    Parameters
    ----------
    c : str
        Contract date in the format 'MMYY' or 'MM/YY'.

    Returns
    -------
    pandas.Period or pandas.NaT
        A monthly Period for the parsed year and month, or ``pd.NaT`` when
        ``c`` is not a string or cannot be parsed as a month/year pair.
    """
    try:
        digits = c.replace('/', '')
        mm = int(digits[:2])
        yy = int(digits[2:])
        year = 2000 + yy if yy < 50 else 1900 + yy
        return pd.Period(f'{year}-{mm:02d}', freq='M')
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str (None, NaN, numbers, bytes).
        # ValueError: non-numeric text or an invalid month.
        # Anything else (KeyboardInterrupt, SystemExit, real bugs) must propagate.
        return pd.NaT



def futures_series_to_monthly(df):
    """
    Convert a daily futures DataFrame into a monthly frequency by taking
    the last available daily row for each (futcode, month). Also parses
    contract periods and drops the raw date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns ['futcode', 'date', 'contrdate']. 'date' may be
        datetime-like or anything ``pd.to_datetime`` can convert. Any other
        columns (e.g. 'settlement', 'product_code') are passed through.

    Returns
    -------
    pandas.DataFrame
        The input columns except 'date' and 'contrdate', plus 'contr_period'
        (from ``parse_contrdate``) and 'obs_period' (month of the observation).
        Each row is the last daily entry in that month for the given futcode;
        rows are sorted by ['obs_period', 'contr_period']. The input is not
        modified.
    """
    # `.dt` only works on datetime columns; convert when the loader handed
    # back strings/objects. `assign` returns a new frame, so the caller's
    # DataFrame is left untouched.
    if not pd.api.types.is_datetime64_any_dtype(df["date"]):
        df = df.assign(date=pd.to_datetime(df["date"]))

    # After sorting by date within each contract, the last row of every
    # (futcode, month) group is that month's final observation.
    df = df.sort_values(["futcode", "date"])
    monthly_df = df.groupby(["futcode", df["date"].dt.to_period("M")]).tail(1).copy()

    monthly_df["contr_period"] = monthly_df["contrdate"].apply(parse_contrdate)
    monthly_df["obs_period"] = monthly_df["date"].dt.to_period("M")

    monthly_df = monthly_df.drop(columns=["date", "contrdate"])
    monthly_df = monthly_df.sort_values(by=["obs_period", "contr_period"])
    return monthly_df

In [15]:
# Collapse the daily data to the last observation per (futcode, month).
monthly_df = futures_series_to_monthly(wrds_futures)

In [17]:
# futures_series_to_monthly() already drops these columns; errors="ignore"
# keeps this cell re-runnable instead of raising KeyError when they are absent.
monthly_df = monthly_df.drop(columns=["date", "contrdate"], errors="ignore")
# Discard rows that have no settlement price.
monthly_df = monthly_df[monthly_df["settlement"].notna()]

In [18]:
# Sanity check: list any rows whose contract month is missing.
monthly_df[monthly_df["contr_period"].isna()]

Unnamed: 0,futcode,settlement,product_code,contr_period,obs_period


In [None]:
# Original author's note: the Yang paper started at 01/1973.
# Display the cleaned monthly panel.
monthly_df

Unnamed: 0,futcode,settlement,product_code,contr_period,obs_period
4855081,203305.0,1.962,2108,1973-01,1973-01
4856597,205681.0,2.035,2108,1973-02,1973-01
4856539,205680.0,2.047,2108,1973-03,1973-01
530201,281455.0,161.500,3247,1973-03,1973-01
4857114,206007.0,2.069,2108,1973-05,1973-01
...,...,...,...,...,...
3336488,478793.0,3.327,2060,2037-08,2025-06
3336622,478794.0,3.342,2060,2037-09,2025-06
3336220,478791.0,3.388,2060,2037-10,2025-06
3336354,478792.0,3.460,2060,2037-11,2025-06


In [None]:

import numpy as np
def compute_monthly_basis(monthly_df):
    """
    Compute the annualized log-slope basis for each product and observation month.

    For every (product_code, obs_period) the nearest and farthest contracts
    with 1 to 12 months to maturity are selected, and

        basis = 12 * 100 * (ln F_near - ln F_far) / (T_far - T_near)

    so the basis is in percent per year and is positive when the near
    contract settles above the far contract.

    Parameters
    ----------
    monthly_df : pandas.DataFrame
        DataFrame with columns ['product_code', 'obs_period', 'contr_period', 'settlement'].
        Each row is a futures contract with a settlement price and a delivery
        month; the two period columns hold monthly pandas.Period values.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns ['product_code', 'obs_period', 'basis'], one row
        per group for which a slope can be computed. A group is skipped when it
        has fewer than two eligible contracts, when the near and far contracts
        share the same maturity, or when either settlement price is missing or
        not strictly positive. The columns are present even if no rows qualify.
    """
    out_cols = ["product_code", "obs_period", "basis"]

    # A missing (NaT) period has no defined maturity, and taking `.n` of the
    # resulting NaT difference would raise, so drop those rows first.
    known = monthly_df["contr_period"].notna() & monthly_df["obs_period"].notna()
    panel = monthly_df.loc[known].copy()
    if panel.empty:
        return pd.DataFrame(columns=out_cols)

    # Period - Period yields month offsets; `.n` is the number of months.
    panel["months_to_maturity"] = (
        (panel["contr_period"] - panel["obs_period"])
        .apply(lambda offset: offset.n)
        .astype("int64")
    )

    # Keep only contracts with 1 <= maturity <= 12 (both ends inclusive).
    valid_contracts = panel[panel["months_to_maturity"].between(1, 12)]

    rows = []
    for (product_code, obs_period), group in valid_contracts.groupby(
        ["product_code", "obs_period"]
    ):
        if len(group) < 2:
            continue  # Not enough contracts to compute a slope.

        group = group.sort_values("months_to_maturity")
        near, far = group.iloc[0], group.iloc[-1]

        span = far["months_to_maturity"] - near["months_to_maturity"]
        p_near, p_far = near["settlement"], far["settlement"]

        # numpy returns inf/nan (with a warning) instead of raising on a zero
        # divisor or the log of a non-positive number, so these cases have to
        # be rejected explicitly. NaN prices fail the `> 0` test as well.
        if span <= 0 or not (p_near > 0 and p_far > 0):
            continue

        monthly_slope = (np.log(p_near) - np.log(p_far)) / span
        rows.append({
            "product_code": product_code,
            "obs_period": obs_period,
            # x100 expresses the slope in percent (the group-9 convention),
            # x12 annualizes the per-month slope.
            "basis": monthly_slope * 100 * 12,
        })

    return pd.DataFrame(rows, columns=out_cols)

In [58]:
# Compute the basis for every (product_code, obs_period) in the monthly panel.
basis_df = compute_monthly_basis(monthly_df)

In [59]:
# Display the computed basis table.
basis_df

Unnamed: 0,product_code,obs_period,basis
0,289,2005-09,6.323404
1,289,2005-10,3.117058
2,289,2005-11,-8.307184
3,289,2005-12,-10.544613
4,289,2006-01,-9.713673
...,...,...,...
14240,3847,2025-02,10.573885
14241,3847,2025-03,12.205229
14242,3847,2025-04,7.698269
14243,3847,2025-05,8.145316


In [78]:
def compute_excess_returns(monthly_df):
    """
    Compute one-month simple returns on individual futures contracts and
    summarize them per product.

    For each contract (product_code, delivery month T) observed in month t
    and again in month t + 1, the return is ``F(t+1, T) / F(t, T) - 1``.
    Only observations with more than one month to maturity are used, both
    for the starting price and for the next-month price. Rows whose
    observation or contract period is missing are ignored.

    Parameters
    ----------
    monthly_df : pandas.DataFrame
        Must contain columns ['product_code', 'obs_period', 'contr_period',
        'settlement'], with monthly pandas.Period values in the two period
        columns. The input is not modified.

    Returns
    -------
    merged : pandas.DataFrame
        One row per matched pair, with columns ['product_code', 't', 'T',
        'F', 'maturity', 'F_next', 'excess_ret'].
    summary : pandas.DataFrame
        One row per product with 'mean_ret' (mean return x 12), 'std_ret'
        (sample standard deviation x sqrt(12)), 'n' (number of returns) and
        'Sharpe_ratio' (mean_ret / std_ret).
    """
    df = monthly_df[["product_code", "obs_period", "contr_period", "settlement"]].rename(
        columns={"obs_period": "t", "contr_period": "T", "settlement": "F"}
    )

    # A NaT period has no maturity; `.n` on the NaT difference would raise.
    df = df[df["t"].notna() & df["T"].notna()].copy()
    df["maturity"] = (df["T"] - df["t"]).apply(lambda offset: offset.n).astype("int64")
    df = df[df["maturity"] > 1]

    # Relabel each observation with the previous month so that joining on
    # (product_code, t, T) pairs the price at t with the price at t + 1.
    next_prices = df[["product_code", "t", "T", "F"]].rename(columns={"F": "F_next"})
    next_prices["t"] = next_prices["t"] - 1

    merged = pd.merge(
        df,
        next_prices,
        on=["product_code", "t", "T"],
        how="inner",
    )
    merged["excess_ret"] = merged["F_next"] / merged["F"] - 1

    summary = (
        merged.groupby("product_code")["excess_ret"]
        .agg(mean_ret="mean", std_ret="std", n="count")
        .reset_index()
    )
    # Annualize the monthly moments.
    summary["mean_ret"] = summary["mean_ret"] * 12
    summary["std_ret"] = summary["std_ret"] * np.sqrt(12)
    summary["Sharpe_ratio"] = summary["mean_ret"] / summary["std_ret"]
    return merged, summary


In [82]:
def compute_summary_stats(basis_df: pd.DataFrame, excess_return_df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """
    Compute the basis columns of Table 1 in Yang (2013) for each product:
    - N = number of non-missing basis observations
    - Basis = mean basis
    - Freq. of bw. = percentage of observations with basis > 0

    The return columns of that table (E[Re], σ[Re], Sharpe ratio) are not
    produced here; per-product return statistics are available from the
    summary returned by ``compute_excess_returns``.

    Parameters
    ----------
    basis_df : pd.DataFrame
        Must contain columns ['product_code', 'basis'].
    excess_return_df : pd.DataFrame, optional
        Accepted for backward compatibility and currently unused. The
        default is None rather than a module-level variable, so defining
        this function no longer requires that variable to exist.

    Returns
    -------
    pd.DataFrame
        One row per product with columns ['product_code', 'N', 'Basis',
        'Freq. of bw.'], ordered by product_code. The columns are present
        even when ``basis_df`` is empty.
    """
    out_cols = ["product_code", "N", "Basis", "Freq. of bw."]
    if basis_df.empty:
        return pd.DataFrame(columns=out_cols)

    # Flag backwardated months up front so the frequency is a plain group
    # mean; a missing basis compares as False and still counts in the
    # denominator.
    flagged = basis_df.assign(_is_bw=basis_df["basis"] > 0)
    summary = (
        flagged.groupby("product_code")
        .agg(**{
            "N": ("basis", "count"),
            "Basis": ("basis", "mean"),
            "Freq. of bw.": ("_is_bw", "mean"),
        })
        .reset_index()
    )
    summary["Freq. of bw."] = summary["Freq. of bw."] * 100
    return summary[out_cols]


In [72]:
# Restrict the basis data to observation months 1970-01 through 2008-12 (inclusive).
sample_df = basis_df[(basis_df["obs_period"] >= "1970-01") & (basis_df["obs_period"] <= "2008-12")]

In [83]:
# Per-product summary statistics over the restricted sample.
summary_df = compute_summary_stats(sample_df)

In [84]:
# Display the summary statistics.
summary_df

Unnamed: 0,product_code,N,Basis,Freq. of bw.
0,289,40,-9.533634,12.5
1,361,369,-6.3725,30.894309
2,379,269,-7.535783,26.022305
3,385,360,-6.741845,29.444444
4,396,360,-2.235921,29.722222
5,430,360,-3.922842,36.388889
6,1980,422,-2.784429,25.829384
7,1986,310,4.056994,63.870968
8,1992,360,-2.793946,32.777778
9,2020,369,-6.086867,0.0


In [86]:
# Lookup from the numeric product_code used in the futures data to a commodity
# name. The names are spelled to match the "Commodity" labels in paper_table so
# the two tables can be joined on that column.
# NOTE(review): the code-to-name assignments are taken as given — confirm against the data source.
PRODUCT_NAME_MAP = {
    3160: "Barley",
    289:  "Butter",
    3161: "Canola",
    1980: "Cocoa",
    2038: "Coffee",
    3247: "Corn",
    1992: "Cotton",
    361:  "Lumber",
    385:  "Oats",
    2036: "Orange Juice",
    379:  "Rough Rice",
    3256: "Soybean Meal",
    396:  "Soybeans",
    430:  "Wheat",
    1986: "Crude Oil",
    2091: "Gasoline",
    2029: "Heating Oil",
    2060: "Natural Gas",
    3847: "Propane",
    2032: "Unleaded Gas",
    3250: "Feeder Cattle",
    2676: "Lean Hogs",
    2675: "Live Cattle",
    3126: "Aluminum",
    2087: "Coal",
    2026: "Copper",
    2020: "Gold",
    2065: "Palladium",
    2074: "Platinum",
    2108: "Silver"
}

# Attach the commodity name; product codes absent from the map become NaN.
summary_df["Commodity"] = summary_df["product_code"].map(PRODUCT_NAME_MAP)

In [87]:
# Display the summary with commodity names attached.
summary_df

Unnamed: 0,product_code,N,Basis,Freq. of bw.,Commodity
0,289,40,-9.533634,12.5,Butter
1,361,369,-6.3725,30.894309,Lumber
2,379,269,-7.535783,26.022305,Rough Rice
3,385,360,-6.741845,29.444444,Oats
4,396,360,-2.235921,29.722222,Soybeans
5,430,360,-3.922842,36.388889,Wheat
6,1980,422,-2.784429,25.829384,Cocoa
7,1986,310,4.056994,63.870968,Crude Oil
8,1992,360,-2.793946,32.777778,Cotton
9,2020,369,-6.086867,0.0,Gold


In [89]:
import pandas as pd

# Reference values entered by hand from the paper's LaTeX table (N and Basis
# columns only). Each row is [Commodity, Symbol, N, Basis]; the sector comments
# below are the original groupings.
data = [
    # Agriculture
    ["Barley", "WA", 235, -3.66],
    ["Butter", "02", 141, -3.68],
    ["Canola", "WC", 377, -2.98],
    ["Cocoa", "CC", 452, -2.61],
    ["Coffee", "KC", 420, -2.57],
    ["Corn", "C-", 468, -6.03],
    ["Cotton", "CT", 452, -1.75],
    ["Lumber", "LB", 468, -5.63],
    ["Oats", "O-", 468, -5.65],
    ["Orange Juice", "JO", 448, -3.08],
    ["Rough Rice", "RR", 265, -7.56],
    ["Soybean Meal", "SM", 468, 0.20],
    ["Soybeans", "S-", 468, -0.58],
    ["Wheat", "W-", 468, -2.88],
    # Energy
    ["Crude Oil", "CL", 295, 4.25],
    ["Gasoline", "RB", 275, 8.09],
    ["Heating Oil", "HO", 345, 1.49],
    ["Natural Gas", "NG", 216, -3.63],
    ["Propane", "PN", 247, 5.53],
    ["Unleaded Gas", "HU", 250, 8.62],
    # Livestock
    ["Broilers", "BR", 19, 4.58],
    ["Feeder Cattle", "FC", 443, 0.35],
    ["Lean Hogs", "LH", 468, 2.66],
    ["Live Cattle", "LC", 468, 0.46],
    # Metals
    ["Aluminum", "AL", 215, 1.06],
    ["Coal", "QL", 85, -1.55],
    ["Copper", "HG", 412, 0.52],
    ["Gold", "GC", 400, -6.24],
    ["Palladium", "PA", 362, -2.16],
    ["Platinum", "PL", 410, -3.21],
    ["Silver", "SI", 419, -6.51]
]

# Build the reference table with one named column per field in each row.
paper_table = pd.DataFrame(data, columns=["Commodity", "Symbol", "N", "Basis"])


In [90]:
# Display the reference table.
paper_table

Unnamed: 0,Commodity,Symbol,N,Basis
0,Barley,WA,235,-3.66
1,Butter,02,141,-3.68
2,Canola,WC,377,-2.98
3,Cocoa,CC,452,-2.61
4,Coffee,KC,420,-2.57
5,Corn,C-,468,-6.03
6,Cotton,CT,452,-1.75
7,Lumber,LB,468,-5.63
8,Oats,O-,468,-5.65
9,Orange Juice,JO,448,-3.08


In [95]:
# Inner-join the replicated statistics with the paper's values by commodity.
# Shared column names get pandas' default suffixes: _x = replication, _y = paper.
comparison_df = pd.merge(summary_df, paper_table, on="Commodity")

# Gaps between replication and paper, in levels and relative to the paper's value.
n_gap = comparison_df["N_x"] - comparison_df["N_y"]
basis_gap = comparison_df["Basis_x"] - comparison_df["Basis_y"]
comparison_df = comparison_df.assign(
    N_diff=n_gap,
    Basis_diff=basis_gap,
    Basis_pct=basis_gap / comparison_df["Basis_y"],
    N_pct=n_gap / comparison_df["N_y"],
)

report_columns = ["Commodity", "Basis_diff", "Basis_pct", "N_diff", "N_pct"]
comparison_df[report_columns]

Unnamed: 0,Commodity,Basis_diff,Basis_pct,N_diff,N_pct
0,Butter,-5.853634,1.590661,-101,-0.716312
1,Lumber,-0.7425,0.131883,-99,-0.211538
2,Rough Rice,0.024217,-0.003203,4,0.015094
3,Oats,-1.091845,0.193247,-108,-0.230769
4,Soybeans,-1.655921,2.855036,-108,-0.230769
5,Wheat,-1.042842,0.362098,-108,-0.230769
6,Cocoa,-0.174429,0.066831,-30,-0.066372
7,Crude Oil,-0.193006,-0.045413,15,0.050847
8,Cotton,-1.043946,0.596541,-92,-0.20354
9,Gold,0.153133,-0.024541,-31,-0.0775
