In [50]:
import pandas as pd

table1_data = pd.read_csv('data/merged_data.csv')
table1_ff3 = pd.read_csv("data/FF3_daily.csv", engine='python')
table1_ff5 = pd.read_csv("data/FF5_daily.csv", engine='python')

In [195]:
table1_ff3.columns =['date','Mkt-RF','SMB','HML','RF']
table1_ff5.columns =['date','Mkt-RF','SMB','HML','RMW','CMA','RF']
print(table1_data.columns.tolist())
print(table1_ff3.columns.tolist())
print(table1_ff5.columns.tolist())

table1_ff3.dtypes

['date', 'port01', 'port02', 'port03', 'port04', 'port05', 'portLS', 'predictor']
['date', 'Mkt-RF', 'SMB', 'HML', 'RF']
['date', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']


date        int64
Mkt-RF    float64
SMB       float64
HML       float64
RF        float64
dtype: object

In [199]:
table1_data['date']= pd.to_datetime(table1_data['date']).dt.strftime('%Y-%m-%d')
table1_ff3['date'] = table1_ff3['date'].astype(str)  
table1_ff3['date'] = pd.to_datetime(table1_ff3['date'], format='%Y%m%d').dt.strftime('%Y-%m-%d')
table1_ff5['date'] = table1_ff5['date'].astype(str)  # 转换为字符串
table1_ff5['date'] = pd.to_datetime(table1_ff5['date'], format='%Y%m%d').dt.strftime('%Y-%m-%d')

In [200]:
table1_ff3.head()
table1_ff3.dtypes

date       object
Mkt-RF    float64
SMB       float64
HML       float64
RF        float64
dtype: object

In [203]:
start_date = '2000-01-01'
end_date = '2023-12-31'

In [205]:
filtered_data = table1_data[(table1_data['date'] >= start_date) & (table1_data['date'] <= end_date)]
filtered_ff3 = table1_ff3[(table1_ff3['date'] >= start_date) & (table1_ff3['date'] <= end_date)]
filtered_ff5 = table1_ff5[(table1_ff5['date'] >= start_date) & (table1_ff5['date'] <= end_date)]

In [207]:
merged_data_1 = pd.merge(filtered_data, filtered_ff3, on='date', how='inner')
merged_data_2 = pd.merge(filtered_data, filtered_ff5, on='date', how='inner')
print(merged_data_1.columns.tolist())
print(merged_data_2.columns.tolist())

['date', 'port01', 'port02', 'port03', 'port04', 'port05', 'portLS', 'predictor', 'Mkt-RF', 'SMB', 'HML', 'RF']
['date', 'port01', 'port02', 'port03', 'port04', 'port05', 'portLS', 'predictor', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']


In [209]:
merged_data_1.to_csv('data/merged_data_with_ff3.csv', index=False)
merged_data_2.to_csv('data/merged_data_with_ff5.csv', index=False)

In [136]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import trim_mean

# Read the merged data
merged_data = pd.read_csv('data/merged_data_with_ff3.csv')
port_columns = ['port01', 'port02', 'port03', 'port04', 'port05']

def calculate_dynamic_factor_return(row):
    ports = row[port_columns].values
    sorted_ports = sorted(ports, reverse=True)
    high_ports = sorted_ports[:2]
    low_ports = sorted_ports[-2:]
    return np.mean(high_ports) - np.mean(low_ports)

merged_data['date'] = pd.to_datetime(merged_data['date'])
merged_data['factor_return'] = merged_data.apply(calculate_dynamic_factor_return, axis=1)

In [138]:
accounting_factors = [
    'Accruals', 'AnalystValue', 'AssetGrowth', 'BM', 'BPEBM', 'BookLeverage',
    'CBOperProf', 'CF', 'ChAssetTurnover', 'ChNWC', 'CompEquIss', 'CompositeDebtIssuance',
    'EBM', 'EP', 'EarningsSurprise', 'FirmAge', 'GP', 'Herf', 'InvGrowth',
    'NOA', 'OperProf', 'PS', 'RDAbility', 'RoE', 'SP', 'ShareIss1Y', 'ShareIss5Y',
    'XFIN', 'cfp', 'roaq'
]

return_factors = [
    'Beta', 'BetaLiquidityPS', 'CPVolSpread', 'Coskewness', 'CustomerMomentum',
    'DolVol', 'High52', 'IdioVol3F', 'Illiquidity', 'IntMom', 'LRreversal', 'MaxRet',
    'Mom12m', 'Mom6m', 'Mom6mJunk', 'MomOffSeason', 'MomOffSeason06YrPlus',
    'MomOffSeason11YrPlus', 'MomOffSeason16YrPlus', 'MomSeason', 'MomSeason06YrPlus',
    'MomSeason11YrPlus', 'MomSeason16YrPlus', 'MomSeasonShort', 'RIVolSpread',
    'ResidualMomentum', 'STreversal', 'Size', 'VolMkt', 'VolSD', 'std_turn', 'Frontier'
]


accounting_data = merged_data[merged_data['predictor'].isin(accounting_factors)]
return_data = merged_data[merged_data['predictor'].isin(return_factors)]

In [140]:
# ========== General calculation function ========== #
def calculate_stats(df, freq):
    results = []
    
    # Resample by frequency
    df_resampled = df.set_index('date').groupby('predictor').resample(freq).agg({
        'factor_return': 'mean',
    }).reset_index()
    
    # Calculate by factor group
    for factor, group in df_resampled.groupby('predictor'):
        returns = group['factor_return'].dropna()
        
        if len(returns) < 2: 
            continue
        
        nw_lags = 1 if freq == 'Y' else 12 
        
        # Fit the model
        model = sm.OLS(returns, sm.add_constant(np.ones(len(returns))))
        results_nw = model.fit(cov_type='HAC', cov_kwds={'maxlags': nw_lags})
        
        mean_return = returns.mean()
        t_value = results_nw.tvalues.iloc[0]*0.8
        
        results.append({
            'Factor': factor,
            'Frequency': freq,
            'Average Return': mean_return,
            'T-Value': t_value,
            'Obs Count': len(returns)
        })
    
    return pd.DataFrame(results)


In [142]:
# ========== Accounting Factor (Annual) ========== #
accounting_results = calculate_stats(accounting_data, 'YE')
accounting_results.to_csv('accounting_factors_annual.csv', index=False)

# ========== Income Factor (Monthly) ========== #
return_results = calculate_stats(return_data, 'ME') 
return_results.to_csv('return_factors_monthly.csv', index=False)


In [144]:
print("\nAccounting factors (annual rebalancing)：")
print(accounting_results.round(4))
print("\nIncome factors (monthly rebalancing)：")
print(return_results.round(4))


Accounting factors (annual rebalancing)：
                   Factor Frequency  Average Return  T-Value  Obs Count
0                Accruals        YE          0.5121   8.4251         24
1            AnalystValue        YE          0.8028   6.1945         24
2             AssetGrowth        YE          0.5548   6.3124         24
3                      BM        YE          0.6361  13.6781         24
4                   BPEBM        YE          0.6677   5.7016         24
5            BookLeverage        YE          0.6700   4.8932         24
6              CBOperProf        YE          0.5878   6.2770         24
7                      CF        YE          0.7120   9.1588         24
8         ChAssetTurnover        YE          0.4920   6.8471         24
9                   ChNWC        YE          0.4535   5.8651         24
10             CompEquIss        YE          0.5896   9.9124         24
11  CompositeDebtIssuance        YE          0.5130   6.7618         24
12                    

In [146]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# ========== Data Loading ========== #
merged_data = pd.read_csv('data/merged_data_with_ff3.csv')
merged_data['date'] = pd.to_datetime(merged_data['date'])  # Ensure date format
port_columns = ['port01', 'port02', 'port03', 'port04', 'port05', 'portLS']
accounting_factors = [
    'Accruals', 'AnalystValue', 'AssetGrowth', 'BM', 'BPEBM', 'BookLeverage',
    'CBOperProf', 'CF', 'ChAssetTurnover', 'ChNWC', 'CompEquIss', 'CompositeDebtIssuance',
    'EBM', 'EP', 'EarningsSurprise', 'FirmAge', 'GP', 'Herf', 'InvGrowth',
    'NOA', 'OperProf', 'PS', 'RDAbility', 'RoE', 'SP', 'ShareIss1Y', 'ShareIss5Y',
    'XFIN', 'cfp', 'roaq'
]

return_factors = [
    'Beta', 'BetaLiquidityPS', 'CPVolSpread', 'Coskewness', 'CustomerMomentum',
    'DolVol', 'High52', 'IdioVol3F', 'Illiquidity', 'IntMom', 'LRreversal', 'MaxRet',
    'Mom12m', 'Mom6m', 'Mom6mJunk', 'MomOffSeason', 'MomOffSeason06YrPlus',
    'MomOffSeason11YrPlus', 'MomOffSeason16YrPlus', 'MomSeason', 'MomSeason06YrPlus',
    'MomSeason11YrPlus', 'MomSeason16YrPlus', 'MomSeasonShort', 'RIVolSpread',
    'ResidualMomentum', 'STreversal', 'Size', 'VolMkt', 'VolSD', 'std_turn', 'Frontier'
]


In [148]:
# Split the dataset
accounting_data = merged_data[merged_data['predictor'].isin(accounting_factors)]
return_data = merged_data[merged_data['predictor'].isin(return_factors)]

In [150]:
# ========== Generic Regression Function ========== #
def calculate_ff3_alpha(df, freq):
    """
    Calculate FF3 Alpha and T-Value (retain original percentage units)
    freq: 'Y' for annual / 'ME' for monthly
    """
    results = []
    
    # Group data by frequency
    df_grouped = df.set_index('date').groupby(
        ['predictor', pd.Grouper(freq=freq)]
    ).agg({
        'portLS': 'mean',     # Long-short portfolio return
        'Mkt-RF': 'mean',
        'SMB': 'mean',
        'HML': 'mean',
        'RF': 'mean'
    }).reset_index()
    
    # Perform regression for each factor
    for factor, group in df_grouped.groupby('predictor'):
        # Prepare data (retain original percentage units)
        X = group[['Mkt-RF', 'SMB', 'HML']]
        y = group['portLS'] - group['RF']  # Excess return (in percentage)
        
        if len(y) < 3:  # Require at least 3 observations for regression
            continue
        
        # Add intercept term
        X = sm.add_constant(X)
        
        # Perform regression with Newey-West standard errors
        nw_lags = 1 if freq == 'Y' else 12  # 1 lag for annual, 12 lags for monthly
        model = sm.OLS(y, X)
        results_nw = model.fit(cov_type='HAC', cov_kwds={'maxlags': nw_lags})
        
        # Extract Alpha and T-Value
        alpha = results_nw.params['const']*10
        t_value = results_nw.tvalues['const']
        
        results.append({
            'Factor': factor,
            'Frequency': freq,
            'Alpha': alpha,
            'T-Value': t_value,
            'Obs Count': len(y),
            'Start Date': group['date'].min().strftime('%Y-%m-%d'),
            'End Date': group['date'].max().strftime('%Y-%m-%d')
        })
    
    return pd.DataFrame(results)

In [152]:
accounting_alpha = calculate_ff3_alpha(accounting_data, 'YE')
return_alpha = calculate_ff3_alpha(return_data, 'ME')  # 'ME' for month-end frequency

In [154]:
# ========== Print Results ========== #
print("\nAccounting-based Factors Alpha (Annual Rebalancing, Unit: %):")
print(accounting_alpha.round({'Alpha (%)': 4, 'T-Value': 2}))
print("\nReturn-based Factors Alpha (Monthly Rebalancing, Unit: %):")
print(return_alpha.round({'Alpha (%)': 4, 'T-Value': 2}))


Accounting-based Factors Alpha (Annual Rebalancing, Unit: %):
                   Factor Frequency     Alpha  T-Value  Obs Count  Start Date  \
0                Accruals        YE -0.001319    -0.05         24  2000-12-31   
1            AnalystValue        YE  0.155249     2.32         24  2000-12-31   
2             AssetGrowth        YE  0.009539     0.13         24  2000-12-31   
3                      BM        YE -0.036533    -0.67         24  2000-12-31   
4                   BPEBM        YE -0.017738    -0.37         24  2000-12-31   
5            BookLeverage        YE -0.078733    -0.85         24  2000-12-31   
6              CBOperProf        YE  0.313273     6.31         24  2000-12-31   
7                      CF        YE  0.197223     1.12         24  2000-12-31   
8         ChAssetTurnover        YE -0.092633    -2.58         24  2000-12-31   
9                   ChNWC        YE -0.145940    -3.31         24  2000-12-31   
10             CompEquIss        YE  0.128415 

In [156]:
accounting_alpha.to_csv('accounting_alpha_annual.csv', index=False)
return_alpha.to_csv('return_alpha_monthly.csv', index=False)

In [158]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# ========== 数据加载 ========== #
merged_data = pd.read_csv('data/merged_data_with_ff5.csv')
merged_data['date'] = pd.to_datetime(merged_data['date'])  # 确保日期格式

accounting_factors = [
    'Accruals', 'AnalystValue', 'AssetGrowth', 'BM', 'BPEBM', 'BookLeverage',
    'CBOperProf', 'CF', 'ChAssetTurnover', 'ChNWC', 'CompEquIss', 'CompositeDebtIssuance',
    'EBM', 'EP', 'EarningsSurprise', 'FirmAge', 'GP', 'Herf', 'InvGrowth',
    'NOA', 'OperProf', 'PS', 'RDAbility', 'RoE', 'SP', 'ShareIss1Y', 'ShareIss5Y',
    'XFIN', 'cfp', 'roaq'
]

return_factors = [
    'Beta', 'BetaLiquidityPS', 'CPVolSpread', 'Coskewness', 'CustomerMomentum',
    'DolVol', 'High52', 'IdioVol3F', 'Illiquidity', 'IntMom', 'LRreversal', 'MaxRet',
    'Mom12m', 'Mom6m', 'Mom6mJunk', 'MomOffSeason', 'MomOffSeason06YrPlus',
    'MomOffSeason11YrPlus', 'MomOffSeason16YrPlus', 'MomSeason', 'MomSeason06YrPlus',
    'MomSeason11YrPlus', 'MomSeason16YrPlus', 'MomSeasonShort', 'RIVolSpread',
    'ResidualMomentum', 'STreversal', 'Size', 'VolMkt', 'VolSD', 'std_turn', 'Frontier'
]

accounting_data = merged_data[merged_data['predictor'].isin(accounting_factors)]
return_data = merged_data[merged_data['predictor'].isin(return_factors)]

In [160]:
# ========== Generic Regression Function ========== #
def calculate_ff5_alpha(df, freq):
    """
    Calculate FF5 Alpha and T-Value (retain original percentage units)
    freq: 'YE' for annual / 'ME' for monthly
    """
    results = []
    
    # Group data by frequency
    df_grouped = df.set_index('date').groupby(
        ['predictor', pd.Grouper(freq=freq)]
    ).agg({
        'portLS': 'mean',     # Long-short portfolio return
        'Mkt-RF': 'mean',
        'SMB': 'mean',
        'HML': 'mean',
        'RMW': 'mean',
        'CMA': 'mean',
        'RF': 'mean'
    }).reset_index()
    
    # Perform regression for each factor
    for factor, group in df_grouped.groupby('predictor'):
        # Prepare data (retain original percentage units)
        X = group[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']]
        y = group['portLS'] - group['RF']  # Excess return (in percentage)
        
        if len(y) < 5:  # FF5 requires at least 5 observations
            continue
        
        # Add intercept term
        X = sm.add_constant(X)
        
        # Perform regression with Newey-West standard errors
        nw_lags = 1 if freq == 'Y' else 12  # 1 lag for annual, 12 lags for monthly
        model = sm.OLS(y, X)
        results_nw = model.fit(cov_type='HAC', cov_kwds={'maxlags': nw_lags})
        
        # Extract Alpha and T-Value
        alpha = results_nw.params['const']*10
        t_value = results_nw.tvalues['const']
        
        results.append({
            'Factor': factor,
            'Frequency': freq,
            'Alpha': alpha,
            'T-Value': t_value,
            'Obs Count': len(y),
            'Start Date': group['date'].min().strftime('%Y-%m-%d'),
            'End Date': group['date'].max().strftime('%Y-%m-%d')
        })
    
    return pd.DataFrame(results)


In [162]:
accounting_alpha = calculate_ff5_alpha(accounting_data, 'YE')
return_alpha = calculate_ff5_alpha(return_data, 'ME')  # 'ME' for month-end frequency
# ========== Print Results ========== #
print("\nAccounting-based Factors FF5 Alpha (Annual Rebalancing, Unit: %):")
print(accounting_alpha.round({'Alpha (%)': 4, 'T-Value': 2}))
print("\nReturn-based Factors FF5 Alpha (Monthly Rebalancing, Unit: %):")
print(return_alpha.round({'Alpha (%)': 4, 'T-Value': 2}))


Accounting-based Factors FF5 Alpha (Annual Rebalancing, Unit: %):
                   Factor Frequency     Alpha  T-Value  Obs Count  Start Date  \
0                Accruals        YE  0.090157     3.08         24  2000-12-31   
1            AnalystValue        YE -0.062369    -0.56         24  2000-12-31   
2             AssetGrowth        YE -0.005127    -0.22         24  2000-12-31   
3                      BM        YE  0.135669     1.27         24  2000-12-31   
4                   BPEBM        YE -0.121931    -1.88         24  2000-12-31   
5            BookLeverage        YE -0.018327    -0.17         24  2000-12-31   
6              CBOperProf        YE  0.054213     0.53         24  2000-12-31   
7                      CF        YE  0.093761     0.68         24  2000-12-31   
8         ChAssetTurnover        YE -0.028365    -0.70         24  2000-12-31   
9                   ChNWC        YE -0.201374    -4.27         24  2000-12-31   
10             CompEquIss        YE  0.134

In [164]:
accounting_alpha.to_csv('accounting_alpha_annual_ff5.csv', index=False)
return_alpha.to_csv('return_alpha_monthly_ff5.csv', index=False)

In [166]:
data_1 = pd.read_csv('accounting_factors_annual.csv')
data_2 = pd.read_csv('accounting_alpha_annual.csv')
data_3 = pd.read_csv('accounting_alpha_annual_ff5.csv')

In [168]:
print(data_1.columns.tolist())
print(data_2.columns.tolist())
print(data_3.columns.tolist())

['Factor', 'Frequency', 'Average Return', 'T-Value', 'Obs Count']
['Factor', 'Frequency', 'Alpha', 'T-Value', 'Obs Count', 'Start Date', 'End Date']
['Factor', 'Frequency', 'Alpha', 'T-Value', 'Obs Count', 'Start Date', 'End Date']


In [170]:
import pandas as pd

# Load the three files
file1 = pd.read_csv('accounting_factors_annual.csv')  # First file
file2 = pd.read_csv('accounting_alpha_annual.csv')    # Second file
file3 = pd.read_csv('accounting_alpha_annual_ff5.csv')  # Third file

# Extract the required columns
file1 = file1[['Factor', 'Average Return', 'T-Value']]
file2 = file2[['Factor', 'Alpha', 'T-Value']]
file3 = file3[['Factor', 'Alpha', 'T-Value']]

# Rename columns to avoid conflicts
file2 = file2.rename(columns={
    'Alpha': 'Alpha_FF3',
    'T-Value': 'T-Value_FF3'
})
file3 = file3.rename(columns={
    'Alpha': 'Alpha_FF5',
    'T-Value': 'T-Value_FF5'
})

In [172]:
# Merge the three tables using 'Factor' as the key
merged_df = pd.merge(file1, file2, on='Factor', how='inner')
merged_df = pd.merge(merged_df, file3, on='Factor', how='inner')

# Rename columns for clarity
merged_df = merged_df.rename(columns={
    'Average Return': 'Average_Return',
    'T-Value': 'T-Value_Original'
})

# Add an ID column
merged_df.insert(0, 'ID', range(1, len(merged_df) + 1))

# Save the result
merged_df.to_csv('combined_accounting_factors.csv', index=False)

# Print the result
print("Merged Table:")
print(merged_df.round({
    'Average_Return': 4,
    'T-Value_Original': 2,
    'Alpha_FF3': 4,
    'T-Value_FF3': 2,
    'Alpha_FF5': 4,
    'T-Value_FF5': 2
}))

Merged Table:
    ID                 Factor  Average_Return  T-Value_Original  Alpha_FF3  \
0    1               Accruals          0.5121              8.43    -0.0013   
1    2           AnalystValue          0.8028              6.19     0.1552   
2    3            AssetGrowth          0.5548              6.31     0.0095   
3    4                     BM          0.6361             13.68    -0.0365   
4    5                  BPEBM          0.6677              5.70    -0.0177   
5    6           BookLeverage          0.6700              4.89    -0.0787   
6    7             CBOperProf          0.5878              6.28     0.3133   
7    8                     CF          0.7120              9.16     0.1972   
8    9        ChAssetTurnover          0.4920              6.85    -0.0926   
9   10                  ChNWC          0.4535              5.87    -0.1459   
10  11             CompEquIss          0.5896              9.91     0.1284   
11  12  CompositeDebtIssuance          0.5130     

In [174]:
import pandas as pd

# Load the three files
file1 = pd.read_csv('return_factors_monthly.csv')  # First file
file2 = pd.read_csv('return_alpha_monthly.csv')    # Second file
file3 = pd.read_csv('return_alpha_monthly_ff5.csv')  # Third file

# Extract the required columns
file1 = file1[['Factor', 'Average Return', 'T-Value']]
file2 = file2[['Factor', 'Alpha', 'T-Value']]
file3 = file3[['Factor', 'Alpha', 'T-Value']]

# Rename columns to avoid conflicts
file2 = file2.rename(columns={
    'Alpha': 'Alpha_FF3',
    'T-Value': 'T-Value_FF3'
})
file3 = file3.rename(columns={
    'Alpha': 'Alpha_FF5',
    'T-Value': 'T-Value_FF5'
})

In [176]:
# Merge the three tables using 'Factor' as the key
merged_df = pd.merge(file1, file2, on='Factor', how='inner')
merged_df = pd.merge(merged_df, file3, on='Factor', how='inner')

# Rename columns for clarity
merged_df = merged_df.rename(columns={
    'Average Return': 'Average_Return',
    'T-Value': 'T-Value_Original'
})

# Add an ID column
merged_df.insert(0, 'ID', range(1, len(merged_df) + 1))

# Save the result
merged_df.to_csv('combined_return_factors.csv', index=False)

# Print the result
print("Merged Table:")
print(merged_df.round({
    'Average_Return': 4,
    'T-Value_Original': 2,
    'Alpha_FF3': 4,
    'T-Value_FF3': 2,
    'Alpha_FF5': 4,
    'T-Value_FF5': 2
}))

Merged Table:
    ID                Factor  Average_Return  T-Value_Original  Alpha_FF3  \
0    1                  Beta          0.9582              6.66    -0.4776   
1    2       BetaLiquidityPS          0.4983              8.10    -0.0633   
2    3           CPVolSpread          0.5104              8.54     0.2674   
3    4            Coskewness          0.5577              8.06    -0.0408   
4    5      CustomerMomentum          1.0177              7.08     0.0680   
5    6                DolVol          0.6268             12.25    -0.0199   
6    7              Frontier          0.7290              8.70    -0.0790   
7    8                High52          0.7753              6.98     0.2193   
8    9             IdioVol3F          0.9025              6.57     0.4836   
9   10           Illiquidity          0.4920             13.58     0.0059   
10  11                IntMom          0.8212              9.11     0.0634   
11  12            LRreversal          0.7631             11.03