In [41]:
import pandas as pd
import numpy as np
from scipy import stats

def read_excel_file(file_path):
    """Load an Excel workbook into a DataFrame via ``pd.read_excel``.

    Any failure is reported on stdout together with the offending path and
    then re-raised unchanged, so the caller decides how to react.
    """
    try:
        frame = pd.read_excel(file_path)
    except Exception as err:
        print(f"读取文件 {file_path} 时出错: {str(err)}")
        raise
    else:
        return frame

# Keys of the metrics dictionary, in the order they are reported.
_PORTFOLIO_METRIC_KEYS = (
    'mean_monthly_return', 'annualized_return', 'monthly_std_dev',
    'annualized_std_dev', 't_statistic', 't_stat_p_value',
    'annualized_sharpe_ratio', 'maximum_drawdown',
)


def _clean_window(series, start, end):
    """Return a float copy of *series* limited to [start, end], NaNs dropped, sorted by date."""
    windowed = series.copy()  # never touch the caller's object
    windowed.index = pd.to_datetime(windowed.index)
    keep = (windowed.index >= start) & (windowed.index <= end)
    return windowed[keep].astype(float).dropna().sort_index()


def calculate_portfolio_metrics(returns_series, rf_series,
                                start_date='2007-01-01', end_date='2023-12-31',
                                periods_per_year=12):
    """Compute summary performance metrics for a periodic return series.

    Args:
        returns_series: Series of simple per-period returns whose index can be
            parsed by ``pd.to_datetime``. It is not modified.
        rf_series: Series of per-period risk-free rates with a date-like
            index. It is not modified.
        start_date: First date (inclusive) of the evaluation window.
        end_date: Last date (inclusive) of the evaluation window.
        periods_per_year: Annualisation factor (12 for monthly data).

    Returns:
        dict with the keys listed in ``_PORTFOLIO_METRIC_KEYS``. Every value
        is NaN when the window holds no valid returns or when an error
        occurs; the problem is reported on stdout in both cases.

    Notes:
        * Missing returns are dropped before any statistic is computed, so
          the t-statistic uses the number of valid observations.
        * The risk-free rate is the mean of ``rf_series`` over the same date
          window; it is not matched row by row to the returns.
        * Maximum drawdown is measured against a running peak that starts at
          the initial capital of 1.0, so a loss in the very first period
          counts as a drawdown.
    """
    nan_metrics = dict.fromkeys(_PORTFOLIO_METRIC_KEYS, np.nan)
    try:
        start = pd.Timestamp(start_date)
        end = pd.Timestamp(end_date)

        returns = _clean_window(returns_series, start, end)
        rf_rates = _clean_window(rf_series, start, end)

        if returns.empty:
            print("警告：所选时间范围内没有收益率数据")
            return nan_metrics

        n_obs = len(returns)
        mean_return = returns.mean()
        std_dev = returns.std()

        # The t-test of "mean return == 0" needs at least two observations
        # and a strictly positive dispersion.
        if n_obs > 1 and std_dev > 0:
            t_stat = mean_return / (std_dev / np.sqrt(n_obs))
            # sf() keeps precision in the tail where 1 - cdf() rounds to 0.
            p_value = 2 * stats.t.sf(abs(t_stat), n_obs - 1)
        else:
            t_stat = np.nan
            p_value = np.nan

        annual_return = mean_return * periods_per_year
        annual_rf = rf_rates.mean() * periods_per_year
        annual_std = std_dev * np.sqrt(periods_per_year)
        sharpe_ratio = (annual_return - annual_rf) / annual_std if annual_std > 0 else np.nan

        # Wealth path of 1 unit invested at the start of the window.
        cumulative_returns = (1 + returns).cumprod()
        # The peak can never be below the initial capital.
        running_peak = cumulative_returns.cummax().clip(lower=1.0)
        drawdowns = (cumulative_returns - running_peak) / running_peak
        max_drawdown = drawdowns.min()

        return {
            'mean_monthly_return': mean_return,
            'annualized_return': annual_return,
            'monthly_std_dev': std_dev,
            'annualized_std_dev': annual_std,
            't_statistic': t_stat,
            't_stat_p_value': p_value,
            'annualized_sharpe_ratio': sharpe_ratio,
            'maximum_drawdown': max_drawdown
        }

    except Exception as e:
        # Best effort: one bad portfolio must not abort the whole report.
        print(f"计算投资组合指标时出错: {str(e)}")
        return nan_metrics

def calculate_ic(indicator_file, indicator_config, portfolio_name):
    """Compute the rank information coefficient: IC = (1/T) * sum_t Corr(X_t, R_{t+1}).

    For each pair of consecutive dates present in the data, the Spearman
    correlation is taken between the cross-sectionally standardised
    indicator at the earlier date and the return at the later date, matched
    by stock identifier.

    Args:
        indicator_file: Path of the Excel file, read with ``read_excel_file``.
        indicator_config: Mapping with the column names ``'date_col'``,
            ``'stock_col'``, ``'value_col'`` and ``'return_col'``.
        portfolio_name: Label used only in the printed messages.

    Returns:
        dict with ``'ic_mean'``, ``'ic_std'`` (population standard deviation,
        as computed by ``np.std``) and ``'ic_ir'`` (mean / std, NaN when the
        std is zero). All three are NaN when no correlation could be
        computed or when an error occurs.

    Notes:
        * Only rows dated 2007-01-01 .. 2023-12-31 are used.
        * "Consecutive" means adjacent dates that exist in the file; a gap in
          the data is not detected.
        * A date pair contributes only if at least 30 stocks have both a
          valid indicator and a valid return.
    """
    nan_result = {'ic_mean': np.nan, 'ic_std': np.nan, 'ic_ir': np.nan}
    try:
        df = read_excel_file(indicator_file)

        date_col = indicator_config['date_col']
        stock_col = indicator_config['stock_col']
        value_col = indicator_config['value_col']
        return_col = indicator_config['return_col']

        df[date_col] = pd.to_datetime(df[date_col])

        # Restrict to the study window; copy so later assignments do not
        # write into a view of the original frame.
        date_mask = (df[date_col] >= '2007-01-01') & (df[date_col] <= '2023-12-31')
        df = df[date_mask].copy()

        if len(df) == 0:
            print(f"警告：{portfolio_name} 在指定时间范围内没有数据")
            return nan_result

        # Treat infinities as missing values.
        df[value_col] = df[value_col].replace([np.inf, -np.inf], np.nan)
        df[return_col] = df[return_col].replace([np.inf, -np.inf], np.nan)

        # Cross-sectional z-score of the indicator within each date.
        df['indicator_std'] = df.groupby(date_col)[value_col].transform(
            lambda x: (x - x.mean()) / x.std()
        )

        # Split once by date instead of rescanning the frame for every
        # period. The keys are pandas Timestamps, so .strftime is available.
        by_date = {pd.Timestamp(date): frame for date, frame in df.groupby(date_col)}
        ordered_dates = sorted(by_date)
        correlations = []

        for current_date, next_date in zip(ordered_dates, ordered_dates[1:]):
            merged = pd.merge(
                by_date[current_date][[stock_col, 'indicator_std']],
                by_date[next_date][[stock_col, return_col]],
                on=stock_col
            )
            # Drop incomplete pairs row-wise so that both inputs of the
            # correlation stay aligned stock by stock.
            merged = merged.dropna(subset=['indicator_std', return_col])

            if len(merged) >= 30:
                corr, _ = stats.spearmanr(merged['indicator_std'], merged[return_col])
                if not np.isnan(corr):
                    correlations.append(corr)
                    print(f"期间 {current_date.strftime('%Y-%m')} -> {next_date.strftime('%Y-%m')}: "
                          f"样本数={len(merged)}, IC={corr:.4f}")

        if not correlations:
            print(f"警告: {portfolio_name} 未能计算出有效的IC值")
            return nan_result

        ic_mean = np.mean(correlations)
        ic_std = np.std(correlations)
        # The information ratio is undefined when the IC has no dispersion.
        ic_ir = ic_mean / ic_std if ic_std != 0 else np.nan

        print(f"\n{portfolio_name} IC统计:")
        print(f"计算期数: {len(correlations)}")
        print(f"IC均值: {ic_mean:.4f}")
        print(f"IC标准差: {ic_std:.4f}")
        print(f"IC_IR: {ic_ir:.4f}")

        return {
            'ic_mean': ic_mean,
            'ic_std': ic_std,
            'ic_ir': ic_ir
        }

    except Exception as e:
        # Best effort: report the problem and return NaN statistics.
        print(f"IC计算出错: {str(e)}")
        return nan_result

def process_portfolios(portfolio_configs, rf_config, output_file):
    """Build the performance report for every configured portfolio.

    For each entry of ``portfolio_configs`` the value-weighted and the
    equal-weighted return columns are evaluated with
    ``calculate_portfolio_metrics``; both rows share the IC statistics from
    ``calculate_ic``. A portfolio that raises is reported on stdout and
    skipped.

    Args:
        portfolio_configs: Iterable of dicts with the keys ``'name'``,
            ``'portfolio_file'``, ``'indicator_file'``, ``'portfolio'``
            (``'date_col'``, ``'vw_return_col'``, ``'ew_return_col'``) and
            ``'indicator'`` (passed through to ``calculate_ic``).
        rf_config: Dict with ``'file'`` (CSV path), ``'date_col'`` and
            ``'rate_col'`` describing the risk-free rate table.
        output_file: Destination CSV path (written as UTF-8 with BOM).

    Returns:
        DataFrame with Chinese column headers and all metric values already
        formatted as strings.

    Raises:
        ValueError: If no portfolio could be processed.
        Exception: Any other failure is printed and re-raised.
    """
    try:
        # Risk-free rates, indexed by their parsed date column.
        rf_table = pd.read_csv(rf_config['file'], encoding='utf-8-sig')
        rf_date_col = rf_config['date_col']
        rf_table[rf_date_col] = pd.to_datetime(rf_table[rf_date_col])
        rf_series = rf_table.set_index(rf_date_col)[rf_config['rate_col']]

        collected = []

        for cfg in portfolio_configs:
            try:
                print(f"\n处理投资组合: {cfg['name']}")

                raw = read_excel_file(cfg['portfolio_file'])
                pf_cols = cfg['portfolio']
                pf_date_col = pf_cols['date_col']
                raw[pf_date_col] = pd.to_datetime(raw[pf_date_col])
                portfolio_df = raw.set_index(pf_date_col).sort_index()

                # Value-weighted portfolio.
                vw_row = calculate_portfolio_metrics(
                    portfolio_df[pf_cols['vw_return_col']], rf_series
                )

                # The IC does not depend on the weighting scheme, so it is
                # computed once and attached to both rows.
                ic_stats = calculate_ic(
                    cfg['indicator_file'], cfg['indicator'], cfg['name']
                )

                vw_row.update(ic_stats)
                vw_row.update(
                    portfolio_name=f"{cfg['name']}_VW",
                    weight_scheme='市值加权',
                )
                collected.append(vw_row)

                # Equal-weighted portfolio.
                ew_row = calculate_portfolio_metrics(
                    portfolio_df[pf_cols['ew_return_col']], rf_series
                )
                ew_row.update(ic_stats)
                ew_row.update(
                    portfolio_name=f"{cfg['name']}_EW",
                    weight_scheme='等权重',
                )
                collected.append(ew_row)

            except Exception as err:
                print(f"处理投资组合 {cfg['name']} 时出错: {str(err)}")

        if not collected:
            raise ValueError("没有成功处理任何投资组合")

        results_df = pd.DataFrame(collected)

        # Identification columns first, metrics after them in their
        # existing order.
        id_cols = ['portfolio_name', 'weight_scheme']
        metric_cols = [name for name in results_df.columns if name not in id_cols]
        results_df = results_df[id_cols + metric_cols]

        results_df = results_df.rename(columns={
            'portfolio_name': '投资组合',
            'weight_scheme': '权重方案',
            'mean_monthly_return': '月均收益率',
            'annualized_return': '年化收益率',
            'monthly_std_dev': '月度标准差',
            'annualized_std_dev': '年化标准差',
            't_statistic': 't统计量',
            't_stat_p_value': 'p值',
            'annualized_sharpe_ratio': '年化夏普比率',
            'maximum_drawdown': '最大回撤率',
            'ic_mean': 'IC均值',
            'ic_std': 'IC标准差',
            'ic_ir': 'IC_IR',
        })

        # Display format per column: percentages, two-decimal numbers, and
        # a four-decimal p-value.
        column_formats = {
            '月均收益率': '{:.2%}',
            '年化收益率': '{:.2%}',
            '月度标准差': '{:.2%}',
            '年化标准差': '{:.2%}',
            '最大回撤率': '{:.2%}',
            '年化夏普比率': '{:.2f}',
            't统计量': '{:.2f}',
            'IC均值': '{:.2f}',
            'IC标准差': '{:.2f}',
            'IC_IR': '{:.2f}',
            'p值': '{:.4f}',
        }
        for column, spec in column_formats.items():
            results_df[column] = results_df[column].apply(spec.format)

        # Persist the report.
        results_df.to_csv(output_file, index=False, encoding='utf-8-sig')

        return results_df

    except Exception as err:
        print(f"处理过程出错: {str(err)}")
        raise

In [46]:
# Example usage
if __name__ == "__main__":
    # Every factor shares the same column layout; only the name and the two
    # input workbooks differ.
    portfolio_columns = {
        'date_col': 'year_month',
        'vw_return_col': 'return_spread_mv',
        'ew_return_col': 'return_spread_eq'
    }
    indicator_columns = {
        'date_col': 'year_month',
        'stock_col': 'code',
        'value_col': 'indicator_value',
        'return_col': 'monthly_return'
    }
    # (name, portfolio workbook, indicator workbook)
    factor_files = [
        ('dfin', 'dfin_standardized.xlsx', 'dfin_standardized_indicators.xlsx'),
        ('ami3', 'ami3.xlsx', 'ami3_indicators.xlsx'),
        ('ami6', 'ami6.xlsx', 'ami6_indicators.xlsx'),
        ('ami9', 'ami9.xlsx', 'ami9_indicators.xlsx'),
        ('ami12', 'ami12.xlsx', 'ami12_indicators.xlsx'),
    ]

    portfolio_configs = [
        {
            'name': factor_name,
            'portfolio_file': portfolio_path,
            'indicator_file': indicator_path,
            # Fresh dict per entry so the configs do not share state.
            'portfolio': dict(portfolio_columns),
            'indicator': dict(indicator_columns),
        }
        for factor_name, portfolio_path, indicator_path in factor_files
    ]

    rf_config = {
        'file': 'monthly_risk_free.csv',
        'date_col': 'year_month',
        'rate_col': 'risk_free_month'
    }

    results = process_portfolios(
        portfolio_configs=portfolio_configs,
        rf_config=rf_config,
        output_file='portfolio_analysis_results.csv'
    )


处理投资组合: dfin
期间 2007-01 -> 2007-02: 样本数=1165, IC=0.0135
期间 2007-02 -> 2007-03: 样本数=1187, IC=0.0630
期间 2007-03 -> 2007-04: 样本数=1193, IC=-0.0205
期间 2007-04 -> 2007-05: 样本数=1169, IC=-0.0071
期间 2007-05 -> 2007-06: 样本数=1183, IC=-0.0913
期间 2007-06 -> 2007-07: 样本数=1191, IC=-0.0109
期间 2007-07 -> 2007-08: 样本数=1197, IC=-0.0250
期间 2007-08 -> 2007-09: 样本数=1190, IC=-0.0349
期间 2007-09 -> 2007-10: 样本数=1182, IC=-0.0388
期间 2007-10 -> 2007-11: 样本数=1177, IC=0.0710
期间 2007-11 -> 2007-12: 样本数=1182, IC=-0.0162
期间 2007-12 -> 2008-01: 样本数=1176, IC=-0.0397
期间 2008-01 -> 2008-02: 样本数=1172, IC=0.0151
期间 2008-02 -> 2008-03: 样本数=1173, IC=-0.0072
期间 2008-03 -> 2008-04: 样本数=1173, IC=-0.0132
期间 2008-04 -> 2008-05: 样本数=1172, IC=-0.0077
期间 2008-05 -> 2008-06: 样本数=1260, IC=0.0069
期间 2008-06 -> 2008-07: 样本数=1280, IC=0.0195
期间 2008-07 -> 2008-08: 样本数=1282, IC=0.0818
期间 2008-08 -> 2008-09: 样本数=1290, IC=0.0064
期间 2008-09 -> 2008-10: 样本数=1299, IC=0.0439
期间 2008-10 -> 2008-11: 样本数=1317, IC=-0.0601
期间 2008-11 -> 2008-12: 样本数=