# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from rich.console import Console
from rich.table import Table
from sklearn import clone
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from typing import Dict, List, Tuple
import joblib
from scipy import stats
from scipy.optimize import minimize
from rich.progress import Progress
class OlympicMedalPredictor:
    def __init__(self):
        """Create the console, the four base regressors and the empty result stores."""
        self.console = Console()

        # Hyper-parameters shared by the three boosting regressors.
        boosting_params = dict(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=4,
            random_state=42,
        )

        self.models = {
            'gbm': GradientBoostingRegressor(**boosting_params),
            'rf': RandomForestRegressor(
                n_estimators=200,
                max_depth=6,
                random_state=42,
            ),
            'xgb': xgb.XGBRegressor(**boosting_params),
            # LightGBM additionally gets explicit leaf-size and regularisation settings.
            'lgb': lgb.LGBMRegressor(
                min_child_samples=20,
                min_child_weight=1e-3,
                reg_lambda=1.0,
                reg_alpha=0.0,
                **boosting_params,
            ),
        }

        # Filled in by train_models().
        self.model_weights = {}
        self.feature_importance = {}
        self.predictions_store = {}
        
    def prepare_data(self, features_df: pd.DataFrame, historical_data: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, pd.Series], List[str]]:
        """Join the medal counts onto the features and split them into X / y.

        The join is a left join on ``NOC`` and ``Year``, so every feature row is
        kept; rows without a matching medal record get NaN targets.

        Returns:
            A tuple ``(X, y, feature_cols)``: the feature matrix cast to float,
            a dict with the ``'Gold'`` and ``'Total'`` target series (also
            float), and the list of feature column names (every merged column
            except ``NOC`` and the two targets).
        """
        medal_columns = ['Gold', 'Total']
        history_slice = historical_data[['NOC', 'Year'] + medal_columns]
        merged = pd.merge(features_df, history_slice, how='left', on=['NOC', 'Year'])

        # Everything that is neither the country key nor a target is a feature.
        non_features = {'NOC', *medal_columns}
        feature_cols = [name for name in merged.columns if name not in non_features]

        X = merged[feature_cols].astype(float)
        y = {medal: merged[medal].astype(float) for medal in medal_columns}

        return X, y, feature_cols
    


    def _optimize_ensemble_weights(self, 
                                predictions: Dict[str, np.ndarray], 
                                target: pd.Series,
                                base_scores: Dict[str, float],
                                constraints: Dict) -> Dict[str, float]:
        """Optimise non-negative ensemble weights that sum to one.

        The objective is ``-R²`` of the weighted prediction against ``target``
        plus a quadratic penalty (scaled by 0.1) when the sum of the ensemble
        prediction falls outside ``constraints['total_range']``.

        Args:
            predictions: Per-model prediction arrays, all of the same length.
            target: True values the ensemble is scored against.
            base_scores: Per-model scores used to build the starting point.
                They may be negative (R² is unbounded below); negative and
                non-finite scores are treated as zero.
            constraints: Must contain ``'total_range'`` as ``(low, high)``.

        Returns:
            Mapping of model name to weight, in the order of ``predictions``.
            Empty when ``predictions`` is empty.
        """
        model_names = list(predictions)
        n_models = len(model_names)
        if n_models == 0:
            return {}

        # Stack once so each objective evaluation is a single matrix product;
        # the float dtype keeps integer-valued predictions from breaking the
        # weighted sum.
        pred_matrix = np.vstack(
            [np.asarray(predictions[name], dtype=float) for name in model_names]
        )
        lower_total, upper_total = constraints['total_range']
        uniform = np.full(n_models, 1.0 / n_models)

        def normalise(w):
            # Map any weight vector onto the simplex; fall back to equal
            # weights when nothing positive (or nothing finite) is left.
            w = np.clip(np.asarray(w, dtype=float), 0.0, None)
            total = w.sum()
            if np.isfinite(total) and total > 0:
                return w / total
            return uniform.copy()

        def objective(w):
            ensemble_pred = normalise(w) @ pred_matrix
            pred_total = ensemble_pred.sum()

            # Penalise ensembles whose summed prediction leaves the allowed range.
            penalty = 0.0
            if pred_total < lower_total:
                penalty = (lower_total - pred_total) ** 2
            elif pred_total > upper_total:
                penalty = (pred_total - upper_total) ** 2

            return -r2_score(target, ensemble_pred) + penalty * 0.1

        # A negative or zero score sum would otherwise yield a start point with
        # flipped signs or NaN, outside the (0, 1) bounds handed to SLSQP.
        raw_scores = np.array([base_scores[name] for name in model_names], dtype=float)
        initial = normalise(np.nan_to_num(raw_scores, nan=0.0, posinf=0.0, neginf=0.0))

        constraints_opt = (
            {'type': 'eq', 'fun': lambda w: np.sum(w) - 1},
        )
        bounds = [(0, 1) for _ in range(n_models)]

        result = minimize(
            objective, 
            initial, 
            method='SLSQP',
            constraints=constraints_opt,
            bounds=bounds,
            options={'maxiter': 1000}
        )

        candidate = np.asarray(result.x, dtype=float)
        if np.isfinite(candidate).all() and np.clip(candidate, 0.0, None).sum() > 0:
            final = normalise(candidate)
        else:
            # Degenerate optimiser output: keep the score-based start point.
            final = initial

        return {name: float(weight) for name, weight in zip(model_names, final)}
    
    def _calculate_weights(self, scores: Dict[str, float]) -> Dict[str, float]:
        """计算模型权重"""
        total = sum(scores.values())
        weights = {model: score/total for model, score in scores.items()}
        return weights
    
    def train_models(self, X: pd.DataFrame, y: Dict[str, pd.Series]) -> Dict:
        """Cross-validate and fit one copy of every base model per target.

        For each target in ``y`` and each estimator in ``self.models``:

        1. a fresh clone of the estimator is scored with a 5-fold
           ``TimeSeriesSplit`` (R² on predictions clipped to the target's
           ``min``/``max``),
        2. the clone is refit on all rows of ``X``,
        3. its feature importances (when available) are recorded.

        Afterwards ensemble weights for the target are optimised and stored in
        ``self.model_weights[target_name]``. The out-of-fold predictions and
        the matching actual values are kept in
        ``self.predictions_store[target_name][model_name]`` under the keys
        ``'predictions'`` and ``'actuals'``.

        NOTE(review): ``TimeSeriesSplit`` splits by row position, so the rows of
        ``X`` are presumably expected in chronological order -- confirm with the
        code that builds the feature table.

        Args:
            X: Feature matrix.
            y: Mapping of target name to target series; ``None`` values are skipped.

        Returns:
            ``{target_name: {model_name: fitted_estimator}}``.
        """
        trained_models = {}
        scores = {}
        
        historical_constraints = {
            'Gold': {
                'top_countries': {
                    'China': (35, 45), 'United States': (30, 40),
                    'Great Britain': (20, 30), 'ROC': (20, 30), 
                    'Japan': (20, 30)
                },
                'min': 0, 'max': 45, 'total_range': (300, 340)
            },
            'Total': {
                'top_countries': {
                    'China': (80, 100), 'United States': (80, 100),
                    'Great Britain': (50, 70), 'ROC': (50, 70),
                    'Japan': (50, 70)
                },
                'min': 0, 'max': 100, 'total_range': (950, 1100)
            }
        }
        
        with Progress() as progress:
            total_tasks = len(y) * len(self.models)
            train_progress = progress.add_task("[cyan]训练模型...", total=total_tasks)
            
            for target_name, target in y.items():
                if target is None:
                    # Keep the progress bar consistent with the announced total.
                    progress.update(train_progress, advance=len(self.models))
                    continue
                    
                trained_models[target_name] = {}
                scores[target_name] = {}
                self.predictions_store[target_name] = {}
                self.model_weights[target_name] = {}
                
                lower = historical_constraints[target_name]['min']
                upper = historical_constraints[target_name]['max']
                tscv = TimeSeriesSplit(n_splits=5)
                
                for model_name, base_model in self.models.items():
                    # Every (target, model) pair needs its own estimator object.
                    # Reusing the instance from self.models would make the
                    # entry stored for an earlier target silently point at the
                    # model refit for the later one.
                    model = clone(base_model)
                    
                    if model_name in ('xgb', 'lgb', 'gbm'):
                        model.set_params(learning_rate=0.05, n_estimators=300)
                    elif model_name == 'rf':
                        model.set_params(n_estimators=300)
                    
                    model_scores = []
                    fold_predictions = []
                    fold_actuals = []
                    
                    for train_idx, val_idx in tscv.split(X):
                        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]
                        
                        # No eval_set: without early stopping it does not
                        # influence the fit and only produces per-round log
                        # output for the boosting libraries.
                        model.fit(X_train, y_train)
                        
                        pred = np.clip(model.predict(X_val), lower, upper)
                        
                        fold_predictions.extend(pred)
                        fold_actuals.extend(y_val)
                        model_scores.append(r2_score(y_val, pred))
                    
                    # The last CV fit saw only the last fold's training rows;
                    # the model that is returned and used for forecasting
                    # is refit on the complete data set.
                    model.fit(X, target)
                    
                    trained_models[target_name][model_name] = model
                    scores[target_name][model_name] = np.mean(model_scores)
                    self.predictions_store[target_name][model_name] = {
                        'predictions': np.asarray(fold_predictions, dtype=float),
                        'actuals': np.asarray(fold_actuals, dtype=float),
                    }
                    
                    if hasattr(model, 'feature_importances_'):
                        self.feature_importance[f"{target_name}_{model_name}"] = pd.Series(
                            model.feature_importances_,
                            index=X.columns
                        ).sort_values(ascending=False)
                    
                    progress.update(train_progress, advance=1)
                
                weights = self._optimize_ensemble_weights(
                    {name: model.predict(X) for name, model in trained_models[target_name].items()},
                    target,
                    scores[target_name],
                    historical_constraints[target_name]
                )
                self.model_weights[target_name].update(weights)
        
        return trained_models

    def _calculate_optimal_weights(self, predictions: Dict, scores: Dict) -> Dict[str, float]:
        """计算最优权重"""
        base_weights = np.array([scores[model] for model in predictions.keys()])
        weights = base_weights / np.sum(base_weights)
        
        # 验证预测结果的表现
        ensemble_predictions = np.zeros_like(list(predictions.values())[0]['predictions'])
        for i, (model_name, _) in enumerate(predictions.items()):
            ensemble_predictions += weights[i] * predictions[model_name]['predictions']
        
        # 返回归一化的权重
        return dict(zip(predictions.keys(), weights))

    def predict_with_uncertainty(self, 
                            X_pred: pd.DataFrame, 
                            models: Dict, 
                            target_name: str,
                            n_iterations: int = 500,
                            random_state=None) -> Tuple[np.ndarray, np.ndarray]:
        """Estimate ensemble predictions and their spread by perturbing the features.

        In every iteration each model predicts on a copy of ``X_pred`` with
        Gaussian noise added (3 % of each column's standard deviation). The
        predictions are clipped to the hard-coded historical ranges and
        combined with ``self.model_weights[target_name]``. The mean and
        standard deviation over all iterations are returned.

        Rows are deliberately NOT resampled: the outputs are positional, so
        element ``i`` must always be the prediction for row ``i`` of ``X_pred``.

        Args:
            X_pred: Feature rows to predict. Index labels that match a key of
                the ``top_countries`` tables get that country's own clipping
                range and a reduced uncertainty.
            models: ``{target_name: {model_name: fitted_estimator}}``.
            target_name: ``'Gold'`` or ``'Total'``.
            n_iterations: Number of noisy prediction rounds (at least 1).
            random_state: Optional seed for reproducible noise. With ``None``
                the global NumPy random state is used.

        Returns:
            ``(mean_pred, std_pred)``, one entry per row of ``X_pred``.

        Raises:
            ValueError: If ``n_iterations`` is smaller than 1.
        """
        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")
        
        # Hard-coded historical ranges
        historical_constraints = {
            'Gold': {
                'top_countries': {
                    'China': (35, 45),
                    'United States': (30, 40),
                    'Great Britain': (20, 30),
                    'ROC': (20, 30),
                    'Japan': (20, 30)
                },
                'min': 0,
                'max': 45,
                'std_scale': 0.1
            },
            'Total': {
                'top_countries': {
                    'China': (80, 100),
                    'United States': (80, 100),
                    'Great Britain': (50, 70),
                    'ROC': (50, 70),
                    'Japan': (50, 70)
                },
                'min': 0,
                'max': 100,
                'std_scale': 0.15
            }
        }
        
        limits = historical_constraints[target_name]
        top_ranges = limits['top_countries']
        n_rows = len(X_pred)
        
        # Per-row clipping bounds, built once instead of in every iteration.
        lower = np.full(n_rows, limits['min'], dtype=float)
        upper = np.full(n_rows, limits['max'], dtype=float)
        is_top = np.zeros(n_rows, dtype=bool)
        for row, country in enumerate(X_pred.index):
            if country in top_ranges:
                lower[row], upper[row] = top_ranges[country]
                is_top[row] = True
        
        # Feature noise is scaled by each column's spread; a single-row frame
        # has an undefined (NaN) std, which is treated as "no noise".
        noise_scale = 0.03
        feature_std = X_pred.std().fillna(0.0).to_numpy()
        
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        target_models = models[target_name]
        target_weights = self.model_weights[target_name]
        predictions = []
        
        with Progress() as progress:
            predict_task = progress.add_task("[cyan]生成预测...", total=n_iterations)
            
            for _ in range(n_iterations):
                ensemble_pred = np.zeros(n_rows, dtype=float)
                for model_name, model in target_models.items():
                    feature_noise = rng.normal(0, noise_scale, size=X_pred.shape)
                    X_noisy = X_pred + feature_noise * feature_std
                    
                    pred = np.asarray(model.predict(X_noisy), dtype=float)
                    pred = np.clip(pred, lower, upper)
                    
                    ensemble_pred += pred * target_weights[model_name]
                
                predictions.append(ensemble_pred)
                progress.update(predict_task, advance=1)
        
        predictions = np.array(predictions)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)
        
        # Shrink the uncertainty of the rows that have their own range.
        std_pred[is_top] *= limits['std_scale']
        
        return mean_pred, std_pred
    def identify_first_time_medals(self, 
                                 predictions: np.ndarray, 
                                 historical_data: pd.DataFrame,
                                 countries: List[str],
                                 threshold: float = 0.5) -> List[str]:
        """List countries predicted to win medals that never won one before.

        A country counts as "never medaled" when the sum of its ``Total``
        column in ``historical_data`` is not positive, or when it does not
        appear in ``historical_data`` at all.

        Args:
            predictions: Predicted medal counts, aligned with ``countries``.
            historical_data: Must contain the columns ``NOC`` and ``Total``.
            countries: Country identifiers, compared against ``NOC``.
            threshold: A prediction must be strictly greater than this value
                for the country to be reported.

        Returns:
            The matching countries in the order they appear in ``countries``.
        """
        medal_totals = historical_data.groupby('NOC')['Total'].sum()
        medaled_before = set(medal_totals[medal_totals > 0].index)
        
        return [
            country
            for country, pred in zip(countries, predictions)
            if country not in medaled_before and pred > threshold
        ]
    
    def predict_2028_olympics(self, features_df: pd.DataFrame, historical_data: pd.DataFrame) -> None:
        """Run the whole pipeline: train, predict 2028, display and save.

        Args:
            features_df: Feature table; must contain ``NOC`` and ``Year``.
            historical_data: Medal table with ``NOC``, ``Year``, ``Gold`` and
                ``Total`` columns.

        Raises:
            Exception: Any error from the pipeline is printed to the console
                and re-raised unchanged.
        """
        try:
            # Prepare the training data
            X, y, feature_cols = self.prepare_data(features_df, historical_data)
            
            # Train the models
            self.console.print("\n[bold cyan]训练模型中...[/bold cyan]")
            trained_models = self.train_models(X, y)
            
            # Build the 2028 feature rows
            X_2028 = self._prepare_2028_features(features_df[['NOC'] + feature_cols])
            
            # One label per prediction row. unique() could return fewer labels
            # than rows, which would misalign the display and break the
            # results table.
            countries = X_2028['NOC'].to_numpy()
            
            # predict_with_uncertainty looks countries up through the frame's
            # index, so the NOC labels are moved there. The features are cast
            # to float like the training matrix.
            X_2028_features = X_2028.set_index('NOC')[feature_cols].astype(float)
            
            # Predict and estimate the uncertainty
            self.console.print("\n[bold cyan]生成2028年预测...[/bold cyan]")
            results = {}
            for target in ['Gold', 'Total']:
                if target in trained_models:
                    mean_pred, std_pred = self.predict_with_uncertainty(
                        X_2028_features, 
                        trained_models, 
                        target
                    )
                    results[target] = {
                        'predictions': mean_pred,
                        'uncertainty': std_pred
                    }
            
            # Countries that may win their first medal
            first_time_medalists = self.identify_first_time_medals(
                results['Total']['predictions'],
                historical_data,
                countries
            )
            
            # Show the predictions
            self._display_predictions(results, countries, first_time_medalists)
            
            # Persist models and predictions
            self._save_results(trained_models, results, countries)
            
        except Exception as e:
            self.console.print(f"[bold red]预测过程中出现错误: {str(e)}[/bold red]")
            # Bare raise keeps the original traceback intact.
            raise
    
    def _prepare_2028_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """准备2028年的特征数据"""
        # 复制最近一年的数据作为基础
        latest_year = X['Year'].max()
        X_2028 = X[X['Year'] == latest_year].copy()
        
        # 更新年份
        X_2028['Year'] = 2028
        
        # 更新时间相关特征
        time_features = ['Years_To_Next', 'Olympics_Since_Last']
        for col in time_features:
            if col in X_2028.columns:
                X_2028[col] = 4  # 假设是正常的奥运周期
        
        # 更新参与次数
        if 'Participation_Count' in X_2028.columns:
            X_2028['Participation_Count'] += 1
        
        return X_2028
    
    def _display_predictions(self, 
                        results: Dict, 
                        countries: np.ndarray, 
                        first_time_medalists: List[str]) -> None:
        """Print the prediction table and the list of first-time medalists.

        Each row shows the country, the predicted gold and total medal counts
        as ``mean (±std)`` with one decimal, and a label that is "高" for
        countries in ``first_time_medalists`` and "中" otherwise.
        """
        table = Table(title="2028洛杉矶奥运会奖牌预测")
        for heading in ("国家", "预计金牌数", "预计总奖牌数", "预测不确定性"):
            table.add_column(heading)

        def cell(medal: str, row: int) -> str:
            # "mean (±std)" for one medal type and one row.
            block = results[medal]
            return f"{block['predictions'][row]:.1f} (±{block['uncertainty'][row]:.1f})"

        for row, country in enumerate(list(countries)):
            gold_cell = cell('Gold', row)
            total_cell = cell('Total', row)
            level = "高" if country in first_time_medalists else "中"
            table.add_row(country, gold_cell, total_cell, level)

        self.console.print(table)

        # Nothing more to print when no first-time medalist is predicted.
        if not first_time_medalists:
            return

        self.console.print("\n[bold green]预计首次获得奖牌的国家:[/bold green]")
        for country in first_time_medalists:
            self.console.print(f"- {country}")
    
    def _save_results(self, 
                    models: Dict, 
                    results: Dict, 
                    countries: List[str]) -> None:
        """Write the models, the predictions and the feature importances to ``models/``.

        Every fitted model is dumped with joblib; the prediction table and the
        feature-importance table are each written as CSV and as Parquet.
        """
        out_dir = Path("models")
        out_dir.mkdir(exist_ok=True)

        # One joblib file per (target, model) pair.
        for target, fitted in models.items():
            for model_name, estimator in fitted.items():
                model_path = out_dir / f"{target}_{model_name}_model.joblib"
                joblib.dump(estimator, model_path)

        # Prediction table, one row per country.
        gold, total = results['Gold'], results['Total']
        predictions_df = pd.DataFrame({
            'Country': countries,
            'Predicted_Gold': gold['predictions'],
            'Gold_Uncertainty': gold['uncertainty'],
            'Predicted_Total': total['predictions'],
            'Total_Uncertainty': total['uncertainty']
        })
        predictions_df.to_csv(out_dir / "predictions_2028.csv", index=False)
        predictions_df.to_parquet(out_dir / "predictions_2028.parquet", index=False)

        # Feature importances collected during training, indexed by feature name.
        importance_df = pd.DataFrame(self.feature_importance)
        importance_df.to_csv(out_dir / "feature_importance.csv", index=True)
        importance_df.to_parquet(out_dir / "feature_importance.parquet", index=True)

    def generate_summary_report(self, predictions_df: pd.DataFrame, historical_data: pd.DataFrame) -> str:
        """Build the plain-text evaluation report for a set of predictions.

        Args:
            predictions_df: Table with the columns ``Predicted_Gold``,
                ``Predicted_Total``, ``Gold_Uncertainty`` and
                ``Total_Uncertainty``.
            historical_data: Medal table with ``Year``, ``Gold`` and ``Total``.

        Returns:
            The report as a single newline-joined string with five numbered
            sections. Sections 4 and 5 are fixed headings only.
        """
        summary = []
        
        # 1. Basic statistics
        total_countries = len(predictions_df)
        avg_gold = predictions_df['Predicted_Gold'].mean()
        avg_total = predictions_df['Predicted_Total'].mean()
        total_gold = predictions_df['Predicted_Gold'].sum()
        total_medals = predictions_df['Predicted_Total'].sum()
        
        summary.append("1. 基本统计分析")
        summary.append(f"   - 预测国家数量: {total_countries}")
        summary.append(f"   - 平均预测金牌数: {avg_gold:.2f}")
        summary.append(f"   - 平均预测总奖牌数: {avg_total:.2f}")
        summary.append(f"   - 预测总金牌数: {total_gold:.2f}")
        summary.append(f"   - 预测总奖牌数: {total_medals:.2f}")
        
        # 2. Historical trend: the three most recent years.
        # unique() keeps order of appearance, so the years are sorted first;
        # otherwise "the last three" depends on how the table happens to be ordered.
        recent_years = np.sort(historical_data['Year'].dropna().unique())[-3:]
        
        summary.append("\n2. 历史趋势分析")
        for year in recent_years:
            year_data = historical_data[historical_data['Year'] == year]
            summary.append(f"   - {year}年:")
            summary.append(f"     * 总金牌数: {year_data['Gold'].sum()}")
            summary.append(f"     * 总奖牌数: {year_data['Total'].sum()}")
        
        # 3. Prediction confidence
        gold_uncertainty = predictions_df['Gold_Uncertainty'].mean()
        total_uncertainty = predictions_df['Total_Uncertainty'].mean()
        
        summary.append("\n3. 预测可信度评估")
        summary.append(f"   - 金牌预测平均不确定性: ±{gold_uncertainty:.2f}")
        summary.append(f"   - 总奖牌预测平均不确定性: ±{total_uncertainty:.2f}")
        
        # 4. Key findings (headings only)
        summary.append("\n4. 主要发现")
        summary.append("   - 预测趋势与历史数据对比")
        summary.append("   - 国家间竞争格局变化")
        summary.append("   - 新兴运动强国分析")
        
        # 5. Limitations (headings only)
        summary.append("\n5. 预测局限性")
        summary.append("   - 模型假设和约束")
        summary.append("   - 不确定性来源")
        summary.append("   - 潜在影响因素")
        
        return "\n".join(summary)
def main():
    """Load the data, run the 2028 prediction pipeline and write the summary report.

    Reads ``data/processed/features`` and ``data/processed/medal_counts``
    (Parquet or CSV), validates the medal table, reports missing values, runs
    ``OlympicMedalPredictor.predict_2028_olympics`` and finally writes
    ``models/prediction_summary_report.txt``.

    Raises:
        FileNotFoundError: If a data file cannot be loaded in any supported format.
        ValueError: If the medal table lacks a required column.
        Exception: Any other error is printed with its traceback and re-raised.
    """
    console = Console()
    
    try:
        # Output directory
        Path("models").mkdir(exist_ok=True)
        
        # Load the data
        console.print("[bold cyan]加载数据...[/bold cyan]")
        
        def load_data(file_path_base):
            """Load ``<file_path_base>.parquet`` or ``.csv``, trying several readers."""
            # pd.read_csv already defaults to UTF-8, so a separate explicit
            # UTF-8 attempt would only repeat the same read.
            attempts = [
                (f"{file_path_base}.parquet", lambda x: pd.read_parquet(x)),
                (f"{file_path_base}.csv", lambda x: pd.read_csv(x)),
                (f"{file_path_base}.csv", lambda x: pd.read_csv(x, encoding='latin1'))
            ]
            
            last_error = None
            for file_path, reader in attempts:
                try:
                    if Path(file_path).exists():
                        data = reader(file_path)
                        console.print(f"[green]成功从 {file_path} 加载数据[/green]")
                        return data
                except Exception as e:
                    # Remember the failure and fall through to the next reader.
                    last_error = e
                    continue
            
            raise FileNotFoundError(f"无法加载数据文件 {file_path_base}.*\n最后的错误: {str(last_error)}")
        
        features_df = load_data("data/processed/features")
        historical_data = load_data("data/processed/medal_counts")
        
        # Validate the medal table
        required_columns = ['Year', 'NOC', 'Gold', 'Total']
        for col in required_columns:
            if col not in historical_data.columns:
                raise ValueError(f"历史数据缺少必要的列: {col}")
        
        # Basic information; map(str, ...) keeps join() working for
        # non-string column labels.
        console.print("\n[bold green]数据加载完成[/bold green]")
        console.print(f"特征数据形状: {features_df.shape}")
        console.print(f"特征列: {', '.join(map(str, features_df.columns))}")
        console.print(f"历史数据形状: {historical_data.shape}")
        console.print(f"历史数据列: {', '.join(map(str, historical_data.columns))}")
        
        # Data quality: report missing values
        console.print("\n[bold cyan]检查数据质量...[/bold cyan]")
        
        missing_features = features_df.isnull().sum()
        if missing_features.any():
            console.print("[yellow]特征数据中存在缺失值:[/yellow]")
            console.print(missing_features[missing_features > 0])
        
        missing_historical = historical_data.isnull().sum()
        if missing_historical.any():
            console.print("[yellow]历史数据中存在缺失值:[/yellow]")
            console.print(missing_historical[missing_historical > 0])
        
        # Run the prediction pipeline
        predictor = OlympicMedalPredictor()
        predictor.predict_2028_olympics(features_df, historical_data)
        
        # Reload the predictions that the pipeline just saved
        predictions_df = pd.read_parquet("models/predictions_2028.parquet")
        
        summary_report = predictor.generate_summary_report(predictions_df, historical_data)
        
        # The report contains Chinese text; an explicit encoding avoids
        # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
        with open("models/prediction_summary_report.txt", "w", encoding="utf-8") as f:
            f.write(summary_report)
        
        console.print("\n[bold cyan]预测评估摘要:[/bold cyan]")
        console.print(summary_report)
        console.print("\n[bold green]预测完成！结果已保存到 models 目录[/bold green]")
        
    except Exception as e:
        console.print(f"[bold red]错误: {str(e)}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        # Bare raise keeps the original traceback intact.
        raise

# Run the pipeline only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()


# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from rich.console import Console
from rich.table import Table
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from typing import Dict, List, Tuple
import seaborn as sns
import matplotlib.pyplot as plt
from dataclasses import dataclass
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

@dataclass
class CountryInsight:
    """Record of one country's analysis: three numeric scores plus text notes.

    NOTE(review): the code that fills these fields is not visible here, so the
    per-field remarks below are inferred from the names only -- confirm them
    against the analyzer that creates the instances.
    """
    noc: str  # presumably the country's NOC identifier -- TODO confirm
    trend_score: float  # presumably a medal-trend measure; scale unknown
    stability_score: float  # presumably a consistency measure; scale unknown
    diversity_score: float  # presumably a spread-across-sports measure; scale unknown
    key_findings: List[str]  # free-text findings
    recommendations: List[str]  # free-text recommendations

class OlympicMedalInsightAnalyzer:
    def __init__(self):
        """Create the rich console used for status output and empty result stores."""
        self.console = Console()
        # Result containers start empty
        self.insights, self.trends, self.patterns = {}, {}, {}
        
    def load_and_prepare_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """加载和准备分析所需的数据"""
        try:
            def try_load_data(file_path_base: str) -> pd.DataFrame:
                """尝试多种方式加载数据"""
                base_path = Path(file_path_base)
                data_dir = base_path.parent
                data_dir.mkdir(parents=True, exist_ok=True)
                
                attempts = [
                    (base_path.with_suffix('.parquet'), lambda x: pd.read_parquet(x)),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x)),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='utf-8')),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='latin1')),
                    (base_path.parent / f"{base_path.name}.parquet", lambda x: pd.read_parquet(x)),
                    (base_path.parent / f"{base_path.name}.csv", lambda x: pd.read_csv(x)),
                    (base_path.parent / f"{base_path.name}.xlsx", lambda x: pd.read_excel(x))
                ]
                
                errors = []
                for file_path, reader in attempts:
                    try:
                        if file_path.exists():
                            data = reader(file_path)
                            self.console.print(f"[green]成功从 {file_path} 加载数据[/green]")
                            return data
                    except Exception as e:
                        errors.append(f"{file_path}: {str(e)}")
                        continue
                
                available_files = list(data_dir.glob("*")) if data_dir.exists() else []
                files_str = "\n".join(f"- {f.name}" for f in available_files) if available_files else "目录为空"
                
                error_msg = (
                    f"无法加载数据文件 {base_path}.*\n"
                    f"尝试的路径:\n{chr(10).join(f'- {err}' for err in errors)}\n"
                    f"目录 {data_dir} 中的文件:\n{files_str}"
                )
                raise FileNotFoundError(error_msg)

            # 加载数据
            medals_df = try_load_data("data/processed/medal_counts")
            athletes_df = try_load_data("data/processed/athletes")
            programs_df = try_load_data("data/processed/programs")
            
            # 数据预处理
            for df in [medals_df, athletes_df, programs_df]:
                if 'Year' in df.columns:
                    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
                    
            # 移除重复项
            medals_df = medals_df.drop_duplicates()
            athletes_df = athletes_df.drop_duplicates()
            programs_df = programs_df.drop_duplicates()
            
            # 验证必要的列
            required_cols = {
                'medals_df': ['Year', 'NOC', 'Gold', 'Total'],
                'athletes_df': ['Year', 'NOC', 'Sport'],
                'programs_df': ['Sport', 'Discipline']
            }
            
            for df_name, df in [
                ('medals_df', medals_df), 
                ('athletes_df', athletes_df), 
                ('programs_df', programs_df)
            ]:
                missing_cols = [col for col in required_cols[df_name] if col not in df.columns]
                if missing_cols:
                    raise ValueError(f"{df_name} 缺少必要的列: {', '.join(missing_cols)}")
            
            return medals_df, athletes_df, programs_df
            
        except Exception as e:
            self.console.print(f"[bold red]数据加载错误: {str(e)}[/bold red]")
            raise

    def analyze_medal_trends(self, medals_df: pd.DataFrame, athletes_df: pd.DataFrame) -> Dict:
        """分析奖牌趋势和模式"""
        # 预处理：过滤历史国家
        historical_nocs = {
            'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC',
            'Soviet Union', 'East Germany', 'West Germany', 'Unified Team'
        }
        medals_df = medals_df[~medals_df['NOC'].isin(historical_nocs)].copy()
        athletes_df = athletes_df[~athletes_df['NOC'].isin(historical_nocs)].copy()
        trends = {}
        
        # 1. 整体趋势分析
        overall_trends = {
            'total_countries': medals_df['NOC'].nunique(),
            'medals_concentration': self._calculate_medals_concentration(medals_df),
            'emerging_countries': self._identify_emerging_countries(medals_df),
            'declining_countries': self._identify_declining_countries(medals_df)
        }
        trends['overall'] = overall_trends
        
        # 2. 区域性分析
        region_mapping = self._create_region_mapping()
        medals_df['Region'] = medals_df['NOC'].map(region_mapping)
        regional_trends = self._analyze_regional_patterns(medals_df)
        trends['regional'] = regional_trends
        
        # 3. 项目多样性分析 - 使用 athletes_df 而不是 medals_df
        diversity_trends = self._analyze_sport_diversity(athletes_df)
        trends['diversity'] = diversity_trends
        
        return trends

    def _calculate_medals_concentration(self, data: pd.DataFrame) -> Dict:
        """计算奖牌集中度趋势"""
        concentration = {}
        
        # 按时期分析
        periods = [(1896, 1950), (1951, 2000), (2001, 2024)]
        
        for start, end in periods:
            period_data = data[(data['Year'] >= start) & (data['Year'] <= end)]
            total_medals = period_data.groupby('NOC')['Total'].sum()
            
            # 计算基尼系数
            gini = self._calculate_gini(total_medals.values)
            
            # 计算前10国家占比
            top_10_share = total_medals.nlargest(10).sum() / total_medals.sum()
            
            concentration[f"{start}-{end}"] = {
                'gini_coefficient': gini,
                'top_10_share': top_10_share,
                'total_countries': len(total_medals)
            }
            
        return concentration

    def _calculate_gini(self, array: np.ndarray) -> float:
        """计算基尼系数"""
        array = array.flatten()
        if len(array) == 0:
            return 0
        array = np.sort(array)
        index = np.arange(1, len(array) + 1)
        n = len(array)
        return ((2 * index - n - 1) * array).sum() / (n * array.sum())
    def _identify_emerging_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
        """改进的新兴国家识别算法，使用复合指标"""
        historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC'}
        data = data[~data['NOC'].isin(historical_nocs)].copy()
        
        def calculate_growth_score(early_medals: float, recent_medals: float) -> float:
            """使用对数增长率避免除零问题"""
            if early_medals == 0:
                early_medals = 0.5  # Laplace平滑
            return np.log((recent_medals + 1) / (early_medals + 1))
        
        def calculate_momentum_score(country_data: pd.DataFrame) -> float:
            """计算动量得分"""
            if len(country_data) < 3:
                return 0
            
            # 使用指数加权移动平均
            weights = np.exp(np.linspace(-1, 0, len(country_data)))
            weighted_avg = np.average(country_data['Total'], weights=weights)
            return weighted_avg
        
        emerging_metrics = []
        max_year = data['Year'].max()
        
        for noc in data['NOC'].unique():
            country_data = data[data['NOC'] == noc]
            
            # 分割时期
            recent_data = country_data[country_data['Year'] >= (max_year - recent_years)]
            early_data = country_data[country_data['Year'] < (max_year - recent_years)]
            
            if len(recent_data) < 2 or len(early_data) < 2:
                continue
                
            # 计算复合指标
            growth_score = calculate_growth_score(
                early_data['Total'].mean(),
                recent_data['Total'].mean()
            )
            
            momentum_score = calculate_momentum_score(recent_data)
            
            # 计算稳定性
            stability = 1 / (1 + recent_data['Total'].std())
            
            # 综合得分
            final_score = (
                growth_score * 0.4 +
                momentum_score * 0.4 +
                stability * 0.2
            )
            
            if final_score > 0:
                emerging_metrics.append({
                    'NOC': noc,
                    'growth_rate': float(growth_score),
                    'momentum': float(momentum_score),
                    'stability': float(stability),
                    'final_score': float(final_score),
                    'recent_medals': float(recent_data['Total'].mean()),
                    'early_medals': float(early_data['Total'].mean())
                })
        
        # 按综合得分排序
        return sorted(emerging_metrics, key=lambda x: x['final_score'], reverse=True)[:10]
    # def _identify_emerging_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
    #     """改进的新兴国家识别算法"""
    #     # 数据预处理
    #     historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC'}
    #     data = data[~data['NOC'].isin(historical_nocs)].copy()
        
    #     # 计算时期表现
    #     max_year = data['Year'].max()
    #     recent_data = data[data['Year'] >= (max_year - recent_years)]
    #     early_data = data[data['Year'] < (max_year - recent_years)]
        
    #     # 计算平均值
    #     recent_avg = recent_data.groupby('NOC')['Total'].mean()
    #     early_avg = early_data.groupby('NOC')['Total'].mean()
        
    #     # 计算增长
    #     growth_df = pd.DataFrame({
    #         'recent': recent_avg,
    #         'early': early_avg
    #     }).fillna(0)
        
    #     # 改进的增长率计算
    #     growth_df['growth_rate'] = (
    #         (growth_df['recent'] - growth_df['early']) / 
    #         (growth_df['early'].replace(0, 0.1))  # 避免除以0
    #     ) * 100
        
    #     # 限制极端值
    #     growth_df['growth_rate'] = growth_df['growth_rate'].clip(-1000, 1000)
        
    #     # 筛选条件
    #     emerging = growth_df[
    #         (growth_df['growth_rate'] > 20) &  # 显著增长
    #         (growth_df['recent'] >= 3) &       # 当前有一定规模
    #         (growth_df['recent'] > growth_df['early'])  # 确实在增长
    #     ].sort_values('growth_rate', ascending=False)
        
    #     return [
    #         {
    #             'NOC': noc,
    #             'growth_rate': float(emerging.loc[noc, 'growth_rate']),
    #             'recent_medals': float(emerging.loc[noc, 'recent']),
    #             'early_medals': float(growth_df.loc[noc, 'early'])
    #         }
    #         for noc in emerging.index[:10]
    #     ]

    def _identify_declining_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
        """改进的衰退国家识别"""
        # 过滤历史国家
        historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC', 
                        'Soviet Union', 'East Germany', 'West Germany', 'Unified Team'}
        
        data = data[~data['NOC'].isin(historical_nocs)].copy()
        
        # 计算各个时期的表现
        recent_data = data[data['Year'] >= data['Year'].max() - recent_years]
        early_data = data[data['Year'] < data['Year'].max() - recent_years]
        
        # 使用更复杂的衰退指标
        decline_metrics = []
        
        for noc in data['NOC'].unique():
            noc_recent = recent_data[recent_data['NOC'] == noc]
            noc_early = early_data[early_data['NOC'] == noc]
            
            if len(noc_recent) < 2 or len(noc_early) < 2:
                continue
            
            # 计算多个维度的衰退指标
            medal_decline = (noc_early['Total'].mean() - noc_recent['Total'].mean()) / (noc_early['Total'].mean() + 1)
            consistency_decline = noc_recent['Total'].std() / (noc_recent['Total'].mean() + 1) - \
                                noc_early['Total'].std() / (noc_early['Total'].mean() + 1)
            peak_decline = noc_early['Total'].max() - noc_recent['Total'].max()
            
            # 仅考虑最近仍有参与的国家
            if noc_recent['Year'].max() < data['Year'].max() - 4:
                continue
                
            # 综合衰退分数
            decline_score = (medal_decline * 0.5 + 
                            consistency_decline * 0.3 + 
                            (peak_decline / (noc_early['Total'].max() + 1)) * 0.2)
            
            if decline_score > 0.2:  # 显著衰退阈值
                decline_metrics.append({
                    'NOC': noc,
                    'decline_rate': float(decline_score),
                    'recent_medals': float(noc_recent['Total'].mean()),
                    'early_medals': float(noc_early['Total'].mean()),
                    'peak_medals': float(noc_early['Total'].max()),
                    'last_appearance': int(noc_recent['Year'].max())
                })
        
        return sorted(decline_metrics, key=lambda x: x['decline_rate'], reverse=True)[:10]

    def _create_region_mapping(self) -> Dict[str, str]:
        """改进的区域映射，覆盖更多国家"""
        region_mapping = {
            # 北美洲
            'USA': 'North America', 'CAN': 'North America', 'MEX': 'North America',
            'PUR': 'North America', 'CUB': 'North America', 'JAM': 'North America',
            # 南美洲
            'BRA': 'South America', 'ARG': 'South America', 'CHI': 'South America',
            'COL': 'South America', 'VEN': 'South America', 'PER': 'South America',
            'ECU': 'South America', 'URU': 'South America', 'BOL': 'South America',
            # 欧洲
            'GBR': 'Europe', 'FRA': 'Europe', 'GER': 'Europe', 'ITA': 'Europe',
            'ESP': 'Europe', 'NED': 'Europe', 'SWE': 'Europe', 'NOR': 'Europe',
            'DEN': 'Europe', 'FIN': 'Europe', 'BEL': 'Europe', 'AUT': 'Europe',
            'SUI': 'Europe', 'POL': 'Europe', 'HUN': 'Europe', 'CZE': 'Europe',
            'GRE': 'Europe', 'UKR': 'Europe', 'RUS': 'Europe', 'TUR': 'Europe',
            'POR': 'Europe', 'ROU': 'Europe', 'SRB': 'Europe', 'CRO': 'Europe',
            # 亚洲
            'CHN': 'Asia', 'JPN': 'Asia', 'KOR': 'Asia', 'PRK': 'Asia',
            'IND': 'Asia', 'IRN': 'Asia', 'KAZ': 'Asia', 'THA': 'Asia',
            'IDN': 'Asia', 'MAS': 'Asia', 'SGP': 'Asia', 'VNM': 'Asia',
            'PHI': 'Asia', 'PAK': 'Asia', 'BAN': 'Asia', 'IRQ': 'Asia',
            # 大洋洲
            'AUS': 'Oceania', 'NZL': 'Oceania', 'FIJ': 'Oceania', 'PNG': 'Oceania',
            # 非洲
            'RSA': 'Africa', 'KEN': 'Africa', 'ETH': 'Africa', 'NGR': 'Africa',
            'EGY': 'Africa', 'MAR': 'Africa', 'ALG': 'Africa', 'TUN': 'Africa',
            'ZIM': 'Africa', 'UGA': 'Africa', 'GHA': 'Africa', 'CIV': 'Africa',
            # 历史国家映射
            'URS': 'Europe', 'GDR': 'Europe', 'FRG': 'Europe', 'TCH': 'Europe',
            'YUG': 'Europe', 'EUN': 'Europe', 'ROC': 'Europe', 'SGP': 'Asia',
        }
        return region_mapping

    def _analyze_regional_patterns(self, data: pd.DataFrame) -> Dict:
        """改进的区域分析，确保输出完整的区域数据"""
        region_patterns = {}
        regions = ['Europe', 'Asia', 'North America', 'South America', 'Africa', 'Oceania']
        
        # 确保区域映射
        data['Region'] = data['NOC'].map(self._create_region_mapping())
        data = data[data['Region'].isin(regions)]  # 只分析主要区域
        
        for region in regions:
            region_data = data[data['Region'] == region]
            if len(region_data) < 2:
                continue
                
            # 计算区域指标
            total_medals = region_data.groupby('Year')['Total'].sum()
            
            if len(total_medals) < 2:
                continue
                
            try:
                # 趋势计算
                years = np.array(total_medals.index).reshape(-1, 1)
                medals = total_medals.values
                trend = stats.theilslopes(medals, years)[0]
                
                # 近期表现
                recent_years = data['Year'].max() - 8
                recent_total = data[data['Year'] >= recent_years]['Total'].sum()
                region_recent = region_data[region_data['Year'] >= recent_years]['Total'].sum()
                recent_share = region_recent / recent_total if recent_total > 0 else 0
                
                # 主导国家
                top_countries = region_data.groupby('NOC')['Total'].sum().nlargest(3)
                
                region_patterns[region] = {
                    'trend': float(trend),
                    'volatility': float(total_medals.std() / (total_medals.mean() + 1e-6)),
                    'medal_median': float(total_medals.median()),
                    'recent_share': float(recent_share),
                    'top_countries': top_countries.index.tolist(),
                    'total_countries': len(region_data['NOC'].unique())
                }
            except Exception:
                continue
        
        return region_patterns
    def _calculate_robust_trend(self, series: pd.Series) -> float:
        """使用Theil-Sen回归计算稳健趋势"""
        if len(series) < 2:
            return 0.0
        years = series.index.values.reshape(-1, 1)
        medals = series.values
        slope, _, _, _ = stats.theilslopes(medals, years)
        return float(slope)
    def _calculate_region_trend(self, region_data: pd.DataFrame) -> float:
        """Return the slope of ``Total`` regressed on ``Year`` using a Huber regressor."""
        from sklearn.linear_model import HuberRegressor
        year_matrix = region_data[['Year']].values
        medal_totals = region_data['Total'].values
        fitted = HuberRegressor().fit(year_matrix, medal_totals)
        slope = fitted.coef_[0]
        return float(slope)
    def _analyze_sport_diversity(self, data: pd.DataFrame) -> Dict:
        """改进的运动项目多样性分析，增加错误检查"""
        try:
            diversity = {}
            
            # 数据验证
            if 'Sport' not in data.columns:
                raise ValueError("Sport列不存在于数据中")
                
            if 'NOC' not in data.columns:
                raise ValueError("NOC列不存在于数据中")
                
            # 打印整体统计信息
            print("\n运动项目多样性分析诊断:")
            print(f"总记录数: {len(data)}")
            print(f"唯一国家数: {data['NOC'].nunique()}")
            print(f"唯一运动项目数: {data['Sport'].nunique()}")
            
            # 计算每个国家在不同项目上的分布
            country_sports = data.groupby('NOC')['Sport'].nunique()
            print(f"\n国家运动项目分布:")
            print(f"最大值: {country_sports.max()}")
            print(f"最小值: {country_sports.min()}")
            print(f"平均值: {country_sports.mean():.2f}")
            print(f"中位数: {country_sports.median()}")
            
            # 计算多样性指标
            diversity['overall'] = {
                'avg_sports': float(country_sports.mean()),
                'max_sports': int(country_sports.max()),
                'min_sports': int(country_sports.min()),
                'median_sports': float(country_sports.median()),
                'total_sports': len(data['Sport'].unique())
            }
            
            # 识别专注型和多样化型国家
            q25, q75 = country_sports.quantile([0.25, 0.75])
            specialized = country_sports[country_sports < q25]
            diversified = country_sports[country_sports > q75]
            
            print(f"\n专注型国家(低于25分位): {len(specialized)}")
            print(f"多样化国家(高于75分位): {len(diversified)}")
            
            diversity['specialized'] = specialized.to_dict()
            diversity['diversified'] = diversified.to_dict()
            
            # 时间趋势分析
            if 'Year' in data.columns:
                recent_years = data['Year'].max() - 8
                recent_data = data[data['Year'] >= recent_years]
                recent_sports = recent_data.groupby('NOC')['Sport'].nunique()
                
                diversity['recent_trends'] = {
                    'avg_sports_recent': float(recent_sports.mean()),
                    'max_sports_recent': int(recent_sports.max()),
                    'countries_increased': len(recent_sports[recent_sports > country_sports])
                }
            
            return diversity
            
        except Exception as e:
            print(f"错误: 分析运动项目多样性时发生异常: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return {
                'overall': {'avg_sports': 0, 'max_sports': 0, 'min_sports': 0},
                'specialized': {},
                'diversified': {}
            }

    def generate_country_insights(self, medals_df: pd.DataFrame, athletes_df: pd.DataFrame) -> Dict[str, CountryInsight]:
        """Build a ``CountryInsight`` for the 30 countries with the most medals overall.

        Works on copies of both inputs. NOC values in the medal table are
        stripped of surrounding whitespace and full English country names are
        translated to their three-letter codes before scoring.
        """
        medals = medals_df.copy()
        athletes = athletes_df.copy()

        # Remove stray whitespace around the NOC values
        medals['NOC'] = medals['NOC'].str.strip()

        # Country name -> three-letter code
        name_to_code = {
            'UNITED STATES': 'USA', 'GREAT BRITAIN': 'GBR', 'SOVIET UNION': 'URS',
            'FRANCE': 'FRA', 'CHINA': 'CHN', 'GERMANY': 'GER', 'ITALY': 'ITA',
            'AUSTRALIA': 'AUS', 'JAPAN': 'JPN', 'HUNGARY': 'HUN', 'SWEDEN': 'SWE',
            'RUSSIA': 'RUS', 'EAST GERMANY': 'GDR', 'NETHERLANDS': 'NED',
            'CANADA': 'CAN', 'SOUTH KOREA': 'KOR', 'ROMANIA': 'ROU', 'POLAND': 'POL',
            'FINLAND': 'FIN', 'CUBA': 'CUB', 'BULGARIA': 'BUL', 'SWITZERLAND': 'SUI',
            'WEST GERMANY': 'FRG', 'DENMARK': 'DEN', 'SPAIN': 'ESP', 'NORWAY': 'NOR',
            'BRAZIL': 'BRA', 'BELGIUM': 'BEL', 'NEW ZEALAND': 'NZL'
        }

        def to_code(raw_noc):
            # Unknown names pass through unchanged
            return name_to_code.get(raw_noc.upper(), raw_noc)

        medals['NOC'] = medals['NOC'].apply(to_code)

        medal_totals = medals.groupby('NOC')['Total'].sum()
        leading_nocs = medal_totals.nlargest(30).index

        insights = {}
        for noc in leading_nocs:
            trend = self._calculate_trend_score(medals, noc)
            stability = self._calculate_stability_score(medals, noc)
            diversity = self._calculate_diversity_score(athletes, noc)

            findings = self._generate_key_findings(
                medals, athletes, noc, trend, stability, diversity
            )
            advice = self._generate_recommendations(findings, trend, stability, diversity)

            insights[noc] = CountryInsight(
                noc=noc,
                trend_score=trend,
                stability_score=stability,
                diversity_score=diversity,
                key_findings=findings,
                recommendations=advice
            )

        return insights
    def _calculate_trend_score(self, data: pd.DataFrame, noc: str) -> float:
        """改进的趋势分析，使用Prophet模型捕捉非线性趋势"""
        try:
            from fbprophet import Prophet
        except ImportError:
            from prophet import Prophet
        
        country_data = data[data['NOC'] == noc].copy()
        if len(country_data) < 4:  # 需要足够的数据点
            return 0.0
        
        try:
            # 准备Prophet数据
            df = pd.DataFrame({
                'ds': pd.to_datetime(country_data['Year'].astype(str)),
                'y': country_data['Total']
            })
            
            # 拟合Prophet模型
            model = Prophet(
                yearly_seasonality=False,
                weekly_seasonality=False,
                daily_seasonality=False,
                interval_width=0.95
            )
            model.fit(df)
            
            # 预测未来趋势
            future = model.make_future_dataframe(periods=1, freq='Y')
            forecast = model.predict(future)
            
            # 计算趋势分数
            trend = forecast['trend'].diff().mean()
            
            # 标准化趋势分数
            max_trend = data.groupby('NOC')['Total'].mean().max()
            normalized_trend = (trend + max_trend) / (2 * max_trend)
            
            # 考虑预测不确定性
            uncertainty = (forecast['yhat_upper'] - forecast['yhat_lower']).mean()
            uncertainty_penalty = 1 / (1 + uncertainty)
            
            final_score = normalized_trend * uncertainty_penalty
            
            return max(0.0, min(1.0, final_score))
            
        except Exception as e:
            print(f"Warning: Prophet analysis failed for {noc}: {str(e)}")
            # 降级为简单线性回归
            return super()._calculate_trend_score(data, noc)
    # def _calculate_trend_score(self, data: pd.DataFrame, noc: str) -> float:
    #     """计算国家的发展趋势得分
    #     改进:
    #     1. 考虑长期和短期趋势
    #     2. 使用robust回归
    #     3. 确保得分在0-1范围内
    #     """
    #     country_data = data[data['NOC'] == noc].copy()
    #     if len(country_data) < 2:
    #         return 0.0
        
    #     # 准备数据
    #     country_data = country_data.sort_values('Year')
    #     years = (country_data['Year'] - country_data['Year'].min()).values
    #     medals = country_data['Total'].values
        
    #     try:
    #         # 使用RobustRegression避免异常值影响
    #         regression = stats.theilslopes(medals, years)
    #         trend = regression[0]  # 斜率
            
    #         # 计算近期趋势（最近3届）
    #         recent_data = country_data.tail(3)
    #         if len(recent_data) >= 2:
    #             recent_years = (recent_data['Year'] - recent_data['Year'].min()).values
    #             recent_medals = recent_data['Total'].values
    #             recent_trend = stats.theilslopes(recent_medals, recent_years)[0]
    #         else:
    #             recent_trend = trend
            
    #         # 标准化趋势分数
    #         all_trends = data.groupby('NOC').apply(
    #             lambda x: stats.theilslopes(x['Total'].values, 
    #                                     (x['Year'] - x['Year'].min()).values)[0]
    #             if len(x) >= 2 else 0
    #         )
            
    #         # 使用百分位数进行标准化，确保分数在0-1之间
    #         trend_percentile = stats.percentileofscore(all_trends, trend) / 100
    #         recent_percentile = stats.percentileofscore(all_trends, recent_trend) / 100
            
    #         # 综合长期和近期趋势，近期趋势权重更大
    #         final_score = trend_percentile * 0.4 + recent_percentile * 0.6
            
    #         return max(0.0, min(1.0, final_score))  # 确保在0-1范围内
            
    #     except Exception:
    #         return 0.0

    def _calculate_stability_score(self, data: pd.DataFrame, noc: str) -> float:
        """计算国家表现的稳定性得分"""
        country_data = data[data['NOC'] == noc]
        if len(country_data) < 2:
            return 0.0
            
        # 计算变异系数（标准差/平均值）
        cv = country_data['Total'].std() / (country_data['Total'].mean() + 1e-6)
        
        # 转换为稳定性得分（越稳定越接近1）
        return 1 / (1 + cv)

    def _calculate_diversity_score(self, data: pd.DataFrame, noc: str) -> float:
        """改进的多样性得分计算"""
        try:
            # 规范化NOC处理
            standardized_noc = noc.strip().upper()
            country_data = data[data['NOC'] == standardized_noc].copy()
            
            if len(country_data) == 0:
                print(f"警告: {noc} (标准化后: {standardized_noc}) 没有运动项目数据")
                # 尝试模糊匹配
                similar_nocs = data['NOC'].unique()
                print(f"数据中存在的相似NOC: {[n for n in similar_nocs if n.startswith(standardized_noc[:3])]}")
                return 0.0
                
            # 基本验证
            if 'Sport' not in country_data.columns:
                print(f"错误: Sport列不存在")
                return 0.0
                
            # 数据统计
            total_sports = len(data['Sport'].unique())
            country_sports = len(country_data['Sport'].unique())
            
            # 1. 规模得分 (0-0.4)
            scale_score = 0.4 * (country_sports / total_sports) if total_sports > 0 else 0
            
            # 2. 均衡度得分 (0-0.3)
            sport_counts = country_data['Sport'].value_counts()
            if len(sport_counts) > 1:
                probs = sport_counts / sport_counts.sum()
                entropy = -np.sum(probs * np.log2(probs + 1e-10))
                max_entropy = np.log2(len(sport_counts))
                balance_score = 0.3 * (entropy / max_entropy)
            else:
                balance_score = 0.0
            
            # 3. 参与度得分 (0-0.3)
            recent_years = data['Year'].max() - 8
            recent_data = country_data[country_data['Year'] >= recent_years]
            participation_rate = len(recent_data['Sport'].unique()) / max(country_sports, 1)
            participation_score = 0.3 * participation_rate
            
            final_score = scale_score + balance_score + participation_score
            
            return round(min(1.0, max(0.0, final_score)), 2)
            
        except Exception as e:
            print(f"错误: 计算{noc}的多样性得分时发生异常: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return 0.0

    def _generate_key_findings(self, 
                             medals_df: pd.DataFrame, 
                             athletes_df: pd.DataFrame, 
                             noc: str,
                             trend_score: float,
                             stability_score: float,
                             diversity_score: float) -> List[str]:
        """生成关键发现"""
        findings = []
        
        # 分析趋势
        if trend_score > 0.7:
            findings.append("显示出强劲的上升势头")
        elif trend_score < 0.3:
            findings.append("表现呈现下降趋势")
        
        # 分析稳定性
        if stability_score > 0.7:
            findings.append("表现非常稳定")
        elif stability_score < 0.3:
            findings.append("表现波动较大")
        
        # 分析多样性
        if diversity_score > 0.7:
            findings.append("具有良好的项目多样性")
        elif diversity_score < 0.3:
            findings.append("项目集中度较高")
        
        return findings

    def _generate_recommendations(self, findings: List[str],
                                trend_score: float,
                                stability_score: float,
                                diversity_score: float) -> List[str]:
        """生成建议，保持原有接口"""
        recommendations = []
        
        # 基于趋势分数生成建议
        if trend_score < 0.5:
            recommendations.append("建议增加青少年训练营投入，重点发展潜力项目")
        elif trend_score > 0.8:
            recommendations.append("可考虑在优势项目中建立长期统治地位")
        
        # 基于稳定性分数生成建议
        if stability_score < 0.5:
            recommendations.append("需建立运动员伤病管理系统和后备人才库")
        elif stability_score < 0.7:
            recommendations.append("加强运动员梯队建设，提高稳定性")
        
        # 基于多样性分数生成建议
        if diversity_score < 0.3:
            recommendations.append("应优先拓展与现有优势项目相关的新分项")
        elif 0.3 <= diversity_score < 0.6:
            recommendations.append("可尝试在新增奥运项目中寻找突破机会")
        elif diversity_score >= 0.6:
            recommendations.append("保持项目多样性优势，巩固竞争实力")
        
        return recommendations

    def generate_report(self, trends: Dict, insights: Dict[str, CountryInsight]) -> str:
        """Render the trend results and country insights as a plain-text report.

        The report has four sections: overall trends, emerging/declining
        countries (five each at most), regional analysis and per-country
        details for the first ten entries of ``insights``.
        """
        divider = "-" * 50
        overall = trends.get('overall', {})

        # Section 1: overall trends and medal concentration per period
        lines = [
            "1. 奥运会奖牌总体趋势分析",
            divider,
            f"\n参与国家数量: {overall.get('total_countries', 'N/A')}",
        ]
        for period, period_stats in overall.get('medals_concentration', {}).items():
            lines.append(f"\n{period}时期:")
            lines.append(f"  - 基尼系数: {period_stats['gini_coefficient']:.3f}")
            lines.append(f"  - 前10国家占比: {period_stats['top_10_share']*100:.1f}%")

        # Section 2: emerging and declining countries
        lines.extend(["\n\n2. 新兴与衰退国家分析", divider])

        lines.append("\n新兴奥运强国:")
        for entry in overall.get('emerging_countries', [])[:5]:
            lines.append(
                f"  - {entry['NOC']}: 增长率 {entry['growth_rate']*100:.1f}%, "
                f"近期平均 {entry['recent_medals']:.1f} 枚奖牌"
            )

        lines.append("\n实力下降国家:")
        for entry in overall.get('declining_countries', [])[:5]:
            lines.append(
                f"  - {entry['NOC']}: 下降率 {entry['decline_rate']*100:.1f}%, "
                f"近期平均 {entry['recent_medals']:.1f} 枚奖牌"
            )

        # Section 3: regional analysis
        lines.extend(["\n\n3. 区域性分析", divider])
        for region, region_stats in trends.get('regional', {}).items():
            lines.append(f"\n{region}:")
            lines.append(f"  - 趋势系数: {region_stats['trend']:.2f}")
            lines.append(f"  - 波动性: {region_stats['volatility']:.2f}")
            lines.append(f"  - 奖牌中位数: {region_stats['medal_median']:.1f}")
            lines.append(f"  - 近期份额: {region_stats['recent_share']*100:.1f}%")

        # Section 4: per-country details, first ten countries only
        lines.extend(["\n\n4. 国家深度分析", divider])
        for noc, insight in list(insights.items())[:10]:
            lines.append(f"\n{noc}分析:")
            lines.append(f"  趋势得分: {insight.trend_score:.2f}")
            lines.append(f"  稳定性得分: {insight.stability_score:.2f}")
            lines.append(f"  多样性得分: {insight.diversity_score:.2f}")
            lines.append("  主要发现:")
            lines.extend(f"    - {finding}" for finding in insight.key_findings)
            lines.append("  建议:")
            lines.extend(f"    - {recommendation}" for recommendation in insight.recommendations)

        return "\n".join(lines)

def main():
    """Run the insight pipeline end to end and persist the report.

    Steps: load the data tables, analyse medal trends, build per-country
    insights, render the text report, write it to
    ``analysis_results/olympic_medal_insights_report.txt`` (UTF-8) and echo
    it to the console.

    Raises:
        Exception: any error raised by a pipeline step is printed together
            with its traceback and then re-raised unchanged.
    """
    import traceback

    console = Console()

    try:
        analyzer = OlympicMedalInsightAnalyzer()

        console.print("[bold cyan]加载数据...[/bold cyan]")
        # programs_df is loaded and validated but not consumed by the steps below.
        medals_df, athletes_df, programs_df = analyzer.load_and_prepare_data()

        console.print("[bold cyan]分析奖牌趋势...[/bold cyan]")
        trends = analyzer.analyze_medal_trends(medals_df, athletes_df)

        console.print("[bold cyan]生成国家洞察...[/bold cyan]")
        insights = analyzer.generate_country_insights(medals_df, athletes_df)

        report = analyzer.generate_report(trends, insights)

        output_dir = Path("analysis_results")
        output_dir.mkdir(exist_ok=True)

        with open(output_dir / "olympic_medal_insights_report.txt", "w", encoding='utf-8') as f:
            f.write(report)

        console.print("\n[bold green]分析报告:[/bold green]")
        # The report is data-derived text: print it verbatim so that bracketed
        # content is not interpreted as console markup.
        console.print(report, markup=False)

    except Exception as e:
        # Exception messages and tracebacks routinely contain square brackets
        # (e.g. ``df['NOC']``); disable markup parsing so nothing is swallowed
        # or rejected, and apply the colour through ``style`` instead.
        console.print(f"错误: {str(e)}", style="bold red", markup=False)
        console.print(traceback.format_exc(), markup=False)
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from rich.console import Console
from rich.table import Table
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from typing import Dict, List, Tuple
import seaborn as sns
import matplotlib.pyplot as plt
from dataclasses import dataclass
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

@dataclass
class CountryInsight:
    """Summary of one country's Olympic performance profile.

    Instances are built by ``OlympicMedalInsightAnalyzer.generate_country_insights``
    and read by ``generate_report``.
    """
    # Country identifier used as the key of the insights dictionary.
    noc: str
    # Direction of the medal trend; the analyzer clamps it to [0, 1].
    trend_score: float
    # 1 / (1 + coefficient of variation) of the medal totals; higher is steadier.
    stability_score: float
    # Breadth of sport participation; the analyzer clamps it to [0, 1].
    diversity_score: float
    # Human-readable observations derived from the three scores.
    key_findings: List[str]
    # Human-readable advice derived from the three scores.
    recommendations: List[str]

class OlympicMedalInsightAnalyzer:
    def __init__(self):
        """Create the console used for status output and empty result stores."""
        self.console = Console()
        # NOTE(review): the three dictionaries below are initialised empty and
        # are not written by any method visible in this part of the file;
        # presumably reserved as result caches - confirm before relying on them.
        self.insights = {}
        self.trends = {}
        self.patterns = {}
        
    def load_and_prepare_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Load the medal, athlete and programme tables and apply basic cleaning.

        Each table is looked up under ``data/processed/<name>`` in several
        formats (parquet, csv with different encodings, xlsx); the first file
        that exists and can be parsed wins.  After loading, ``Year`` columns
        are coerced to numeric (unparseable values become NaN), exact
        duplicate rows are dropped and the required columns are checked.

        Returns:
            ``(medals_df, athletes_df, programs_df)``.

        Raises:
            FileNotFoundError: no candidate file for a table could be loaded.
                The message lists every candidate path with the reason it was
                rejected (missing file or reader error).
            ValueError: a table lacks one of its required columns.
        """
        try:
            def try_load_data(file_path_base: str) -> pd.DataFrame:
                """Return the first loadable candidate file for *file_path_base*."""
                base_path = Path(file_path_base)
                data_dir = base_path.parent
                # The data directory is created when absent so that the
                # directory listing in the error message below always works.
                data_dir.mkdir(parents=True, exist_ok=True)

                attempts = [
                    (base_path.with_suffix('.parquet'), lambda x: pd.read_parquet(x)),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x)),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='utf-8')),
                    (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='latin1')),
                    (base_path.parent / f"{base_path.name}.parquet", lambda x: pd.read_parquet(x)),
                    (base_path.parent / f"{base_path.name}.csv", lambda x: pd.read_csv(x)),
                    (base_path.parent / f"{base_path.name}.xlsx", lambda x: pd.read_excel(x))
                ]

                errors = []
                reported_missing = set()
                for file_path, reader in attempts:
                    if not file_path.exists():
                        # Record missing candidates too (once per path);
                        # otherwise the "tried paths" section of the error is
                        # empty in the most common failure: no file at all.
                        if file_path not in reported_missing:
                            reported_missing.add(file_path)
                            errors.append(f"{file_path}: 文件不存在")
                        continue
                    try:
                        data = reader(file_path)
                    except Exception as e:
                        # Best effort: remember why this reader failed and
                        # fall through to the next candidate.
                        errors.append(f"{file_path}: {str(e)}")
                        continue
                    self.console.print(f"[green]成功从 {file_path} 加载数据[/green]")
                    return data

                available_files = list(data_dir.glob("*")) if data_dir.exists() else []
                files_str = "\n".join(f"- {f.name}" for f in available_files) if available_files else "目录为空"
                tried_str = "\n".join(f"- {err}" for err in errors)

                error_msg = (
                    f"无法加载数据文件 {base_path}.*\n"
                    f"尝试的路径:\n{tried_str}\n"
                    f"目录 {data_dir} 中的文件:\n{files_str}"
                )
                raise FileNotFoundError(error_msg)

            medals_df = try_load_data("data/processed/medal_counts")
            athletes_df = try_load_data("data/processed/athletes")
            programs_df = try_load_data("data/processed/programs")

            # Coerce Year to a numeric dtype; unparseable values become NaN.
            for df in [medals_df, athletes_df, programs_df]:
                if 'Year' in df.columns:
                    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

            medals_df = medals_df.drop_duplicates()
            athletes_df = athletes_df.drop_duplicates()
            programs_df = programs_df.drop_duplicates()

            # Fail early with a clear message if a table lacks a column the
            # analysis depends on.
            required_cols = {
                'medals_df': ['Year', 'NOC', 'Gold', 'Total'],
                'athletes_df': ['Year', 'NOC', 'Sport'],
                'programs_df': ['Sport', 'Discipline']
            }

            for df_name, df in [
                ('medals_df', medals_df),
                ('athletes_df', athletes_df),
                ('programs_df', programs_df)
            ]:
                missing_cols = [col for col in required_cols[df_name] if col not in df.columns]
                if missing_cols:
                    raise ValueError(f"{df_name} 缺少必要的列: {', '.join(missing_cols)}")

            return medals_df, athletes_df, programs_df

        except Exception as e:
            # Report on the console, then let the caller decide what to do.
            self.console.print(f"[bold red]数据加载错误: {str(e)}[/bold red]")
            raise

    def analyze_medal_trends(self, medals_df: pd.DataFrame, athletes_df: pd.DataFrame) -> Dict:
        """分析奖牌趋势和模式"""
        # 预处理：过滤历史国家
        historical_nocs = {
            'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC',
            'Soviet Union', 'East Germany', 'West Germany', 'Unified Team'
        }
        medals_df = medals_df[~medals_df['NOC'].isin(historical_nocs)].copy()
        athletes_df = athletes_df[~athletes_df['NOC'].isin(historical_nocs)].copy()
        trends = {}
        
        # 1. 整体趋势分析
        overall_trends = {
            'total_countries': medals_df['NOC'].nunique(),
            'medals_concentration': self._calculate_medals_concentration(medals_df),
            'emerging_countries': self._identify_emerging_countries(medals_df),
            'declining_countries': self._identify_declining_countries(medals_df)
        }
        trends['overall'] = overall_trends
        
        # 2. 区域性分析
        region_mapping = self._create_region_mapping()
        medals_df['Region'] = medals_df['NOC'].map(region_mapping)
        regional_trends = self._analyze_regional_patterns(medals_df)
        trends['regional'] = regional_trends
        
        # 3. 项目多样性分析 - 使用 athletes_df 而不是 medals_df
        diversity_trends = self._analyze_sport_diversity(athletes_df)
        trends['diversity'] = diversity_trends
        
        return trends

    def _calculate_medals_concentration(self, data: pd.DataFrame) -> Dict:
        """计算奖牌集中度趋势"""
        concentration = {}
        
        # 按时期分析
        periods = [(1896, 1950), (1951, 2000), (2001, 2024)]
        
        for start, end in periods:
            period_data = data[(data['Year'] >= start) & (data['Year'] <= end)]
            total_medals = period_data.groupby('NOC')['Total'].sum()
            
            # 计算基尼系数
            gini = self._calculate_gini(total_medals.values)
            
            # 计算前10国家占比
            top_10_share = total_medals.nlargest(10).sum() / total_medals.sum()
            
            concentration[f"{start}-{end}"] = {
                'gini_coefficient': gini,
                'top_10_share': top_10_share,
                'total_countries': len(total_medals)
            }
            
        return concentration

    def _calculate_gini(self, array: np.ndarray) -> float:
        """计算基尼系数"""
        array = array.flatten()
        if len(array) == 0:
            return 0
        array = np.sort(array)
        index = np.arange(1, len(array) + 1)
        n = len(array)
        return ((2 * index - n - 1) * array).sum() / (n * array.sum())
    def _identify_emerging_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
        """改进的新兴国家识别算法，使用复合指标"""
        historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC'}
        data = data[~data['NOC'].isin(historical_nocs)].copy()
        
        def calculate_growth_score(early_medals: float, recent_medals: float) -> float:
            """使用对数增长率避免除零问题"""
            if early_medals == 0:
                early_medals = 0.5  # Laplace平滑
            return np.log((recent_medals + 1) / (early_medals + 1))
        
        def calculate_momentum_score(country_data: pd.DataFrame) -> float:
            """计算动量得分"""
            if len(country_data) < 3:
                return 0
            
            # 使用指数加权移动平均
            weights = np.exp(np.linspace(-1, 0, len(country_data)))
            weighted_avg = np.average(country_data['Total'], weights=weights)
            return weighted_avg
        
        emerging_metrics = []
        max_year = data['Year'].max()
        
        for noc in data['NOC'].unique():
            country_data = data[data['NOC'] == noc]
            
            # 分割时期
            recent_data = country_data[country_data['Year'] >= (max_year - recent_years)]
            early_data = country_data[country_data['Year'] < (max_year - recent_years)]
            
            if len(recent_data) < 2 or len(early_data) < 2:
                continue
                
            # 计算复合指标
            growth_score = calculate_growth_score(
                early_data['Total'].mean(),
                recent_data['Total'].mean()
            )
            
            momentum_score = calculate_momentum_score(recent_data)
            
            # 计算稳定性
            stability = 1 / (1 + recent_data['Total'].std())
            
            # 综合得分
            final_score = (
                growth_score * 0.4 +
                momentum_score * 0.4 +
                stability * 0.2
            )
            
            if final_score > 0:
                emerging_metrics.append({
                    'NOC': noc,
                    'growth_rate': float(growth_score),
                    'momentum': float(momentum_score),
                    'stability': float(stability),
                    'final_score': float(final_score),
                    'recent_medals': float(recent_data['Total'].mean()),
                    'early_medals': float(early_data['Total'].mean())
                })
        
        # 按综合得分排序
        return sorted(emerging_metrics, key=lambda x: x['final_score'], reverse=True)[:10]
    # def _identify_emerging_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
    #     """改进的新兴国家识别算法"""
    #     # 数据预处理
    #     historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC'}
    #     data = data[~data['NOC'].isin(historical_nocs)].copy()
        
    #     # 计算时期表现
    #     max_year = data['Year'].max()
    #     recent_data = data[data['Year'] >= (max_year - recent_years)]
    #     early_data = data[data['Year'] < (max_year - recent_years)]
        
    #     # 计算平均值
    #     recent_avg = recent_data.groupby('NOC')['Total'].mean()
    #     early_avg = early_data.groupby('NOC')['Total'].mean()
        
    #     # 计算增长
    #     growth_df = pd.DataFrame({
    #         'recent': recent_avg,
    #         'early': early_avg
    #     }).fillna(0)
        
    #     # 改进的增长率计算
    #     growth_df['growth_rate'] = (
    #         (growth_df['recent'] - growth_df['early']) / 
    #         (growth_df['early'].replace(0, 0.1))  # 避免除以0
    #     ) * 100
        
    #     # 限制极端值
    #     growth_df['growth_rate'] = growth_df['growth_rate'].clip(-1000, 1000)
        
    #     # 筛选条件
    #     emerging = growth_df[
    #         (growth_df['growth_rate'] > 20) &  # 显著增长
    #         (growth_df['recent'] >= 3) &       # 当前有一定规模
    #         (growth_df['recent'] > growth_df['early'])  # 确实在增长
    #     ].sort_values('growth_rate', ascending=False)
        
    #     return [
    #         {
    #             'NOC': noc,
    #             'growth_rate': float(emerging.loc[noc, 'growth_rate']),
    #             'recent_medals': float(emerging.loc[noc, 'recent']),
    #             'early_medals': float(growth_df.loc[noc, 'early'])
    #         }
    #         for noc in emerging.index[:10]
    #     ]

    def _identify_declining_countries(self, data: pd.DataFrame, recent_years: int = 12) -> List[Dict]:
        """改进的衰退国家识别"""
        # 过滤历史国家
        historical_nocs = {'URS', 'GDR', 'FRG', 'EUN', 'YUG', 'TCH', 'ROC', 
                        'Soviet Union', 'East Germany', 'West Germany', 'Unified Team'}
        
        data = data[~data['NOC'].isin(historical_nocs)].copy()
        
        # 计算各个时期的表现
        recent_data = data[data['Year'] >= data['Year'].max() - recent_years]
        early_data = data[data['Year'] < data['Year'].max() - recent_years]
        
        # 使用更复杂的衰退指标
        decline_metrics = []
        
        for noc in data['NOC'].unique():
            noc_recent = recent_data[recent_data['NOC'] == noc]
            noc_early = early_data[early_data['NOC'] == noc]
            
            if len(noc_recent) < 2 or len(noc_early) < 2:
                continue
            
            # 计算多个维度的衰退指标
            medal_decline = (noc_early['Total'].mean() - noc_recent['Total'].mean()) / (noc_early['Total'].mean() + 1)
            consistency_decline = noc_recent['Total'].std() / (noc_recent['Total'].mean() + 1) - \
                                noc_early['Total'].std() / (noc_early['Total'].mean() + 1)
            peak_decline = noc_early['Total'].max() - noc_recent['Total'].max()
            
            # 仅考虑最近仍有参与的国家
            if noc_recent['Year'].max() < data['Year'].max() - 4:
                continue
                
            # 综合衰退分数
            decline_score = (medal_decline * 0.5 + 
                            consistency_decline * 0.3 + 
                            (peak_decline / (noc_early['Total'].max() + 1)) * 0.2)
            
            if decline_score > 0.2:  # 显著衰退阈值
                decline_metrics.append({
                    'NOC': noc,
                    'decline_rate': float(decline_score),
                    'recent_medals': float(noc_recent['Total'].mean()),
                    'early_medals': float(noc_early['Total'].mean()),
                    'peak_medals': float(noc_early['Total'].max()),
                    'last_appearance': int(noc_recent['Year'].max())
                })
        
        return sorted(decline_metrics, key=lambda x: x['decline_rate'], reverse=True)[:10]

    def _create_region_mapping(self) -> Dict[str, str]:
        """改进的区域映射，覆盖更多国家"""
        region_mapping = {
            # 北美洲
            'USA': 'North America', 'CAN': 'North America', 'MEX': 'North America',
            'PUR': 'North America', 'CUB': 'North America', 'JAM': 'North America',
            # 南美洲
            'BRA': 'South America', 'ARG': 'South America', 'CHI': 'South America',
            'COL': 'South America', 'VEN': 'South America', 'PER': 'South America',
            'ECU': 'South America', 'URU': 'South America', 'BOL': 'South America',
            # 欧洲
            'GBR': 'Europe', 'FRA': 'Europe', 'GER': 'Europe', 'ITA': 'Europe',
            'ESP': 'Europe', 'NED': 'Europe', 'SWE': 'Europe', 'NOR': 'Europe',
            'DEN': 'Europe', 'FIN': 'Europe', 'BEL': 'Europe', 'AUT': 'Europe',
            'SUI': 'Europe', 'POL': 'Europe', 'HUN': 'Europe', 'CZE': 'Europe',
            'GRE': 'Europe', 'UKR': 'Europe', 'RUS': 'Europe', 'TUR': 'Europe',
            'POR': 'Europe', 'ROU': 'Europe', 'SRB': 'Europe', 'CRO': 'Europe',
            # 亚洲
            'CHN': 'Asia', 'JPN': 'Asia', 'KOR': 'Asia', 'PRK': 'Asia',
            'IND': 'Asia', 'IRN': 'Asia', 'KAZ': 'Asia', 'THA': 'Asia',
            'IDN': 'Asia', 'MAS': 'Asia', 'SGP': 'Asia', 'VNM': 'Asia',
            'PHI': 'Asia', 'PAK': 'Asia', 'BAN': 'Asia', 'IRQ': 'Asia',
            # 大洋洲
            'AUS': 'Oceania', 'NZL': 'Oceania', 'FIJ': 'Oceania', 'PNG': 'Oceania',
            # 非洲
            'RSA': 'Africa', 'KEN': 'Africa', 'ETH': 'Africa', 'NGR': 'Africa',
            'EGY': 'Africa', 'MAR': 'Africa', 'ALG': 'Africa', 'TUN': 'Africa',
            'ZIM': 'Africa', 'UGA': 'Africa', 'GHA': 'Africa', 'CIV': 'Africa',
            # 历史国家映射
            'URS': 'Europe', 'GDR': 'Europe', 'FRG': 'Europe', 'TCH': 'Europe',
            'YUG': 'Europe', 'EUN': 'Europe', 'ROC': 'Europe', 'SGP': 'Asia',
        }
        return region_mapping

    def _analyze_regional_patterns(self, data: pd.DataFrame) -> Dict:
        """
        Enhanced regional analysis with comprehensive metrics
        """
        region_patterns = {}
        regions = ['Europe', 'Asia', 'North America', 'South America', 'Africa', 'Oceania']
        
        # Ensure region mapping
        data['Region'] = data['NOC'].map(self._create_region_mapping())
        data = data[data['Region'].isin(regions)]
        
        max_year = data['Year'].max()
        recent_cutoff = max_year - 12
        
        for region in regions:
            region_data = data[data['Region'] == region]
            if len(region_data) < 2:
                continue
                
            recent_data = region_data[region_data['Year'] > recent_cutoff]
            historical_data = region_data[region_data['Year'] <= recent_cutoff]
            
            try:
                # Calculate comprehensive metrics
                recent_medals = recent_data.groupby('Year')['Total'].sum().mean()
                historical_medals = historical_data.groupby('Year')['Total'].sum().mean()
                
                growth_rate = ((recent_medals - historical_medals) / 
                            (historical_medals + 1e-6))
                
                # Calculate region dominance
                total_recent = data[data['Year'] > recent_cutoff]['Total'].sum()
                region_recent = recent_data['Total'].sum()
                region_share = region_recent / total_recent if total_recent > 0 else 0
                
                # Top performing countries
                top_countries = recent_data.groupby('NOC')['Total'].sum().nlargest(3)
                
                # Diversity of strong performers
                countries_above_threshold = len(recent_data.groupby('NOC')['Total'].sum()[
                    lambda x: x > x.mean()
                ])
                
                region_patterns[region] = {
                    'trend': float(growth_rate),
                    'recent_medals_avg': float(recent_medals),
                    'historical_medals_avg': float(historical_medals),
                    'region_share': float(region_share),
                    'top_countries': top_countries.index.tolist(),
                    'strong_performers_count': countries_above_threshold,
                    'total_countries': len(region_data['NOC'].unique())
                }
                
            except Exception as e:
                print(f"Error analyzing {region}: {str(e)}")
                continue
        
        return region_patterns
    def _calculate_robust_trend(self, series: pd.Series) -> float:
        """使用Theil-Sen回归计算稳健趋势"""
        if len(series) < 2:
            return 0.0
        years = series.index.values.reshape(-1, 1)
        medals = series.values
        slope, _, _, _ = stats.theilslopes(medals, years)
        return float(slope)
    def _calculate_region_trend(self, region_data: pd.DataFrame) -> float:
        """Return the slope of ``Total`` over ``Year`` from a Huber regression.

        Args:
            region_data: table with ``Year`` and ``Total`` columns.

        Returns:
            The fitted coefficient of ``Year`` as a float.
        """
        from sklearn.linear_model import HuberRegressor

        years = region_data[['Year']].values  # 2-D: one feature column
        totals = region_data['Total'].values
        regressor = HuberRegressor()
        regressor.fit(years, totals)
        return float(regressor.coef_[0])
    def _analyze_sport_diversity(self, data: pd.DataFrame) -> Dict:
        """改进的运动项目多样性分析，增加错误检查"""
        try:
            diversity = {}
            
            # 数据验证
            if 'Sport' not in data.columns:
                raise ValueError("Sport列不存在于数据中")
                
            if 'NOC' not in data.columns:
                raise ValueError("NOC列不存在于数据中")
                
            # 打印整体统计信息
            print("\n运动项目多样性分析诊断:")
            print(f"总记录数: {len(data)}")
            print(f"唯一国家数: {data['NOC'].nunique()}")
            print(f"唯一运动项目数: {data['Sport'].nunique()}")
            
            # 计算每个国家在不同项目上的分布
            country_sports = data.groupby('NOC')['Sport'].nunique()
            print(f"\n国家运动项目分布:")
            print(f"最大值: {country_sports.max()}")
            print(f"最小值: {country_sports.min()}")
            print(f"平均值: {country_sports.mean():.2f}")
            print(f"中位数: {country_sports.median()}")
            
            # 计算多样性指标
            diversity['overall'] = {
                'avg_sports': float(country_sports.mean()),
                'max_sports': int(country_sports.max()),
                'min_sports': int(country_sports.min()),
                'median_sports': float(country_sports.median()),
                'total_sports': len(data['Sport'].unique())
            }
            
            # 识别专注型和多样化型国家
            q25, q75 = country_sports.quantile([0.25, 0.75])
            specialized = country_sports[country_sports < q25]
            diversified = country_sports[country_sports > q75]
            
            print(f"\n专注型国家(低于25分位): {len(specialized)}")
            print(f"多样化国家(高于75分位): {len(diversified)}")
            
            diversity['specialized'] = specialized.to_dict()
            diversity['diversified'] = diversified.to_dict()
            
            # 时间趋势分析
            if 'Year' in data.columns:
                recent_years = data['Year'].max() - 8
                recent_data = data[data['Year'] >= recent_years]
                recent_sports = recent_data.groupby('NOC')['Sport'].nunique()
                
                diversity['recent_trends'] = {
                    'avg_sports_recent': float(recent_sports.mean()),
                    'max_sports_recent': int(recent_sports.max()),
                    'countries_increased': len(recent_sports[recent_sports > country_sports])
                }
            
            return diversity
            
        except Exception as e:
            print(f"错误: 分析运动项目多样性时发生异常: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return {
                'overall': {'avg_sports': 0, 'max_sports': 0, 'min_sports': 0},
                'specialized': {},
                'diversified': {}
            }

    def generate_country_insights(self, medals_df: pd.DataFrame, athletes_df: pd.DataFrame) -> Dict[str, CountryInsight]:
        """Build a ``CountryInsight`` for each of the 30 biggest medal winners.

        NOC values in both tables are normalised first (surrounding
        whitespace removed, well-known full country names replaced by their
        three-letter codes) so that medal rows and athlete rows of the same
        country match.  Medal rows without an NOC are ignored.  The input
        tables are not modified.

        Args:
            medals_df: medal table with ``NOC``, ``Year`` and ``Total`` columns.
            athletes_df: athlete table with ``NOC``, ``Year`` and ``Sport`` columns.

        Returns:
            Dict mapping NOC to its ``CountryInsight``, in descending order
            of all-time medal total.
        """
        insights = {}

        # Full country name (upper case) -> three-letter code.
        noc_mapping = {
            'UNITED STATES': 'USA', 'GREAT BRITAIN': 'GBR', 'SOVIET UNION': 'URS',
            'FRANCE': 'FRA', 'CHINA': 'CHN', 'GERMANY': 'GER', 'ITALY': 'ITA',
            'AUSTRALIA': 'AUS', 'JAPAN': 'JPN', 'HUNGARY': 'HUN', 'SWEDEN': 'SWE',
            'RUSSIA': 'RUS', 'EAST GERMANY': 'GDR', 'NETHERLANDS': 'NED',
            'CANADA': 'CAN', 'SOUTH KOREA': 'KOR', 'ROMANIA': 'ROU', 'POLAND': 'POL',
            'FINLAND': 'FIN', 'CUBA': 'CUB', 'BULGARIA': 'BUL', 'SWITZERLAND': 'SUI',
            'WEST GERMANY': 'FRG', 'DENMARK': 'DEN', 'SPAIN': 'ESP', 'NORWAY': 'NOR',
            'BRAZIL': 'BRA', 'BELGIUM': 'BEL', 'NEW ZEALAND': 'NZL'
        }

        def normalise_noc(value):
            """Strip and map one NOC value; non-strings (e.g. NaN) pass through."""
            if not isinstance(value, str):
                return value
            cleaned = value.strip()
            return noc_mapping.get(cleaned.upper(), cleaned)

        # Rows without an NOC cannot be attributed to any country.
        medals_df = medals_df[medals_df['NOC'].notna()].copy()
        medals_df['NOC'] = medals_df['NOC'].map(normalise_noc)

        # Apply the same normalisation to the athlete table so that the
        # diversity score looks countries up under the same identifiers.
        athletes_df = athletes_df.copy()
        athletes_df['NOC'] = athletes_df['NOC'].map(normalise_noc)

        main_countries = medals_df.groupby('NOC')['Total'].sum().nlargest(30).index

        for noc in main_countries:
            trend_score = self._calculate_trend_score(medals_df, noc)
            stability_score = self._calculate_stability_score(medals_df, noc)
            diversity_score = self._calculate_diversity_score(athletes_df, noc)

            key_findings = self._generate_key_findings(
                medals_df, athletes_df, noc,
                trend_score, stability_score, diversity_score
            )

            recommendations = self._generate_recommendations(
                key_findings, trend_score, stability_score, diversity_score
            )

            insights[noc] = CountryInsight(
                noc=noc,
                trend_score=trend_score,
                stability_score=stability_score,
                diversity_score=diversity_score,
                key_findings=key_findings,
                recommendations=recommendations
            )

        return insights
    def _calculate_trend_score(self, data: pd.DataFrame, noc: str) -> float:
        """
        Calculate trend score using weighted recent performance and robust regression
        """
        country_data = data[data['NOC'] == noc].copy()
        if len(country_data) < 4:
            return 0.0
            
        try:
            # Split data into recent and historical periods
            max_year = country_data['Year'].max()
            recent_cutoff = max_year - 12  # Last 3 Olympics
            
            recent_data = country_data[country_data['Year'] > recent_cutoff]
            historical_data = country_data[country_data['Year'] <= recent_cutoff]
            
            # Calculate trends for both periods using Theil-Sen estimator
            def calc_period_trend(df):
                if len(df) < 2:
                    return 0
                years = (df['Year'] - df['Year'].min()).values
                medals = df['Total'].values
                return stats.theilslopes(medals, years)[0]
                
            recent_trend = calc_period_trend(recent_data)
            historical_trend = calc_period_trend(historical_data)
            
            # Calculate relative performance
            recent_avg = recent_data['Total'].mean() if len(recent_data) > 0 else 0
            historical_avg = historical_data['Total'].mean() if len(historical_data) > 0 else 0
            relative_change = (recent_avg - historical_avg) / (historical_avg + 1)
            
            # Combine metrics with weights
            trend_score = (
                0.5 * (recent_trend / (abs(recent_trend) + 1)) +  # Recent trend (normalized)
                0.3 * (historical_trend / (abs(historical_trend) + 1)) +  # Historical trend
                0.2 * (relative_change)  # Overall improvement
            )
            
            # Normalize to 0-1 range
            return max(0.0, min(1.0, (trend_score + 1) / 2))
            
        except Exception as e:
            print(f"Error calculating trend score for {noc}: {str(e)}")
            return 0.0
    # def _calculate_trend_score(self, data: pd.DataFrame, noc: str) -> float:
    #     """计算国家的发展趋势得分
    #     改进:
    #     1. 考虑长期和短期趋势
    #     2. 使用robust回归
    #     3. 确保得分在0-1范围内
    #     """
    #     country_data = data[data['NOC'] == noc].copy()
    #     if len(country_data) < 2:
    #         return 0.0
        
    #     # 准备数据
    #     country_data = country_data.sort_values('Year')
    #     years = (country_data['Year'] - country_data['Year'].min()).values
    #     medals = country_data['Total'].values
        
    #     try:
    #         # 使用RobustRegression避免异常值影响
    #         regression = stats.theilslopes(medals, years)
    #         trend = regression[0]  # 斜率
            
    #         # 计算近期趋势（最近3届）
    #         recent_data = country_data.tail(3)
    #         if len(recent_data) >= 2:
    #             recent_years = (recent_data['Year'] - recent_data['Year'].min()).values
    #             recent_medals = recent_data['Total'].values
    #             recent_trend = stats.theilslopes(recent_medals, recent_years)[0]
    #         else:
    #             recent_trend = trend
            
    #         # 标准化趋势分数
    #         all_trends = data.groupby('NOC').apply(
    #             lambda x: stats.theilslopes(x['Total'].values, 
    #                                     (x['Year'] - x['Year'].min()).values)[0]
    #             if len(x) >= 2 else 0
    #         )
            
    #         # 使用百分位数进行标准化，确保分数在0-1之间
    #         trend_percentile = stats.percentileofscore(all_trends, trend) / 100
    #         recent_percentile = stats.percentileofscore(all_trends, recent_trend) / 100
            
    #         # 综合长期和近期趋势，近期趋势权重更大
    #         final_score = trend_percentile * 0.4 + recent_percentile * 0.6
            
    #         return max(0.0, min(1.0, final_score))  # 确保在0-1范围内
            
    #     except Exception:
    #         return 0.0

    def _calculate_stability_score(self, data: pd.DataFrame, noc: str) -> float:
        """计算国家表现的稳定性得分"""
        country_data = data[data['NOC'] == noc]
        if len(country_data) < 2:
            return 0.0
            
        # 计算变异系数（标准差/平均值）
        cv = country_data['Total'].std() / (country_data['Total'].mean() + 1e-6)
        
        # 转换为稳定性得分（越稳定越接近1）
        return 1 / (1 + cv)

    def _calculate_diversity_score(self, data: pd.DataFrame, noc: str) -> float:
        """改进的多样性得分计算"""
        try:
            # 规范化NOC处理
            standardized_noc = noc.strip().upper()
            country_data = data[data['NOC'] == standardized_noc].copy()
            
            if len(country_data) == 0:
                print(f"警告: {noc} (标准化后: {standardized_noc}) 没有运动项目数据")
                # 尝试模糊匹配
                similar_nocs = data['NOC'].unique()
                print(f"数据中存在的相似NOC: {[n for n in similar_nocs if n.startswith(standardized_noc[:3])]}")
                return 0.0
                
            # 基本验证
            if 'Sport' not in country_data.columns:
                print(f"错误: Sport列不存在")
                return 0.0
                
            # 数据统计
            total_sports = len(data['Sport'].unique())
            country_sports = len(country_data['Sport'].unique())
            
            # 1. 规模得分 (0-0.4)
            scale_score = 0.4 * (country_sports / total_sports) if total_sports > 0 else 0
            
            # 2. 均衡度得分 (0-0.3)
            sport_counts = country_data['Sport'].value_counts()
            if len(sport_counts) > 1:
                probs = sport_counts / sport_counts.sum()
                entropy = -np.sum(probs * np.log2(probs + 1e-10))
                max_entropy = np.log2(len(sport_counts))
                balance_score = 0.3 * (entropy / max_entropy)
            else:
                balance_score = 0.0
            
            # 3. 参与度得分 (0-0.3)
            recent_years = data['Year'].max() - 8
            recent_data = country_data[country_data['Year'] >= recent_years]
            participation_rate = len(recent_data['Sport'].unique()) / max(country_sports, 1)
            participation_score = 0.3 * participation_rate
            
            final_score = scale_score + balance_score + participation_score
            
            return round(min(1.0, max(0.0, final_score)), 2)
            
        except Exception as e:
            print(f"错误: 计算{noc}的多样性得分时发生异常: {str(e)}")
            import traceback
            print(traceback.format_exc())
            return 0.0

    def _generate_key_findings(self, 
                             medals_df: pd.DataFrame, 
                             athletes_df: pd.DataFrame, 
                             noc: str,
                             trend_score: float,
                             stability_score: float,
                             diversity_score: float) -> List[str]:
        """生成关键发现"""
        findings = []
        
        # 分析趋势
        if trend_score > 0.7:
            findings.append("显示出强劲的上升势头")
        elif trend_score < 0.3:
            findings.append("表现呈现下降趋势")
        
        # 分析稳定性
        if stability_score > 0.7:
            findings.append("表现非常稳定")
        elif stability_score < 0.3:
            findings.append("表现波动较大")
        
        # 分析多样性
        if diversity_score > 0.7:
            findings.append("具有良好的项目多样性")
        elif diversity_score < 0.3:
            findings.append("项目集中度较高")
        
        return findings

    def _generate_recommendations(self, findings: List[str],
                                trend_score: float,
                                stability_score: float,
                                diversity_score: float) -> List[str]:
        """
        Generate targeted recommendations based on comprehensive analysis
        """
        recommendations = []
        
        # Trend-based recommendations
        if trend_score < 0.3:
            recommendations.append("建议加大重点项目投入，制定中长期人才培养计划")
        elif trend_score < 0.6:
            recommendations.append("保持现有优势项目投入，同时开拓新的潜力项目")
        else:
            recommendations.append("巩固优势项目领先地位，建立可持续的竞技体系")
        
        # Stability-based recommendations
        if stability_score < 0.4:
            recommendations.append("加强后备人才梯队建设，建立科学的选材体系")
        elif stability_score < 0.7:
            recommendations.append("优化训练体系，提高竞技水平稳定性")
        else:
            recommendations.append("完善人才储备机制，保持成绩稳定性")
        
        # Diversity-based recommendations
        if diversity_score < 0.3:
            recommendations.append("拓展优势项目相关新项目，培养复合型人才")
        elif diversity_score < 0.6:
            recommendations.append("在保持优势项目的同时，积极开发新的竞技项目")
        else:
            recommendations.append("维持项目多样性优势，优化资源分配策略")
        
        return recommendations

    def generate_report(self, trends: Dict, insights: Dict[str, CountryInsight]) -> str:
        """Render trend statistics and per-country insights as a text report.

        The report has four numbered sections: overall medal trends, emerging
        and declining countries (at most five each), regional statistics, and
        a detailed view of the first ten entries of ``insights``.

        Args:
            trends: Mapping read through its ``'overall'`` and ``'regional'``
                keys; missing keys fall back to empty values.
            insights: Mapping of NOC code to an object exposing
                ``trend_score``, ``stability_score``, ``diversity_score``,
                ``key_findings`` and ``recommendations``.

        Returns:
            The report lines joined with newline characters.
        """
        divider = "-" * 50
        overall = trends.get('overall', {})

        # Section 1: overall trend and medal concentration per period.
        lines = [
            "1. 奥运会奖牌总体趋势分析",
            divider,
            f"\n参与国家数量: {overall.get('total_countries', 'N/A')}",
        ]
        for period, period_stats in overall.get('medals_concentration', {}).items():
            lines += [
                f"\n{period}时期:",
                f"  - 基尼系数: {period_stats['gini_coefficient']:.3f}",
                f"  - 前10国家占比: {period_stats['top_10_share']*100:.1f}%",
            ]

        # Section 2: emerging and declining countries.
        lines += ["\n\n2. 新兴与衰退国家分析", divider, "\n新兴奥运强国:"]
        lines.extend(
            f"  - {entry['NOC']}: 增长率 {entry['growth_rate']*100:.1f}%, "
            f"近期平均 {entry['recent_medals']:.1f} 枚奖牌"
            for entry in overall.get('emerging_countries', [])[:5]
        )
        lines.append("\n实力下降国家:")
        lines.extend(
            f"  - {entry['NOC']}: 下降率 {entry['decline_rate']*100:.1f}%, "
            f"近期平均 {entry['recent_medals']:.1f} 枚奖牌"
            for entry in overall.get('declining_countries', [])[:5]
        )

        # Section 3: regional statistics.
        lines += ["\n\n3. 区域性分析", divider]
        for region, region_stats in trends.get('regional', {}).items():
            lines += [
                f"\n{region}:",
                f"  - 趋势系数: {region_stats['trend']:.2f}",
                f"  - 波动性: {region_stats['volatility']:.2f}",
                f"  - 奖牌中位数: {region_stats['medal_median']:.1f}",
                f"  - 近期份额: {region_stats['recent_share']*100:.1f}%",
            ]

        # Section 4: per-country detail, limited to the first ten entries.
        lines += ["\n\n4. 国家深度分析", divider]
        for noc, insight in list(insights.items())[:10]:
            lines += [
                f"\n{noc}分析:",
                f"  趋势得分: {insight.trend_score:.2f}",
                f"  稳定性得分: {insight.stability_score:.2f}",
                f"  多样性得分: {insight.diversity_score:.2f}",
                "  主要发现:",
            ]
            lines.extend(f"    - {finding}" for finding in insight.key_findings)
            lines.append("  建议:")
            lines.extend(f"    - {advice}" for advice in insight.recommendations)

        return "\n".join(lines)

def main():
    """Run the insight pipeline end to end.

    Loads the data, analyses medal trends, builds the country insights,
    writes the report to ``analysis_results/olympic_medal_insights_report.txt``
    and echoes it to the console. Any exception is printed together with its
    traceback and then re-raised.
    """
    console = Console()

    def announce(step):
        # Progress banner shown before each pipeline stage.
        console.print(f"[bold cyan]{step}[/bold cyan]")

    try:
        analyzer = OlympicMedalInsightAnalyzer()

        announce("加载数据...")
        medals_df, athletes_df, programs_df = analyzer.load_and_prepare_data()

        announce("分析奖牌趋势...")
        trends = analyzer.analyze_medal_trends(medals_df, athletes_df)

        announce("生成国家洞察...")
        insights = analyzer.generate_country_insights(medals_df, athletes_df)

        report = analyzer.generate_report(trends, insights)

        # Persist the report next to the working directory.
        output_dir = Path("analysis_results")
        output_dir.mkdir(exist_ok=True)
        report_path = output_dir / "olympic_medal_insights_report.txt"
        with report_path.open("w", encoding='utf-8') as handle:
            handle.write(report)

        console.print("\n[bold green]分析报告:[/bold green]")
        console.print(report)

    except Exception as e:
        console.print(f"[bold red]错误: {str(e)}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        raise e

# Run the report pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
    def _find_advanced_benchmark(self, data: pd.DataFrame, sport: str, target_country: str) -> Dict:
        """Improved benchmark country selection with dynamic window and decay"""

        def calculate_sport_strength(country_data: pd.DataFrame, window_years: int = 8) -> float:
            """Calculate sport strength with temporal weighting"""
            if len(country_data) < 3:
                return 0.0

            recent_data = country_data.sort_values('Year', ascending=False).head(window_years)
            if recent_data.empty:
                return 0.0

            # Exponential decay weights
            years = recent_data['Year'].values
            max_year = years.max()
            weights = np.exp(-0.2 * (max_year - years))

            # Weighted metrics
            weighted_medals = np.average(recent_data['Medal_Value'].values, weights=weights)
            weighted_trend = np.polyfit(range(len(recent_data)), recent_data['Medal_Value'].values, 1, w=weights)[0]

            return 0.7 * weighted_medals + 0.3 * weighted_trend

        def calculate_historical_fit(country_data: pd.DataFrame) -> float:
            """Calculate historical fit score with peak performance consideration"""
            if len(country_data) < 3:
                return 0.0

            years = country_data['Year'].values
            medals = country_data['Medal_Value'].values

            # Historical peak
            peak_medals = np.percentile(medals, 95)

            # Recent performance (last 3 cycles)
            recent_mask = years >= (years.max() - 12)
            recent_medals = medals[recent_mask] if any(recent_mask) else medals

            if len(recent_medals) == 0:
                return 0.0

            recent_avg = np.mean(recent_medals)

            # Consistency factor
            consistency = 1 / (1 + np.std(recent_medals))

            # Combined score
            historical_fit = (0.4 * (recent_avg / peak_medals) +
                              0.4 * consistency +
                              0.2 * (len(country_data) / 20))  # Experience factor

            return float(np.clip(historical_fit, 0, 1))

        try:
            # Dynamic analysis period based on sport
            sport_cycles = {
                'Swimming': 8,
                'Athletics': 12,
                'Gymnastics': 16,
                'Volleyball': 8
            }
            analysis_period = sport_cycles.get(sport, 12)

            # Country code normalization
            country_mapping = {
                'URS': 'RUS', 'GDR': 'GER', 'FRG': 'GER',
                'TCH': 'CZE', 'YUG': 'SRB'
            }
            normalized_country = country_mapping.get(target_country, target_country)

            # Get target country data
            target_data = data[
                (data['Sport'] == sport) &
                (data['Year'] >= data['Year'].max() - analysis_period) &
                (data['NOC'] == normalized_country)
                ]

            if target_data.empty:
                return None

            # Calculate target metrics
            target_strength = calculate_sport_strength(target_data)
            target_fit = calculate_historical_fit(target_data)

            # Calculate current performance with decay
            recent_performance = target_data.sort_values('Year', ascending=False)['Medal_Value'].head(3).mean()
            if recent_performance == 0:
                # Find last medal year and apply decay
                last_medal = data[
                    (data['Sport'] == sport) &
                    (data['NOC'] == normalized_country) &
                    (data['Medal_Value'] > 0)
                    ].sort_values('Year', ascending=False)

                if not last_medal.empty:
                    years_since = data['Year'].max() - last_medal['Year'].iloc[0]
                    decay_factor = 0.8 ** (years_since / 4)  # 20% decay per cycle
                    recent_performance = last_medal['Medal_Value'].iloc[0] * decay_factor

            # Find benchmark countries
            other_countries = []
            for country in data[data['NOC'] != normalized_country]['NOC'].unique():
                if country in country_mapping.keys():  # Skip historical countries
                    continue

                country_data = data[
                    (data['Sport'] == sport) &
                    (data['Year'] >= data['Year'].max() - analysis_period) &
                    (data['NOC'] == country)
                    ]

                if len(country_data) >= 3:
                    strength = calculate_sport_strength(country_data)
                    hist_fit = calculate_historical_fit(country_data)

                    if strength > target_strength:
                        performance = country_data.sort_values('Year', ascending=False)['Medal_Value'].head(3).mean()

                        # Monte Carlo simulation for uncertainty
                        volatility = {'Swimming': 0.15, 'Gymnastics': 0.25}.get(sport, 0.2)
                        simulations = np.random.normal(performance, performance * volatility, 1000)
                        ci = np.percentile(simulations, [5, 95])

                        other_countries.append({
                            'NOC': country,
                            'strength': strength,
                            'historical_fit': hist_fit,
                            'performance': performance,
                            'uncertainty': (ci[0], ci[1]),
                            'score': 0.4 * strength + 0.3 * hist_fit + 0.3 * (performance / (target_strength + 1e-6))
                        })

            if not other_countries:
                return None

            # Select best benchmark
            best_benchmark = max(other_countries, key=lambda x: x['score'])

            # Calculate normalized improvement potential
            sport_max = data.groupby('Sport')['Medal_Value'].max()[sport]
            improvement_potential = (best_benchmark['performance'] - recent_performance) / (sport_max + 1e-6)

            return {
                'sport': sport,
                'benchmark_country': str(best_benchmark['NOC']),
                'current_performance': float(recent_performance),
                'improvement_potential': float(np.clip(improvement_potential, 0, 1)),
                'historical_fit': float(target_fit),
                'estimated_medal_gain': {
                    'mean': float(best_benchmark['performance'] - recent_performance),
                    'range': f"{best_benchmark['uncertainty'][0]:.1f}-{best_benchmark['uncertainty'][1]:.1f}"
                },
                'benchmark_metrics': {
                    'avg_performance': float(best_benchmark['performance']),
                    'stability': float(best_benchmark['historical_fit']),
                    'experience_years': int(len(target_data))
                }
            }

        except Exception as e:
            print(f"Benchmark analysis error: {str(e)}")
            return None

In [None]:
    def recommend_coach_investments(self, athletes_df: pd.DataFrame, medals_df: pd.DataFrame,
                                    countries: List[str]) -> Dict:
        """Rank, per country, the sports where coaching investment looks most promising.

        For each country and each of four fixed sports, the country's
        historical metrics are compared with those of the top-scoring other
        NOC in that sport. A sport is recommended when the resulting
        improvement potential is at least 0.1.

        Args:
            athletes_df: Table with ``Sport``, ``NOC``, ``Year`` and
                ``Medal_Value`` columns.
            medals_df: Accepted but not read by this method.
            countries: NOC codes to analyse.

        Returns:
            Mapping of country to a list of recommendation dicts, sorted by
            ``improvement_potential`` in descending order. A sport whose
            analysis raises is skipped. If the outer loop itself fails, the
            error is printed and the recommendations built so far are
            returned.

        Note:
            Progress and diagnostics are written to stdout with ``print``.
        """

        def create_empty_metrics():
            """Return the all-zero metrics used when no data is found."""
            return {
                'peak': 0,
                'recent': 0,
                'trend': 0,
                'years': 0,
                'historical_fit': 0,
                'consistency': 0,
                'total_medals': 0
            }

        def analyze_historical_data(data: pd.DataFrame, country: str, sport: str) -> Dict:
            """Compute yearly medal metrics for ``country`` in ``sport``."""
            print(f"\n分析 {country} 在 {sport} 项目的历史数据:")

            # Dump the raw category values to help diagnose matching problems.
            print(f"数据集中unique的Sport值: {data['Sport'].unique()}")
            print(f"数据集中unique的NOC值: {data['NOC'].unique()}")

            # Case-insensitive substring match rather than strict equality.
            sport_mask = data['Sport'].str.contains(sport, case=False, na=False)
            country_mask = data['NOC'].str.contains(country, case=False, na=False)

            country_data = data[sport_mask & country_mask].copy()

            print(f"找到 {len(country_data)} 条原始数据记录")
            if len(country_data) == 0:
                print(f"尝试更广泛的搜索...")
                # Look for sport-name variants sharing the first four letters.
                possible_sports = data[data['Sport'].str.contains(sport[:4], case=False, na=False)]['Sport'].unique()
                print(f"可能的运动项目: {possible_sports}")

                # Look for country-code variants sharing the first two letters.
                possible_countries = data[data['NOC'].str.contains(country[:2], case=False, na=False)]['NOC'].unique()
                print(f"可能的国家代码: {possible_countries}")

                # Retry with the looser candidate sets.
                sport_mask = data['Sport'].isin(possible_sports)
                country_mask = data['NOC'].isin(possible_countries)
                country_data = data[sport_mask & country_mask].copy()
                print(f"宽松匹配后找到 {len(country_data)} 条记录")

            if len(country_data) == 0:
                print(f"警告: {country} 在 {sport} 项目中没有数据")
                return create_empty_metrics()

            # Sanity output for the matched rows.
            print("\n数据验证:")
            print(f"年份范围: {country_data['Year'].min()} - {country_data['Year'].max()}")
            print(f"Medal_Value统计: \n{country_data['Medal_Value'].describe()}")

            # Yearly totals.
            yearly_medals = country_data.groupby('Year')['Medal_Value'].sum()
            print(f"\n年度奖牌统计:\n{yearly_medals}")

            # Cap outliers at Q3 + 1.5 * IQR.
            q1, q3 = yearly_medals.quantile([0.25, 0.75])
            iqr = q3 - q1
            upper_bound = q3 + 1.5 * iqr
            yearly_medals = yearly_medals.clip(upper=upper_bound)

            # Recent performance: years within 8 of the latest year.
            recent_years = yearly_medals.index >= yearly_medals.index.max() - 8
            recent_performance = yearly_medals[recent_years].mean() if any(recent_years) else 0
            print(f"近期平均表现: {recent_performance}")

            # Linear trend over the sequence of participation years.
            if len(yearly_medals) >= 2:
                years = np.array(range(len(yearly_medals)))
                trend = np.polyfit(years, yearly_medals.values, 1)[0]
            else:
                trend = 0
            print(f"趋势系数: {trend}")

            # Inputs for the historical-fit score. The span assumes one
            # edition every four years.
            total_medals = yearly_medals.sum()
            n_years = len(yearly_medals)
            span_years = yearly_medals.index.max() - yearly_medals.index.min() + 4
            participation_rate = n_years * 4 / span_years if span_years > 0 else 0

            # Consistency: lower year-to-year spread gives a higher score.
            consistency = 1 / (1 + yearly_medals.std()) if len(yearly_medals) > 1 else 0

            print(f"""
        详细统计:
        - 总奖牌数: {total_medals}
        - 参赛年数: {n_years}
        - 参与率: {participation_rate:.2f}
        - 稳定性: {consistency:.2f}
        """)

            # Combined historical fit.
            historical_fit = (
                    0.4 * (total_medals / (n_years + 1)) / 10 +  # scaled average medals
                    0.3 * participation_rate +
                    0.3 * consistency
            )

            metrics = {
                'peak': float(yearly_medals.max()),
                'recent': float(recent_performance),
                'trend': float(trend),
                'years': int(n_years),
                'historical_fit': float(historical_fit),
                'consistency': float(consistency),
                'total_medals': int(total_medals)
            }

            print(f"最终指标:\n{metrics}")
            return metrics

        def evaluate_improvement_potential(target: Dict, benchmark: Dict) -> float:
            """Score in [0, 1] for how much ``target`` could gain on ``benchmark``."""
            print(f"\n评估改进潜力:")
            print(f"目标指标: {target}")
            print(f"标杆指标: {benchmark}")

            # New or historically weak programmes get a fixed base potential.
            if target['total_medals'] < 3:
                base_potential = 0.3
                print(f"新项目或历史较弱，基础潜力: {base_potential}")
                return base_potential

            # Relative gap in recent performance.
            relative_gap = (benchmark['recent'] - target['recent']) / (benchmark['recent'] + 1)
            print(f"相对差距: {relative_gap}")

            # Trend factor: only a benchmark that trends better counts.
            trend_factor = np.tanh(max(0, benchmark['trend'] - target['trend']))
            print(f"趋势因子: {trend_factor}")

            # Historical factor.
            historical_factor = benchmark['historical_fit'] / (target['historical_fit'] + 0.1)
            print(f"历史因子: {historical_factor}")

            # Weighted combination; the historical factor is capped at 2.
            potential = (
                    0.5 * relative_gap +
                    0.3 * trend_factor +
                    0.2 * min(historical_factor, 2)
            )

            final_potential = float(np.clip(potential, 0, 1))
            print(f"最终潜力评分: {final_potential}")
            return final_potential

        # Bound before the ``try`` so the handler below can always return it.
        recommendations = {}
        try:
            print(f"\n开始生成教练投资建议...")

            # Sports that are evaluated for every country.
            sports = ['Swimming', 'Athletics', 'Gymnastics', 'Volleyball']

            # Upper bound on the medal gain reported per sport.
            max_medals = {
                'Swimming': 15,
                'Athletics': 12,
                'Gymnastics': 10,
                'Volleyball': 8
            }

            for country in countries:
                print(f"\n分析 {country} 的投资机会:")
                country_recommendations = []

                for sport in sports:
                    print(f"\n评估 {sport} 项目:")
                    try:
                        # Metrics of the country under analysis.
                        target_metrics = analyze_historical_data(athletes_df, country, sport)

                        # Candidate benchmarks: other, non-historical NOCs.
                        benchmark_data = athletes_df[
                            (athletes_df['Sport'] == sport) &
                            (athletes_df['NOC'] != country) &
                            (~athletes_df['NOC'].isin(['URS', 'GDR', 'FRG', 'TCH', 'YUG']))
                            ]

                        # The NOC with the highest medal-value total is the
                        # benchmark; an empty candidate set raises IndexError,
                        # which the handler below turns into "skip this sport".
                        benchmark_country = benchmark_data.groupby('NOC')['Medal_Value'].sum().nlargest(1).index[0]
                        print(f"选择的标杆国家: {benchmark_country}")

                        benchmark_metrics = analyze_historical_data(athletes_df, benchmark_country, sport)

                        potential = evaluate_improvement_potential(target_metrics, benchmark_metrics)

                        # Expected medal gain, capped per sport.
                        medal_gain = min(
                            max_medals[sport],
                            benchmark_metrics['recent'] - target_metrics['recent']
                        )

                        if potential >= 0.1:  # deliberately low threshold
                            country_recommendations.append({
                                'sport': sport,
                                'benchmark_country': benchmark_country,
                                'current_performance': float(target_metrics['recent']),
                                'improvement_potential': float(potential),
                                'historical_fit': float(target_metrics['historical_fit']),
                                'estimated_medal_gain': {
                                    'mean': float(medal_gain),
                                    'range': f"{(medal_gain * 0.85):.1f}-{(medal_gain * 1.15):.1f}"
                                },
                                'benchmark_metrics': {
                                    'avg_performance': float(benchmark_metrics['recent']),
                                    'trend': float(benchmark_metrics['trend']),
                                    'experience_years': int(benchmark_metrics['years'])
                                }
                            })

                    except Exception as e:
                        print(f"处理 {sport} 时出错: {str(e)}")
                        continue

                recommendations[country] = sorted(
                    country_recommendations,
                    key=lambda x: x['improvement_potential'],
                    reverse=True
                )

            return recommendations

        except Exception as e:
            # The ``try`` above previously had no handler, which is a syntax
            # error. Report the failure and hand back whatever was built.
            print(f"教练投资建议生成错误: {str(e)}")
            return recommendations

import pandas as pd
import numpy as np
from pathlib import Path
from rich.console import Console
from rich.table import Table
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from typing import Dict, List, Tuple
import seaborn as sns
import matplotlib.pyplot as plt
from dataclasses import dataclass
from scipy import stats

@dataclass
class CoachImpact:
    """One detected shift in a country's medal output for a sport.

    Instances are created by ``CoachImpactAnalyzer.detect_coach_effect_periods``
    for analysis windows that pass its size, significance and consistency
    checks.
    """
    sport: str  # sport the window was analysed for
    country: str  # NOC code after historical-code normalisation
    period: Tuple[int, int]  # (first year, last year) of the analysis window
    medal_change: float  # mean medal value of the second half minus the first half
    significance: float  # 1 - p_value from the bootstrap significance test
    consistency: float  # score returned by _evaluate_performance_consistency

class CoachImpactAnalyzer:
    def __init__(self):
        """Create the console used for output and two empty result dicts."""
        self.coach_effects = dict()
        self.country_recommendations = dict()
        self.console = Console()

    def load_and_prepare_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load the athlete and medal-count tables and derive ``Medal_Value``.

        Each table is looked up under ``data/processed/`` as a parquet file
        first and a CSV file second. Required columns are checked *before*
        any column is touched, so a malformed file fails with a descriptive
        ``ValueError`` instead of a bare ``KeyError``.

        Returns:
            ``(athletes_df, medals_df)``. ``athletes_df`` has a numeric
            ``Year``, no rows with missing ``Year``/``NOC``/``Sport``, and a
            ``Medal_Value`` column (Gold=3, Silver=2, Bronze=1, otherwise 0).

        Raises:
            FileNotFoundError: No readable file was found for a table.
            ValueError: A table lacks one of its required columns.

        Note:
            Every failure is printed to the console and then re-raised. The
            ``data/processed`` directory is created if it does not exist.
        """

        def try_load_data(file_path_base):
            """Return the first candidate file for ``file_path_base`` that loads."""
            base_path = Path(file_path_base)
            data_dir = base_path.parent

            # Create the data directory if it is missing.
            data_dir.mkdir(parents=True, exist_ok=True)

            # Candidate (path, reader) pairs, tried in order.
            attempts = [
                (base_path.with_suffix('.parquet'), pd.read_parquet),
                (base_path.with_suffix('.csv'), pd.read_csv),
                (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='utf-8')),
                (base_path.with_suffix('.csv'), lambda x: pd.read_csv(x, encoding='latin1')),
                (base_path.parent / f"{base_path.name}.parquet", pd.read_parquet),
                (base_path.parent / f"{base_path.name}.csv", pd.read_csv)
            ]

            errors = []
            for file_path, reader in attempts:
                try:
                    if file_path.exists():
                        data = reader(file_path)
                        self.console.print(f"[green]成功从 {file_path} 加载数据[/green]")
                        return data
                except Exception as e:
                    # Remember why this candidate failed and try the next one.
                    errors.append(f"{file_path}: {str(e)}")
                    continue

            # Nothing loaded: list the directory content to aid debugging.
            available_files = list(data_dir.glob("*")) if data_dir.exists() else []
            files_str = "\n".join(f"- {f.name}" for f in available_files) if available_files else "目录为空"

            error_msg = (
                f"无法加载数据文件 {base_path}.*\n"
                f"尝试的路径:\n{chr(10).join(f'- {err}' for err in errors)}\n"
                f"目录 {data_dir} 中的文件:\n{files_str}"
            )
            raise FileNotFoundError(error_msg)

        try:
            athletes_df = try_load_data("data/processed/athletes")
            medals_df = try_load_data("data/processed/medal_counts")

            # Validate the schema first; the preprocessing below indexes
            # these columns directly.
            required_cols = {'athletes': ['Year', 'NOC', 'Sport', 'Medal'],
                            'medals': ['Year', 'NOC', 'Gold', 'Total']}

            for df_name, df in [('athletes', athletes_df), ('medals', medals_df)]:
                missing_cols = [col for col in required_cols[df_name] if col not in df.columns]
                if missing_cols:
                    self.console.print(f"[yellow]警告: {df_name} 数据集的列: {', '.join(df.columns)}[/yellow]")
                    raise ValueError(f"{df_name} 数据集缺少必要的列: {', '.join(missing_cols)}")

            # Coerce Year to numeric and drop rows missing a key field.
            athletes_df['Year'] = pd.to_numeric(athletes_df['Year'], errors='coerce')
            athletes_df = athletes_df.dropna(subset=['Year', 'NOC', 'Sport'])

            # Map medal names to numeric values; anything else becomes 0.
            medal_map = {'Gold': 3, 'Silver': 2, 'Bronze': 1}
            athletes_df['Medal_Value'] = athletes_df['Medal'].map(medal_map).fillna(0)

            return athletes_df, medals_df

        except Exception as e:
            self.console.print(f"[bold red]数据加载错误: {str(e)}[/bold red]")
            raise

    def detect_coach_effect_periods(self, data: pd.DataFrame, country: str, sport: str) -> List[CoachImpact]:
        """Detect windows in which a country's medal output in a sport shifted.

        Yearly medal totals are scanned with a sliding window of
        ``window_size`` rows (one row per year present in the data). Each
        window is split into two halves; a window is reported when the mean
        medal value of the halves differs by at least ``min_medals``, it does
        not start or end within ``min_gap`` years of an already accepted
        window, the bootstrap p-value is below 0.05 and the consistency score
        exceeds 0.4.

        Args:
            data: Table with ``NOC``, ``Sport``, ``Year``, ``Team`` and
                ``Medal_Value`` columns.
            country: NOC code; a few historical codes are mapped to their
                successor before matching.
            sport: Sport name, matched exactly.

        Returns:
            Detected impacts sorted by absolute medal change, largest first.
            An empty list is returned for invalid or insufficient input. If
            an unexpected error occurs it is printed and the impacts found
            so far are returned.
        """
        impacts = []
        min_medals = 3
        min_gap = 12
        window_size = 12

        try:
            # Input validation
            if data.empty or not isinstance(country, str) or not isinstance(sport, str):
                return impacts

            # Normalise historical country codes
            country_map = {'URS': 'RUS', 'GDR': 'GER', 'FRG': 'GER', 'TCH': 'CZE'}
            normalized_country = country_map.get(country, country)

            # Select the rows for this country and sport
            country_data = data[
                (data['NOC'] == normalized_country) &
                (data['Sport'] == sport)
                ].copy()  # work on a copy to avoid warnings

            if country_data.empty:
                return impacts

            # Aggregate per (Year, Team) first, then collapse to one row per
            # Year; the resulting columns are a MultiIndex keyed by
            # ('Medal_Value', 'sum') and ('Medal_Value', 'count').
            yearly_perf = country_data.groupby(['Year', 'Team'], observed=True).agg({
                'Medal_Value': ['sum', 'count']
            }).groupby('Year').agg({
                ('Medal_Value', 'sum'): 'sum',
                ('Medal_Value', 'count'): 'sum'
            }).reset_index()

            # Give up when no single year reaches the minimum medal value
            if yearly_perf.empty or yearly_perf[('Medal_Value', 'sum')].max() < min_medals:
                return impacts

            # Outlier handling: clip yearly totals to [0, Q3 + 1.5 * IQR]
            medal_values = yearly_perf[('Medal_Value', 'sum')].values
            q1, q3 = np.percentile(medal_values, [25, 75])
            iqr = q3 - q1
            outlier_threshold = q3 + 1.5 * iqr
            medal_values = np.clip(medal_values, 0, outlier_threshold)
            yearly_perf[('Medal_Value', 'sum')] = medal_values

            # Sliding-window analysis
            for i in range(len(yearly_perf) - window_size + 1):
                try:
                    window = yearly_perf.iloc[i:i + window_size]
                    p1, p2 = window.iloc[:window_size // 2], window.iloc[window_size // 2:]

                    # Guarded numeric computations: the floor on the
                    # denominator avoids a division by zero
                    medal_diff = float(p2[('Medal_Value', 'sum')].mean() - p1[('Medal_Value', 'sum')].mean())
                    participation_ratio = max(1e-10, p1[('Medal_Value', 'count')].mean())
                    # NOTE(review): participation_change is computed but not used below
                    participation_change = float(p2[('Medal_Value', 'count')].mean() / participation_ratio - 1)

                    period = (int(window['Year'].iloc[0]), int(window['Year'].iloc[-1]))

                    # Skip windows that start or end too close to an accepted one
                    if abs(medal_diff) >= min_medals and not any(
                            abs(period[0] - p[0]) < min_gap or abs(period[1] - p[1]) < min_gap
                            for p in [(imp.period[0], imp.period[1]) for imp in impacts]
                    ):
                        # Statistical significance test
                        p_value = self._bootstrap_significance_test(
                            p1[('Medal_Value', 'sum')].values,
                            p2[('Medal_Value', 'sum')].values,
                            n_bootstrap=3000
                        )

                        # Consistency assessment over the whole window
                        consistency = self._evaluate_performance_consistency(window)

                        if p_value < 0.05 and consistency > 0.4:
                            impacts.append(CoachImpact(
                                sport=sport,
                                country=normalized_country,
                                period=period,
                                medal_change=medal_diff,
                                significance=float(1 - p_value),
                                consistency=consistency
                            ))

                except Exception as e:
                    # A failing window is handed to a diagnostic helper that is
                    # defined outside this view, then skipped.
                    self._detect_numerical_issues(f"Window {i}", window[('Medal_Value', 'sum')].values)
                    continue

            return sorted(impacts, key=lambda x: abs(x.medal_change), reverse=True)

        except Exception as e:
            print(f"教练效应检测错误: {str(e)}")
            return impacts
    def _calculate_stability_score(self, period1: pd.DataFrame, period2: pd.DataFrame) -> float:
        """计算表现稳定性得分"""
        std1 = period1['sum'].std()
        std2 = period2['sum'].std()
        mean1 = period1['sum'].mean()
        mean2 = period2['sum'].mean()
        
        cv1 = std1 / (mean1 + 1)  # 变异系数
        cv2 = std2 / (mean2 + 1)
        
        stability_improvement = (1 / (1 + cv2)) - (1 / (1 + cv1))
        return np.clip(stability_improvement, -1, 1)
    def _bootstrap_significance_test(self, period1: np.ndarray, period2: np.ndarray, n_bootstrap: int = 3000) -> float:
        """使用Bootstrap方法进行显著性检验,增加数值稳定性"""
        if len(period1) < 2 or len(period2) < 2:
            return 1.0
            
        try:
            # 数据预处理
            period1 = np.clip(period1, 0, np.percentile(period1, 99))
            period2 = np.clip(period2, 0, np.percentile(period2, 99))
            
            observed_diff = np.mean(period2) - np.mean(period1)
            combined = np.concatenate([period1, period2])
            n1, n2 = len(period1), len(period2)
            
            # Bootstrap重采样
            bootstrap_diffs = []
            for _ in range(n_bootstrap):
                resampled = np.random.choice(combined, size=len(combined), replace=True)
                diff = np.mean(resampled[n1:]) - np.mean(resampled[:n1])
                bootstrap_diffs.append(diff)
            
            # 计算p值
            p_value = np.mean(np.abs(bootstrap_diffs) >= np.abs(observed_diff))
            return float(np.clip(p_value, 1e-8, 1.0))
        except:
            return 1.0

    def _evaluate_performance_consistency(self, data: pd.DataFrame) -> float:
        """增强版表现一致性评估"""
        try:
            # 数据提取和验证
            medals = data[('Medal_Value', 'sum')].values.astype(np.float64)
            if len(medals) < 3:
                return 0.0

            # 异常值处理
            q1, q3 = np.percentile(medals, [25, 75])
            iqr = q3 - q1
            medals_cleaned = np.clip(medals, q1 - 1.5 * iqr, q3 + 1.5 * iqr)

            # 基础统计计算
            medals_mean = np.mean(medals_cleaned)
            medals_std = np.std(medals_cleaned)

            # 数值稳定性检查
            if medals_std < 1e-10 or np.isclose(medals_mean, 0, atol=1e-10):
                return 0.0

            # 标准化处理
            medals_scaled = (medals_cleaned - medals_mean) / (medals_std + 1e-10)
            years = np.arange(len(medals_scaled), dtype=np.float64)

            try:
                # 主要方法：SVD分解
                X = np.vstack([years, np.ones_like(years)]).T
                U, s, Vh = np.linalg.svd(X, full_matrices=False)

                # 条件数检查
                if np.max(s) / np.min(s) > 1e10:  # 病态矩阵检查
                    raise np.linalg.LinAlgError("Matrix is ill-conditioned")

                coef = Vh.T @ np.diag(1 / s) @ U.T @ medals_scaled
                trend = coef[0] * years + coef[1]

            except np.linalg.LinAlgError:
                # 备选方法：稳健回归
                slope, intercept = stats.theilslopes(medals_scaled, years)[:2]
                trend = slope * years + intercept

            # 残差分析
            residuals = medals_scaled - trend
            residuals_std = np.std(residuals)

            # 一致性得分计算
            base_consistency = 1.0 / (1.0 + residuals_std)

            # 趋势权重
            trend_weight = np.abs(np.corrcoef(years, medals_scaled)[0, 1])

            # 最终一致性得分
            final_consistency = 0.7 * base_consistency + 0.3 * trend_weight

            return float(np.clip(final_consistency, 0, 1))

        except Exception as e:
            print(f"一致性评估错误: {str(e)}")
            return 0.0

    def _find_advanced_benchmark(self, data: pd.DataFrame, sport: str, target_country: str) -> Dict:
        """Improved benchmark country selection with dynamic window and decay"""

        def calculate_sport_strength(country_data: pd.DataFrame, window_years: int = 8) -> float:
            """Calculate sport strength with temporal weighting"""
            if len(country_data) < 3:
                return 0.0

            recent_data = country_data.sort_values('Year', ascending=False).head(window_years)
            if recent_data.empty:
                return 0.0

            # Exponential decay weights
            years = recent_data['Year'].values
            max_year = years.max()
            weights = np.exp(-0.2 * (max_year - years))

            # Weighted metrics
            weighted_medals = np.average(recent_data['Medal_Value'].values, weights=weights)
            weighted_trend = np.polyfit(range(len(recent_data)), recent_data['Medal_Value'].values, 1, w=weights)[0]

            return 0.7 * weighted_medals + 0.3 * weighted_trend

        def calculate_historical_fit(country_data: pd.DataFrame) -> float:
            """Calculate historical fit score with peak performance consideration"""
            if len(country_data) < 3:
                return 0.0

            years = country_data['Year'].values
            medals = country_data['Medal_Value'].values

            # Historical peak
            peak_medals = np.percentile(medals, 95)

            # Recent performance (last 3 cycles)
            recent_mask = years >= (years.max() - 12)
            recent_medals = medals[recent_mask] if any(recent_mask) else medals

            if len(recent_medals) == 0:
                return 0.0

            recent_avg = np.mean(recent_medals)

            # Consistency factor
            consistency = 1 / (1 + np.std(recent_medals))

            # Combined score
            historical_fit = (0.4 * (recent_avg / peak_medals) +
                              0.4 * consistency +
                              0.2 * (len(country_data) / 20))  # Experience factor

            return float(np.clip(historical_fit, 0, 1))

        try:
            # Dynamic analysis period based on sport
            sport_cycles = {
                'Swimming': 8,
                'Athletics': 12,
                'Gymnastics': 16,
                'Volleyball': 8
            }
            analysis_period = sport_cycles.get(sport, 12)

            # Country code normalization
            country_mapping = {
                'URS': 'RUS', 'GDR': 'GER', 'FRG': 'GER',
                'TCH': 'CZE', 'YUG': 'SRB'
            }
            normalized_country = country_mapping.get(target_country, target_country)

            # Get target country data
            target_data = data[
                (data['Sport'] == sport) &
                (data['Year'] >= data['Year'].max() - analysis_period) &
                (data['NOC'] == normalized_country)
                ]

            if target_data.empty:
                return None

            # Calculate target metrics
            target_strength = calculate_sport_strength(target_data)
            target_fit = calculate_historical_fit(target_data)

            # Calculate current performance with decay
            recent_performance = target_data.sort_values('Year', ascending=False)['Medal_Value'].head(3).mean()
            if recent_performance == 0:
                # Find last medal year and apply decay
                last_medal = data[
                    (data['Sport'] == sport) &
                    (data['NOC'] == normalized_country) &
                    (data['Medal_Value'] > 0)
                    ].sort_values('Year', ascending=False)

                if not last_medal.empty:
                    years_since = data['Year'].max() - last_medal['Year'].iloc[0]
                    decay_factor = 0.8 ** (years_since / 4)  # 20% decay per cycle
                    recent_performance = last_medal['Medal_Value'].iloc[0] * decay_factor

            # Find benchmark countries
            other_countries = []
            for country in data[data['NOC'] != normalized_country]['NOC'].unique():
                if country in country_mapping.keys():  # Skip historical countries
                    continue

                country_data = data[
                    (data['Sport'] == sport) &
                    (data['Year'] >= data['Year'].max() - analysis_period) &
                    (data['NOC'] == country)
                    ]

                if len(country_data) >= 3:
                    strength = calculate_sport_strength(country_data)
                    hist_fit = calculate_historical_fit(country_data)

                    if strength > target_strength:
                        performance = country_data.sort_values('Year', ascending=False)['Medal_Value'].head(3).mean()

                        # Monte Carlo simulation for uncertainty
                        volatility = {'Swimming': 0.15, 'Gymnastics': 0.25}.get(sport, 0.2)
                        simulations = np.random.normal(performance, performance * volatility, 1000)
                        ci = np.percentile(simulations, [5, 95])

                        other_countries.append({
                            'NOC': country,
                            'strength': strength,
                            'historical_fit': hist_fit,
                            'performance': performance,
                            'uncertainty': (ci[0], ci[1]),
                            'score': 0.4 * strength + 0.3 * hist_fit + 0.3 * (performance / (target_strength + 1e-6))
                        })

            if not other_countries:
                return None

            # Select best benchmark
            best_benchmark = max(other_countries, key=lambda x: x['score'])

            # Calculate normalized improvement potential
            sport_max = data.groupby('Sport')['Medal_Value'].max()[sport]
            improvement_potential = (best_benchmark['performance'] - recent_performance) / (sport_max + 1e-6)

            return {
                'sport': sport,
                'benchmark_country': str(best_benchmark['NOC']),
                'current_performance': float(recent_performance),
                'improvement_potential': float(np.clip(improvement_potential, 0, 1)),
                'historical_fit': float(target_fit),
                'estimated_medal_gain': {
                    'mean': float(best_benchmark['performance'] - recent_performance),
                    'range': f"{best_benchmark['uncertainty'][0]:.1f}-{best_benchmark['uncertainty'][1]:.1f}"
                },
                'benchmark_metrics': {
                    'avg_performance': float(best_benchmark['performance']),
                    'stability': float(best_benchmark['historical_fit']),
                    'experience_years': int(len(target_data))
                }
            }

        except Exception as e:
            print(f"Benchmark analysis error: {str(e)}")
            return None
    def _calculate_country_metrics(self, data: pd.DataFrame) -> Dict:
        """优化后的国家表现指标计算"""
        if data.empty:
            return {
                'avg_performance': 0.0,
                'recent_performance': 0.0,
                'stability': 0.0,
                'trend_score': 0.0,
                'consistency': 0.0,
                'experience': 0
            }
        
        try:
            years = data['Year'].values
            medals = data['Medal_Value'].values
            
            # 基础统计
            recent_perf = medals[-4:].mean() if len(medals) >= 4 else medals.mean()
            avg_perf = medals.mean()
            stability = 1 / (1 + np.std(medals))
            
            # 趋势分析，增加错误处理
            try:
                if len(years) > 2:
                    X = (years - years.min()).reshape(-1, 1)
                    y = medals.reshape(-1, 1)
                    slope = np.polyfit(X.ravel(), y.ravel(), 1, rcond=1e-10)[0]
                    trend_score = np.clip(slope / (avg_perf + 1), -1, 1)
                else:
                    trend_score = 0.0
            except:
                trend_score = 0.0
            
            # 一致性评估
            consistency = 1 - np.std(medals) / (avg_perf + 1)
            
            return {
                'avg_performance': avg_perf,
                'recent_performance': recent_perf,
                'stability': stability,
                'trend_score': float(trend_score),
                'consistency': consistency,
                'experience': len(years)
            }
        except Exception as e:
            return {
                'avg_performance': 0.0,
                'recent_performance': 0.0,
                'stability': 0.0,
                'trend_score': 0.0,
                'consistency': 0.0,
                'experience': 0
            }

    def analyze_great_coach_effect(self,
                                 athletes_df: pd.DataFrame,
                                 medals_df: pd.DataFrame) -> Dict:
        """Collect detected coach-effect periods for the focus sports.

        For each of Gymnastics, Swimming, Athletics and Volleyball, the ten
        NOCs with the most rows in ``athletes_df`` are passed to
        ``self.detect_coach_effect_periods``; countries for which that call
        returns something truthy are kept.

        Args:
            athletes_df: Frame with ``Sport`` and ``NOC`` columns.
            medals_df: Not used by this method.

        Returns:
            ``{sport: {country: impacts}}`` with one entry per focus sport
            (possibly an empty dict).
        """
        focus_sports = ['Gymnastics', 'Swimming', 'Athletics', 'Volleyball']
        results = {}

        for sport in focus_sports:
            # NOCs ranked by how many rows they have in this sport.
            noc_counts = athletes_df.loc[athletes_df['Sport'] == sport, 'NOC'].value_counts()

            per_country = {}
            for country in noc_counts.head(10).index:
                impacts = self.detect_coach_effect_periods(athletes_df, country, sport)
                if not impacts:
                    continue
                per_country[country] = impacts

            results[sport] = per_country

        return results

    def recommend_coach_investments(self, athletes_df: pd.DataFrame, medals_df: pd.DataFrame,
                                    countries: List[str]) -> Dict:
        """Rank, per country, the sports where coaching investment looks promising.

        For each requested country and each of Swimming, Athletics, Gymnastics
        and Volleyball, the country's history is compared with a benchmark: the
        other NOC (legacy codes URS/GDR/FRG/TCH/YUG excluded) with the largest
        total ``Medal_Value`` in that sport. Sports whose improvement potential
        reaches 0.1 are kept, sorted by potential (highest first).

        Progress and diagnostics are printed while running.

        Args:
            athletes_df: Frame with ``Sport``, ``NOC``, ``Year`` and
                ``Medal_Value`` columns.
            medals_df: Not used by this method.
            countries: Country identifiers, matched case-insensitively as
                substrings of ``NOC``.

        Returns:
            ``{country: [recommendation, ...]}``. Each recommendation holds
            ``sport``, ``benchmark_country``, ``current_performance``,
            ``improvement_potential``, ``historical_fit``,
            ``estimated_medal_gain`` (``mean`` and a +/-15% ``range`` string;
            never negative) and ``benchmark_metrics``. A sport that raises is
            reported and skipped; ``{}`` is returned if the whole run fails.
        """

        def analyze_historical_data(data: pd.DataFrame, country: str, sport: str) -> Dict:
            """Compute per-year medal totals and derived indicators for one country/sport."""
            print(f"\n分析 {country} 在 {sport} 项目的历史数据:")

            # Show what the data set actually contains.
            print(f"数据集中unique的Sport值: {data['Sport'].unique()}")
            print(f"数据集中unique的NOC值: {data['NOC'].unique()}")

            # Loose matching: case-insensitive substring search.
            sport_mask = data['Sport'].str.contains(sport, case=False, na=False)
            country_mask = data['NOC'].str.contains(country, case=False, na=False)

            country_data = data[sport_mask & country_mask].copy()

            print(f"找到 {len(country_data)} 条原始数据记录")
            if len(country_data) == 0:
                print(f"尝试更广泛的搜索...")
                # Look for sport-name variants sharing the first four characters.
                possible_sports = data[data['Sport'].str.contains(sport[:4], case=False, na=False)]['Sport'].unique()
                print(f"可能的运动项目: {possible_sports}")

                # Look for NOC variants sharing the first two characters.
                possible_countries = data[data['NOC'].str.contains(country[:2], case=False, na=False)]['NOC'].unique()
                print(f"可能的国家代码: {possible_countries}")

                # Retry with the widened candidate sets.
                sport_mask = data['Sport'].isin(possible_sports)
                country_mask = data['NOC'].isin(possible_countries)
                country_data = data[sport_mask & country_mask].copy()
                print(f"宽松匹配后找到 {len(country_data)} 条记录")

            if len(country_data) == 0:
                print(f"警告: {country} 在 {sport} 项目中没有数据")
                return create_empty_metrics()

            # Sanity output for the matched rows.
            print("\n数据验证:")
            print(f"年份范围: {country_data['Year'].min()} - {country_data['Year'].max()}")
            print(f"Medal_Value统计: \n{country_data['Medal_Value'].describe()}")

            # Medal total per year.
            yearly_medals = country_data.groupby('Year')['Medal_Value'].sum()
            print(f"\n年度奖牌统计:\n{yearly_medals}")

            # Cap unusually high years at the upper IQR fence.
            q1, q3 = yearly_medals.quantile([0.25, 0.75])
            iqr = q3 - q1
            upper_bound = q3 + 1.5 * iqr
            yearly_medals = yearly_medals.clip(upper=upper_bound)

            # Recent level: years within 8 of the latest one.
            recent_years = yearly_medals.index >= yearly_medals.index.max() - 8
            recent_performance = yearly_medals[recent_years].mean() if any(recent_years) else 0
            print(f"近期平均表现: {recent_performance}")

            # Trend: slope over the positional index of the yearly totals.
            if len(yearly_medals) >= 2:
                years = np.array(range(len(yearly_medals)))
                trend = np.polyfit(years, yearly_medals.values, 1)[0]
            else:
                trend = 0
            print(f"趋势系数: {trend}")

            # Inputs for the historical-fit indicator.
            total_medals = yearly_medals.sum()
            n_years = len(yearly_medals)
            span_years = yearly_medals.index.max() - yearly_medals.index.min() + 4
            participation_rate = n_years * 4 / span_years if span_years > 0 else 0

            # Steadiness of the yearly totals.
            consistency = 1 / (1 + yearly_medals.std()) if len(yearly_medals) > 1 else 0

            print(f"""
        详细统计:
        - 总奖牌数: {total_medals}
        - 参赛年数: {n_years}
        - 参与率: {participation_rate:.2f}
        - 稳定性: {consistency:.2f}
        """)

            # Combined historical fit.
            historical_fit = (
                    0.4 * (total_medals / (n_years + 1)) / 10 +  # scaled average medals
                    0.3 * participation_rate +
                    0.3 * consistency
            )

            metrics = {
                'peak': float(yearly_medals.max()),
                'recent': float(recent_performance),
                'trend': float(trend),
                'years': int(n_years),
                'historical_fit': float(historical_fit),
                'consistency': float(consistency),
                'total_medals': int(total_medals)
            }

            print(f"最终指标:\n{metrics}")
            return metrics

        def create_empty_metrics():
            """Return the all-zero metrics used when no data matches."""
            return {
                'peak': 0,
                'recent': 0,
                'trend': 0,
                'years': 0,
                'historical_fit': 0,
                'consistency': 0,
                'total_medals': 0
            }

        def evaluate_improvement_potential(target: Dict, benchmark: Dict) -> float:
            """Score in [0, 1] for how much the target could gain on the benchmark."""
            print(f"\n评估改进潜力:")
            print(f"目标指标: {target}")
            print(f"标杆指标: {benchmark}")

            # Little or no history: use a fixed baseline potential.
            if target['total_medals'] < 3:
                base_potential = 0.3
                print(f"新项目或历史较弱，基础潜力: {base_potential}")
                return base_potential

            # Relative gap in recent level.
            relative_gap = (benchmark['recent'] - target['recent']) / (benchmark['recent'] + 1)
            print(f"相对差距: {relative_gap}")

            # How much steeper the benchmark's trend is.
            trend_factor = np.tanh(max(0, benchmark['trend'] - target['trend']))
            print(f"趋势因子: {trend_factor}")

            # Ratio of historical fits.
            historical_factor = benchmark['historical_fit'] / (target['historical_fit'] + 0.1)
            print(f"历史因子: {historical_factor}")

            potential = (
                    0.5 * relative_gap +
                    0.3 * trend_factor +
                    0.2 * min(historical_factor, 2)  # cap the historical factor
            )

            final_potential = float(np.clip(potential, 0, 1))
            print(f"最终潜力评分: {final_potential}")
            return final_potential

        try:
            recommendations = {}
            print(f"\n开始生成教练投资建议...")

            sports = ['Swimming', 'Athletics', 'Gymnastics', 'Volleyball']

            # Upper limit on the medal gain reported per sport.
            max_medals = {
                'Swimming': 15,
                'Athletics': 12,
                'Gymnastics': 10,
                'Volleyball': 8
            }

            for country in countries:
                print(f"\n分析 {country} 的投资机会:")
                country_recommendations = []

                for sport in sports:
                    print(f"\n评估 {sport} 项目:")
                    try:
                        target_metrics = analyze_historical_data(athletes_df, country, sport)

                        # Benchmark candidates: other NOCs, legacy codes excluded.
                        benchmark_data = athletes_df[
                            (athletes_df['Sport'] == sport) &
                            (athletes_df['NOC'] != country) &
                            (~athletes_df['NOC'].isin(['URS', 'GDR', 'FRG', 'TCH', 'YUG']))
                            ]

                        # Largest medal total wins; raises IndexError (handled
                        # below) when there is no candidate at all.
                        benchmark_country = benchmark_data.groupby('NOC')['Medal_Value'].sum().nlargest(1).index[0]
                        print(f"选择的标杆国家: {benchmark_country}")

                        benchmark_metrics = analyze_historical_data(athletes_df, benchmark_country, sport)

                        potential = evaluate_improvement_potential(target_metrics, benchmark_metrics)

                        # Gain is the recent-level gap, capped per sport and
                        # floored at zero: a target that already outperforms
                        # the benchmark has no gain to report (a negative value
                        # would also produce an inverted range string).
                        medal_gain = max(
                            0.0,
                            min(
                                max_medals[sport],
                                benchmark_metrics['recent'] - target_metrics['recent']
                            )
                        )

                        if potential >= 0.1:  # deliberately low threshold
                            country_recommendations.append({
                                'sport': sport,
                                'benchmark_country': benchmark_country,
                                'current_performance': float(target_metrics['recent']),
                                'improvement_potential': float(potential),
                                'historical_fit': float(target_metrics['historical_fit']),
                                'estimated_medal_gain': {
                                    'mean': float(medal_gain),
                                    'range': f"{(medal_gain * 0.85):.1f}-{(medal_gain * 1.15):.1f}"
                                },
                                'benchmark_metrics': {
                                    'avg_performance': float(benchmark_metrics['recent']),
                                    'trend': float(benchmark_metrics['trend']),
                                    'experience_years': int(benchmark_metrics['years'])
                                }
                            })

                    except Exception as e:
                        print(f"处理 {sport} 时出错: {str(e)}")
                        continue

                recommendations[country] = sorted(
                    country_recommendations,
                    key=lambda x: x['improvement_potential'],
                    reverse=True
                )

            return recommendations

        except Exception as e:
            print(f"推荐系统错误: {str(e)}")
            return {}
    def _find_benchmark_country(self, 
                              data: pd.DataFrame, 
                              sport: str, 
                              exclude_country: str) -> str:
        """找到某个运动项目的标杆国家"""
        performance = data[
            (data['Sport'] == sport) & 
            (data['NOC'] != exclude_country)
        ].groupby('NOC')['Medal_Value'].mean()
        
        return performance.nlargest(1).index[0] if not performance.empty else None

    def _calculate_improvement_potential(self, 
                                      data: pd.DataFrame, 
                                      sport: str, 
                                      country: str, 
                                      benchmark_country: str) -> float:
        """计算潜在提升空间"""
        country_perf = data[
            (data['Sport'] == sport) & 
            (data['NOC'] == country)
        ]['Medal_Value'].mean()
        
        benchmark_perf = data[
            (data['Sport'] == sport) & 
            (data['NOC'] == benchmark_country)
        ]['Medal_Value'].mean()
        
        return max(0, benchmark_perf - country_perf)

    def generate_report(self, coach_effects: Dict, recommendations: Dict) -> str:
        """Render the coach-effect findings and investment advice as plain text.

        Args:
            coach_effects: ``{sport: {country: [impact, ...]}}``; each impact
                exposes ``period`` (indexable, two entries), ``medal_change``,
                ``significance`` and ``consistency``. Sports without entries
                are omitted from the report.
            recommendations: ``{country: [recommendation dict, ...]}``; a
                country with an empty list gets a "no recommendation" line.

        Returns:
            The report as a single newline-joined string.
        """
        divider = "-" * 50

        # Section 1: coach-effect analysis.
        lines = ["1. '伟大教练'效应分析", divider, ""]

        for sport, countries_data in coach_effects.items():
            if not countries_data:
                continue  # nothing detected for this sport
            lines.append(f"\n{sport}:")
            lines.extend(
                f"  - {country} ({impact.period[0]}-{impact.period[1]}): "
                f"奖牌变化 {impact.medal_change:.1f}, "
                f"显著性 {impact.significance:.2f}, "
                f"一致性 {impact.consistency:.2f}"
                for country, impacts in countries_data.items()
                for impact in impacts
            )

        # Section 2: investment recommendations.
        lines += ["\n\n2. 教练投资建议", divider, ""]

        for country, recommendations_list in recommendations.items():
            lines.append(f"\n{country}的优先投资项目:")
            if not recommendations_list:
                lines.append("  暂无优先投资建议")
                continue

            for rec in recommendations_list:
                lines += [
                    f"  - {rec['sport']}:",
                    f"    * 标杆国家: {rec['benchmark_country']}",
                    f"    * 当前水平: {rec['current_performance']:.2f}",
                    f"    * 提升潜力: {rec['improvement_potential']:.2f}",
                    f"    * 历史适配度: {rec['historical_fit']:.2f}",
                    f"    * 预期奖牌增长: {rec['estimated_medal_gain']['mean']:.2f} ({rec['estimated_medal_gain']['range']})",
                    f"    * 标杆国绩效:",
                    f"      - 平均表现: {rec['benchmark_metrics']['avg_performance']:.2f}",
                    f"      - 发展趋势: {rec['benchmark_metrics']['trend']:.2f}",
                    f"      - 发展年限: {rec['benchmark_metrics']['experience_years']}",
                ]

        return "\n".join(lines)

    def _detect_numerical_issues(self, label: str, matrix: np.ndarray):
        """数值问题诊断"""
        issues = []
        if np.any(np.isnan(matrix)):
            issues.append(f"{label} 包含NaN")
        if np.any(np.isinf(matrix)):
            issues.append(f"{label} 包含Inf")
        if np.any(np.abs(matrix) < 1e-10):
            issues.append(f"{label} 包含接近零值")
        if len(issues) > 0:
            print(f"数值问题 - {', '.join(issues)}")
            print(f"矩阵统计: 形状{matrix.shape}, 最小值{np.min(matrix)}, 最大值{np.max(matrix)}")
def main():
    """Run the coach-impact analysis end to end and write the report to disk.

    Loads the data, analyses the coach effect, builds recommendations for a
    fixed list of example countries, then saves the report under
    ``analysis_results/`` and prints it. Any exception is printed with its
    traceback and re-raised.
    """
    console = Console()

    try:
        analyzer = CoachImpactAnalyzer()

        # Load the input data.
        console.print("[bold cyan]加载数据...[/bold cyan]")
        athletes_df, medals_df = analyzer.load_and_prepare_data()

        # Coach-effect analysis.
        console.print("[bold cyan]分析'伟大教练'效应...[/bold cyan]")
        coach_effects = analyzer.analyze_great_coach_effect(athletes_df, medals_df)

        # Recommendations for the example countries.
        target_countries = ['France', 'Germany', 'Italy']
        console.print("[bold cyan]生成教练投资建议...[/bold cyan]")
        recommendations = analyzer.recommend_coach_investments(
            athletes_df, medals_df, target_countries
        )

        report = analyzer.generate_report(coach_effects, recommendations)

        # Persist the report next to the other analysis results.
        output_dir = Path("analysis_results")
        output_dir.mkdir(exist_ok=True)
        report_path = output_dir / "coach_impact_analysis_report.txt"
        report_path.write_text(report, encoding='utf-8')

        # Echo the report to the console.
        console.print("\n[bold green]分析报告:[/bold green]")
        console.print(report)

    except Exception as e:
        console.print(f"[bold red]错误: {str(e)}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        raise e

if __name__ == "__main__":
    main()

使用教程：https://www.yuque.com/appoint-igkhv/ex5pwc/wy14bsvlfza64iqc

遇到任何使用问题都可以联系我们：24小时人工客服在线，所有信息我们都会回复，请不要随意退款。没有问题的话，请给我们打个五星好评，对客服的评价也请选择“满意”，下次续费有优惠。由于本商品为虚拟物品，如果对商品不满意或者问题无法解决，申请退款时售后类型请不要选择“退货退款”，选择第一项“退款”（不想要了）即可。

24ecd4d6-ec8d-47fe-884f-33d0e3bacb52

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from rich.console import Console
from rich.table import Table
from sklearn import clone
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from typing import Dict, List, Tuple
import joblib
from scipy import stats
from scipy.optimize import minimize
from rich.progress import Progress
class OlympicMedalPredictor:
    def __init__(self):
        """Create the console, the four base regressors and the empty result stores."""
        self.console = Console()

        # Hyperparameters shared by the three gradient-boosting models.
        boosting_kwargs = {
            'n_estimators': 200,
            'learning_rate': 0.05,
            'max_depth': 4,
            'random_state': 42,
        }

        gbm_model = GradientBoostingRegressor(**boosting_kwargs)
        rf_model = RandomForestRegressor(
            n_estimators=200,
            max_depth=6,
            random_state=42,
        )
        xgb_model = xgb.XGBRegressor(**boosting_kwargs)
        lgb_model = lgb.LGBMRegressor(
            min_child_samples=20,
            min_child_weight=1e-3,
            reg_lambda=1.0,
            reg_alpha=0.0,
            **boosting_kwargs,
        )

        # Keyed by short model name; insertion order is gbm, rf, xgb, lgb.
        self.models = {
            'gbm': gbm_model,
            'rf': rf_model,
            'xgb': xgb_model,
            'lgb': lgb_model,
        }

        # Filled in later by the training / prediction methods.
        self.model_weights = {}
        self.feature_importance = {}
        self.predictions_store = {}
        
    def prepare_data(self, features_df: pd.DataFrame, historical_data: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, pd.Series], List[str]]:
        """强化的数据准备函数，处理无穷值和异常值"""
        from sklearn.impute import SimpleImputer
        import numpy as np
        
        df = features_df.merge(
            historical_data[['NOC', 'Year', 'Gold', 'Total']], 
            on=['NOC', 'Year'], 
            how='left'
        )
        
        # 处理历史趋势特征
        for col in ['Gold', 'Total']:
            # 移动平均，使用min_periods避免NaN
            df[f'{col}_ma_4year'] = df.groupby('NOC')[col].transform(
                lambda x: x.rolling(4, min_periods=1).mean()
            ).fillna(0)
            
            # 安全的增长率计算
            def safe_pct_change(x):
                change = x.pct_change(4)
                # 将inf替换为上限值
                change = change.replace([np.inf, -np.inf], np.nan)
                # 使用90分位数作为极值上限
                upper_bound = np.nanpercentile(change, 90)
                return change.clip(lower=-1, upper=upper_bound)
                
            df[f'{col}_growth'] = df.groupby('NOC')[col].transform(safe_pct_change).fillna(0)
            df[f'historical_max_{col}'] = df.groupby('NOC')[col].transform('max').fillna(0)
        
        # 分层标签
        tier_1_countries = ['United States', 'China', 'Great Britain', 'Japan', 'Australia']
        tier_2_countries = ['France', 'Germany', 'Italy', 'Netherlands', 'South Korea']
        df['country_tier'] = (
            df['NOC'].apply(
                lambda x: 1 if x in tier_1_countries else (2 if x in tier_2_countries else 3)
            )
            .astype(int)  # 强制转换为整数类型
        )
        
        # 保存NOC列
        noc_series = df['NOC']
        
        # 分离特征和目标变量
        target_cols = ['Gold', 'Total']
        exclude_cols = ['NOC'] + target_cols
        feature_cols = [col for col in df.columns if col not in exclude_cols]
        
        # 提取特征矩阵
        X = df[feature_cols].copy()
        
        # 处理数值型特征
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns
        if len(num_cols) > 0:
            # 先处理无穷值
            X[num_cols] = X[num_cols].replace([np.inf, -np.inf], np.nan)
            # 使用中位数填充缺失值
            num_imputer = SimpleImputer(strategy='median')
            X[num_cols] = num_imputer.fit_transform(X[num_cols])
        
        # 类别型特征处理
        cat_cols = X.select_dtypes(include=['object', 'category']).columns
        if len(cat_cols) > 0:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
        
        # 最终数据类型转换和清理
        X = X.astype(float)
        
        # 确保没有异常值
        for col in X.columns:
            upper_bound = np.percentile(X[col], 99)
            lower_bound = np.percentile(X[col], 1)
            X[col] = X[col].clip(lower_bound, upper_bound)
        
        y = {
            'Gold': df['Gold'].fillna(0).astype(float),
            'Total': df['Total'].fillna(0).astype(float)
        }
        
        return X, y, feature_cols
    


    def _optimize_ensemble_weights(self, 
                                predictions: Dict[str, np.ndarray], 
                                target: pd.Series,
                                base_scores: Dict[str, float],
                                constraints: Dict) -> Dict[str, float]:
        """Optimise the ensemble weights with SLSQP, penalising out-of-range totals.

        The objective is ``-R^2`` of the weighted prediction plus 0.1 times the
        squared distance of the summed prediction from
        ``constraints['total_range']``.

        Args:
            predictions: Per-model prediction arrays, all of the same length.
            target: True values aligned with the prediction arrays.
            base_scores: Per-model scores used as the starting weights.
            constraints: Must contain ``'total_range'`` as ``(low, high)``.

        Returns:
            Mapping from model name to a non-negative weight; the weights sum
            to 1. An empty ``predictions`` mapping yields ``{}``.
        """
        model_names = list(predictions.keys())
        n_models = len(model_names)
        if n_models == 0:
            return {}

        uniform = np.full(n_models, 1.0 / n_models)
        # One column per model so the ensemble is a single matrix product.
        pred_matrix = np.column_stack(
            [np.asarray(predictions[name], dtype=float) for name in model_names]
        )
        total_low, total_high = constraints['total_range']

        def normalise(raw):
            """Clip to non-negative and scale to sum 1; fall back to uniform weights."""
            clipped = np.clip(np.asarray(raw, dtype=float), 0.0, None)
            total = clipped.sum()
            if not np.isfinite(total) or total <= 0:
                return uniform.copy()
            return clipped / total

        def objective(w):
            ensemble_pred = pred_matrix @ normalise(w)

            # NOTE(review): the sum runs over every row in `predictions`;
            # presumably total_range describes a single Games - confirm the
            # caller passes matching data.
            pred_total = ensemble_pred.sum()
            penalty = 0
            if pred_total < total_low:
                penalty += (total_low - pred_total) ** 2
            elif pred_total > total_high:
                penalty += (pred_total - total_high) ** 2

            return -r2_score(target, ensemble_pred) + penalty * 0.1

        # R^2 scores can be negative or all zero; such values are not a valid
        # starting point inside the (0, 1) bounds, so clip and renormalise.
        initial_weights = normalise([base_scores[name] for name in model_names])

        constraints_opt = (
            {'type': 'eq', 'fun': lambda w: np.sum(w) - 1},
        )
        bounds = [(0, 1) for _ in range(n_models)]

        result = minimize(
            objective,
            initial_weights,
            method='SLSQP',
            constraints=constraints_opt,
            bounds=bounds,
            options={'maxiter': 1000}
        )

        # Keep the starting weights if the optimiser returned NaN/inf.
        if np.all(np.isfinite(result.x)):
            optimized_weights = normalise(result.x)
        else:
            optimized_weights = initial_weights
        return dict(zip(model_names, optimized_weights.tolist()))
    
    def _calculate_weights(self, scores: Dict[str, float]) -> Dict[str, float]:
        """计算模型权重"""
        total = sum(scores.values())
        weights = {model: score/total for model, score in scores.items()}
        return weights
    
    def train_models(self, X: pd.DataFrame, y: Dict[str, pd.Series]) -> Dict:
        """Cross-validate, fit and weight every base model for each target.

        For each non-``None`` target and each entry of ``self.models``:
          * score a fresh clone of the model on a 5-split ``TimeSeriesSplit``
            (validation predictions are clipped to the target's min/max),
          * fit another fresh clone on all rows and keep it as the trained model,
          * record its feature importances when the estimator exposes them.
        Afterwards the ensemble weights of the target are optimised and stored
        in ``self.model_weights[target_name]``.

        Every target gets its own estimator instances; fitting one target never
        overwrites the models of another.

        Args:
            X: Feature matrix; rows are expected in chronological order because
                ``TimeSeriesSplit`` splits by position.
            y: Mapping from target name to target series aligned with ``X``.
                Target names must be keys of the constraint table below.

        Returns:
            ``{target_name: {model_name: fitted_estimator}}``.
        """
        trained_models = {}
        scores = {}
        
        historical_constraints = {
            'Gold': {
                'top_countries': {
                    'China': (35, 45), 'United States': (30, 40),
                    'Great Britain': (20, 30), 'ROC': (20, 30), 
                    'Japan': (20, 30)
                },
                'min': 0, 'max': 45, 'total_range': (300, 340)
            },
            'Total': {
                'top_countries': {
                    'China': (80, 100), 'United States': (80, 100),
                    'Great Britain': (50, 70), 'ROC': (50, 70),
                    'Japan': (50, 70)
                },
                'min': 0, 'max': 100, 'total_range': (950, 1100)
            }
        }

        # Hyperparameter overrides applied to the base models before training.
        param_overrides = {
            'xgb': {'learning_rate': 0.05, 'n_estimators': 300},
            'lgb': {'learning_rate': 0.05, 'n_estimators': 300},
            'gbm': {'learning_rate': 0.05, 'n_estimators': 300},
            'rf': {'n_estimators': 300},
        }
        for model_name, base_model in self.models.items():
            base_model.set_params(**param_overrides.get(model_name, {}))

        active_targets = {name: target for name, target in y.items() if target is not None}
        
        with Progress() as progress:
            total_tasks = len(active_targets) * len(self.models)
            train_progress = progress.add_task("[cyan]训练模型...", total=total_tasks)
            
            for target_name, target in active_targets.items():
                limits = historical_constraints[target_name]

                trained_models[target_name] = {}
                scores[target_name] = {}
                self.predictions_store[target_name] = {}
                self.model_weights[target_name] = {}
                
                tscv = TimeSeriesSplit(n_splits=5)
                
                for model_name, base_model in self.models.items():
                    model_scores = []
                    
                    for train_idx, val_idx in tscv.split(X):
                        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]
                        
                        # Fresh, unfitted copy per fold so folds cannot influence each other.
                        fold_model = clone(base_model)
                        if model_name in ('xgb', 'lgb'):
                            fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
                        else:
                            fold_model.fit(X_train, y_train)
                        
                        pred = np.clip(
                            fold_model.predict(X_val),
                            limits['min'],
                            limits['max']
                        )
                        model_scores.append(r2_score(y_val, pred))
                    
                    # The model that is returned is trained on all rows, not just
                    # on the training part of the last fold.
                    final_model = clone(base_model)
                    final_model.fit(X, target)
                    
                    trained_models[target_name][model_name] = final_model
                    scores[target_name][model_name] = np.mean(model_scores)
                    
                    if hasattr(final_model, 'feature_importances_'):
                        self.feature_importance[f"{target_name}_{model_name}"] = pd.Series(
                            final_model.feature_importances_,
                            index=X.columns
                        ).sort_values(ascending=False)
                    
                    progress.update(train_progress, advance=1)
                
                # Weights are tuned on in-sample predictions of the final models.
                weights = self._optimize_ensemble_weights(
                    {name: model.predict(X) for name, model in trained_models[target_name].items()},
                    target,
                    scores[target_name],
                    limits
                )
                self.model_weights[target_name].update(weights)
        
        return trained_models

    def _calculate_optimal_weights(self, predictions: Dict, scores: Dict) -> Dict[str, float]:
        """计算最优权重"""
        base_weights = np.array([scores[model] for model in predictions.keys()])
        weights = base_weights / np.sum(base_weights)
        
        # 验证预测结果的表现
        ensemble_predictions = np.zeros_like(list(predictions.values())[0]['predictions'])
        for i, (model_name, _) in enumerate(predictions.items()):
            ensemble_predictions += weights[i] * predictions[model_name]['predictions']
        
        # 返回归一化的权重
        return dict(zip(predictions.keys(), weights))

    def predict_with_uncertainty(self, X_pred: pd.DataFrame, models: Dict, target_name: str, n_iterations: int = 500) -> Tuple[np.ndarray, np.ndarray]:
        """Monte-Carlo ensemble prediction with tier-based noise and limits.

        In every iteration and for every model, each row of ``X_pred`` is
        multiplied by ``1 + noise`` (noise drawn per row from a normal
        distribution whose standard deviation depends on the row's tier),
        predicted, clipped to the tier's ``[min, max]`` and, for the row labelled
        ``'United States'``, multiplied by 1.05. The per-model results are
        combined with the normalised weights from ``self.model_weights`` and the
        summed prediction is rescaled into the target's ``total_range``.

        Args:
            X_pred: Feature rows to predict. The index labels are compared with
                the country names in the tier table, so the index must hold
                country names; rows with any other label fall into ``tier_4``.
            models: ``{target_name: {model_name: fitted_estimator}}``.
            target_name: ``'Gold'`` or ``'Total'``.
            n_iterations: Number of noisy prediction rounds.

        Returns:
            ``(mean, std)`` of the ensemble prediction over all iterations, one
            value per row of ``X_pred``.
        """
        predictions = []
        
        # Per-tier prediction limits and relative noise levels.
        historical_baselines = {
            'Gold': {
                'tier_1': {
                    'min': 25, 'max': 40,
                    'noise': 0.03,
                    'countries': ['United States', 'China']
                },
                'tier_2': {
                    'min': 15, 'max': 25,
                    'noise': 0.05,
                    'countries': ['Great Britain', 'Japan', 'Australia']
                },
                'tier_3': {
                    'min': 8, 'max': 15,
                    'noise': 0.08,
                    'countries': ['France', 'Germany', 'Italy', 'Netherlands']
                },
                'tier_4': {
                    'min': 0, 'max': 8,
                    'noise': 0.10
                },
                'total_range': (300, 340)
            },
            'Total': {
                'tier_1': {
                    'min': 70, 'max': 110,
                    'noise': 0.03,
                    'countries': ['United States', 'China']
                },
                'tier_2': {
                    'min': 50, 'max': 70,
                    'noise': 0.05,
                    'countries': ['Great Britain', 'Japan', 'Australia']
                },
                'tier_3': {
                    'min': 30, 'max': 50,
                    'noise': 0.08,
                    'countries': ['France', 'Germany', 'Italy', 'Netherlands']
                },
                'tier_4': {
                    'min': 0, 'max': 30,
                    'noise': 0.10
                },
                'total_range': (950, 1100)
            }
        }

        baselines = historical_baselines[target_name]
        total_min, total_max = baselines['total_range']
        n_rows = len(X_pred)

        # Resolve every row's tier once; it does not change between iterations.
        # setdefault keeps the first (highest) tier if a country were listed twice.
        country_to_tier = {}
        for tier_name in ('tier_1', 'tier_2', 'tier_3'):
            for country in baselines[tier_name]['countries']:
                country_to_tier.setdefault(country, tier_name)
        row_tiers = [country_to_tier.get(country, 'tier_4') for country in X_pred.index]

        noise_scale = np.array([baselines[tier]['noise'] for tier in row_tiers], dtype=float)
        lower = np.array([baselines[tier]['min'] for tier in row_tiers], dtype=float)
        upper = np.array([baselines[tier]['max'] for tier in row_tiers], dtype=float)
        # Host-nation bonus, applied after the tier clipping.
        host_boost = np.array(
            [1.05 if country == 'United States' else 1.0 for country in X_pred.index]
        )

        # Normalised ensemble weights; models without a stored weight get 1/n.
        target_models = models[target_name]
        stored_weights = self.model_weights.get(target_name, {})
        weights = np.array(
            [stored_weights.get(name, 1.0 / len(target_models)) for name in target_models],
            dtype=float
        )
        weight_sum = weights.sum()
        if target_models and (not np.isfinite(weight_sum) or weight_sum <= 0):
            weights = np.full(len(target_models), 1.0 / len(target_models))
        elif target_models:
            weights = weights / weight_sum

        with Progress() as progress:
            task = progress.add_task("[cyan]生成预测...", total=n_iterations)
            
            for _ in range(n_iterations):
                ensemble_pred = np.zeros(n_rows)
                
                for weight, model in zip(weights, target_models.values()):
                    # One multiplicative noise factor per row, then a single
                    # batched predict call instead of one call per row.
                    noise = np.random.normal(0.0, noise_scale)
                    X_noisy = X_pred.mul(1.0 + noise, axis=0)
                    
                    pred = np.clip(model.predict(X_noisy), lower, upper) * host_boost
                    ensemble_pred += weight * pred
                
                # Rescale into the allowed overall range; a zero total cannot
                # be rescaled and is left untouched.
                total = ensemble_pred.sum()
                if total > 0:
                    if total < total_min:
                        ensemble_pred *= total_min / total
                    elif total > total_max:
                        ensemble_pred *= total_max / total
                
                predictions.append(ensemble_pred)
                progress.update(task, advance=1)
        
        predictions = np.array(predictions)
        mean_pred = np.mean(predictions, axis=0)
        std_pred = np.std(predictions, axis=0)
        
        return mean_pred, std_pred
    def identify_first_time_medals(self, 
                                 predictions: np.ndarray, 
                                 historical_data: pd.DataFrame,
                                 countries: List[str]) -> List[str]:
        """识别可能首次获得奖牌的国家"""
        # 获取历史上从未获得奖牌的国家
        historical_medals = historical_data.groupby('NOC')['Total'].sum()
        never_medaled = set(countries) - set(historical_medals[historical_medals > 0].index)
        
        # 预测值大于阈值的国家可能首次获得奖牌
        threshold = 0.5  # 可调整的阈值
        first_time_medals = []
        
        for country, pred in zip(countries, predictions):
            if country in never_medaled and pred > threshold:
                first_time_medals.append(country)
        
        return first_time_medals
    
    def predict_2028_olympics(self, features_df: pd.DataFrame, historical_data: pd.DataFrame) -> None:
        """Run the full pipeline: prepare data, train, predict 2028, display and save.

        Args:
            features_df: Feature table with ``NOC`` and ``Year`` columns.
            historical_data: Medal history with ``NOC``, ``Year``, ``Gold``, ``Total``.

        Raises:
            Exception: Any error from the pipeline is printed to the console
                and re-raised unchanged.
        """
        try:
            # Training data
            X, y, feature_cols = self.prepare_data(features_df, historical_data)
            
            self.console.print("\n[bold cyan]训练模型中...[/bold cyan]")
            trained_models = self.train_models(X, y)
            
            # 2028 prediction rows
            X_2028 = self._prepare_2028_features(features_df, historical_data)

            # One country label per prediction row, so labels and predictions
            # stay aligned even if a country occurs more than once.
            countries = X_2028['NOC'].to_numpy()
            
            # predict_with_uncertainty looks countries up by index label, so the
            # frame is indexed by NOC. The columns are cleaned like the training
            # matrix: float dtype, inf -> NaN, gaps filled with training medians.
            X_2028_features = (
                X_2028.set_index('NOC')[feature_cols]
                .astype(float)
                .replace([np.inf, -np.inf], np.nan)
                .fillna(X.median())
            )
            
            self.console.print("\n[bold cyan]生成2028年预测...[/bold cyan]")
            results = {}
            for target in ['Gold', 'Total']:
                if target in trained_models:
                    mean_pred, std_pred = self.predict_with_uncertainty(
                        X_2028_features,
                        trained_models,
                        target
                    )
                    results[target] = {
                        'predictions': mean_pred,
                        'uncertainty': std_pred
                    }
            
            # Countries predicted to win their first medal
            first_time_medalists = self.identify_first_time_medals(
                results['Total']['predictions'],
                historical_data,
                countries
            )
            
            self._display_predictions(results, countries, first_time_medalists)
            
            self._save_results(trained_models, results, countries)
            
        except Exception as e:
            self.console.print(f"[bold red]预测过程中出现错误: {str(e)}[/bold red]")
            # Bare raise keeps the original traceback intact.
            raise
        
    def _prepare_2028_features(self, features_df: pd.DataFrame, historical_data: pd.DataFrame) -> pd.DataFrame:
        """改进的2028特征准备函数"""
        # 复制最近一年的数据
        latest_year = features_df['Year'].max()
        X_2028 = features_df[features_df['Year'] == latest_year].copy()
        X_2028['Year'] = 2028
        
        # 计算历史特征
        for col in ['Gold', 'Total']:
            # 计算移动平均
            X_2028[f'{col}_ma_4year'] = historical_data.groupby('NOC')[col].transform(
                lambda x: x.rolling(4, min_periods=1).mean()
            ).fillna(0)
            
            # 安全的增长率计算
            def safe_pct_change(x):
                change = x.pct_change(4)
                change = change.replace([np.inf, -np.inf], np.nan)
                upper_bound = np.nanpercentile(change[~np.isnan(change)], 90)
                return change.clip(lower=-1, upper=upper_bound)
                
            X_2028[f'{col}_growth'] = historical_data.groupby('NOC')[col].transform(safe_pct_change).fillna(0)
            X_2028[f'historical_max_{col}'] = historical_data.groupby('NOC')[col].transform('max').fillna(0)
        
        # 添加国家分层
        tier_1_countries = ['United States', 'China', 'Great Britain', 'Japan', 'Australia']
        tier_2_countries = ['France', 'Germany', 'Italy', 'Netherlands', 'South Korea']
        X_2028['country_tier'] = (
            X_2028['NOC'].apply(
                lambda x: 1 if x in tier_1_countries else (2 if x in tier_2_countries else 3)
            )
            .astype(int)  # 强制转换为整数类型
        )
        
        return X_2028
    
    def _display_predictions(self, 
                        results: Dict, 
                        countries: np.ndarray, 
                        first_time_medalists: List[str]) -> None:
        """Print the predictions as a table, followed by the first-time medalists.

        Args:
            results: ``{'Gold': {...}, 'Total': {...}}`` where each entry holds
                ``'predictions'`` and ``'uncertainty'`` sequences.
            countries: Country names, paired with the result sequences by position.
            first_time_medalists: Countries predicted to win their first medal.
        """
        table = Table(title="2028洛杉矶奥运会奖牌预测")
        for heading in ("国家", "预计金牌数", "预计总奖牌数", "预测不确定性"):
            table.add_column(heading)

        def format_cell(target_key, row):
            """Format one estimate together with its spread for the given row."""
            estimate = f"{results[target_key]['predictions'][row]:.1f}"
            spread = f"±{results[target_key]['uncertainty'][row]:.1f}"
            return f"{estimate} ({spread})"

        for row, name in enumerate(list(countries)):
            gold_cell = format_cell('Gold', row)
            total_cell = format_cell('Total', row)
            # The last column is a label: one value for predicted first-time
            # medalists, another for everyone else.
            if name in first_time_medalists:
                label = "高"
            else:
                label = "中"
            table.add_row(name, gold_cell, total_cell, label)

        self.console.print(table)

        if not first_time_medalists:
            return

        self.console.print("\n[bold green]预计首次获得奖牌的国家:[/bold green]")
        for name in first_time_medalists:
            self.console.print(f"- {name}")
    
    def _save_results(self, 
                    models: Dict, 
                    results: Dict, 
                    countries: List[str]) -> None:
        """Write the fitted models, the predictions and the feature importances to ``models/``.

        Args:
            models: ``{target_name: {model_name: estimator}}``; each estimator is
                dumped with joblib.
            results: ``{'Gold': {...}, 'Total': {...}}`` with ``'predictions'``
                and ``'uncertainty'`` sequences.
            countries: Country names, paired with the result sequences by position.
        """
        out_dir = Path("models")
        out_dir.mkdir(exist_ok=True)

        # One joblib file per (target, model) pair.
        for target, target_models in models.items():
            for model_name, model in target_models.items():
                model_path = out_dir / f"{target}_{model_name}_model.joblib"
                joblib.dump(model, model_path)

        gold = results['Gold']
        total = results['Total']
        predictions_df = pd.DataFrame({
            'Country': countries,
            'Predicted_Gold': gold['predictions'],
            'Gold_Uncertainty': gold['uncertainty'],
            'Predicted_Total': total['predictions'],
            'Total_Uncertainty': total['uncertainty']
        })

        # Predictions are stored both as CSV and as Parquet.
        predictions_df.to_csv(out_dir / "predictions_2028.csv", index=False)
        predictions_df.to_parquet(out_dir / "predictions_2028.parquet", index=False)

        # Feature importances: one column per "<target>_<model>" key.
        importance_df = pd.DataFrame(self.feature_importance)
        importance_df.to_csv(out_dir / "feature_importance.csv", index=True)
        importance_df.to_parquet(out_dir / "feature_importance.parquet", index=True)

    def generate_summary_report(self, predictions_df: pd.DataFrame, historical_data: pd.DataFrame) -> str:
        """生成详细的预测评估报告"""
        summary = []
        
        # 1. 基本统计分析
        total_countries = len(predictions_df)
        avg_gold = predictions_df['Predicted_Gold'].mean()
        avg_total = predictions_df['Predicted_Total'].mean()
        total_gold = predictions_df['Predicted_Gold'].sum()
        total_medals = predictions_df['Predicted_Total'].sum()
        
        summary.append("1. 基本统计分析")
        summary.append(f"   - 预测国家数量: {total_countries}")
        summary.append(f"   - 平均预测金牌数: {avg_gold:.2f}")
        summary.append(f"   - 平均预测总奖牌数: {avg_total:.2f}")
        summary.append(f"   - 预测总金牌数: {total_gold:.2f}")
        summary.append(f"   - 预测总奖牌数: {total_medals:.2f}")
        
        # 2. 历史趋势分析
        recent_years = historical_data['Year'].unique()[-3:]
        historical_trends = []
        for year in recent_years:
            year_data = historical_data[historical_data['Year'] == year]
            historical_trends.append({
                'year': year,
                'total_gold': year_data['Gold'].sum(),
                'total_medals': year_data['Total'].sum()
            })
        
        summary.append("\n2. 历史趋势分析")
        for trend in historical_trends:
            summary.append(f"   - {trend['year']}年:")
            summary.append(f"     * 总金牌数: {trend['total_gold']}")
            summary.append(f"     * 总奖牌数: {trend['total_medals']}")
        
        # 3. 预测可信度评估
        gold_uncertainty = predictions_df['Gold_Uncertainty'].mean()
        total_uncertainty = predictions_df['Total_Uncertainty'].mean()
        
        summary.append("\n3. 预测可信度评估")
        summary.append(f"   - 金牌预测平均不确定性: ±{gold_uncertainty:.2f}")
        summary.append(f"   - 总奖牌预测平均不确定性: ±{total_uncertainty:.2f}")
        
        # 4. 主要发现
        summary.append("\n4. 主要发现")
        summary.append("   - 预测趋势与历史数据对比")
        summary.append("   - 国家间竞争格局变化")
        summary.append("   - 新兴运动强国分析")
        
        # 5. 预测局限性
        summary.append("\n5. 预测局限性")
        summary.append("   - 模型假设和约束")
        summary.append("   - 不确定性来源")
        summary.append("   - 潜在影响因素")
        
        return "\n".join(summary)
def main():
    """Load the processed data, run the 2028 prediction and write a summary report.

    Reads ``data/processed/features`` and ``data/processed/medal_counts``
    (Parquet or CSV), validates the medal table, runs
    ``OlympicMedalPredictor.predict_2028_olympics`` and stores a text report in
    ``models/prediction_summary_report.txt``.

    Raises:
        Exception: Any error is printed with its traceback and re-raised.
    """
    console = Console()
    
    try:
        # Output directory for models, predictions and the report
        Path("models").mkdir(exist_ok=True)
        
        console.print("[bold cyan]加载数据...[/bold cyan]")
        
        def load_data(file_path_base):
            """Load ``<file_path_base>.parquet`` or ``.csv``, trying several readers.

            Raises FileNotFoundError when no candidate file exists or none can be
            read; the last read error, if any, is attached as the cause.
            """
            # Parquet first, then CSV as UTF-8 (the pandas default) and latin1.
            attempts = [
                (f"{file_path_base}.parquet", lambda x: pd.read_parquet(x)),
                (f"{file_path_base}.csv", lambda x: pd.read_csv(x, encoding='utf-8')),
                (f"{file_path_base}.csv", lambda x: pd.read_csv(x, encoding='latin1'))
            ]
            
            last_error = None
            for file_path, reader in attempts:
                if not Path(file_path).exists():
                    continue
                try:
                    data = reader(file_path)
                except Exception as e:
                    # Remember why this reader failed and try the next one.
                    last_error = e
                    continue
                console.print(f"[green]成功从 {file_path} 加载数据[/green]")
                return data
            
            raise FileNotFoundError(
                f"无法加载数据文件 {file_path_base}.*\n最后的错误: {str(last_error)}"
            ) from last_error
        
        features_df = load_data("data/processed/features")
        historical_data = load_data("data/processed/medal_counts")
        
        # The medal table must provide these columns.
        required_columns = ['Year', 'NOC', 'Gold', 'Total']
        for col in required_columns:
            if col not in historical_data.columns:
                raise ValueError(f"历史数据缺少必要的列: {col}")
        
        # Basic information about the loaded tables; column labels are passed
        # through str() so non-string labels cannot break the join.
        console.print("\n[bold green]数据加载完成[/bold green]")
        console.print(f"特征数据形状: {features_df.shape}")
        console.print(f"特征列: {', '.join(map(str, features_df.columns))}")
        console.print(f"历史数据形状: {historical_data.shape}")
        console.print(f"历史数据列: {', '.join(map(str, historical_data.columns))}")
        
        console.print("\n[bold cyan]检查数据质量...[/bold cyan]")
        
        # Report columns that contain missing values.
        missing_features = features_df.isnull().sum()
        if missing_features.any():
            console.print("[yellow]特征数据中存在缺失值:[/yellow]")
            console.print(missing_features[missing_features > 0])
        
        missing_historical = historical_data.isnull().sum()
        if missing_historical.any():
            console.print("[yellow]历史数据中存在缺失值:[/yellow]")
            console.print(missing_historical[missing_historical > 0])
        
        predictor = OlympicMedalPredictor()
        
        predictor.predict_2028_olympics(features_df, historical_data)

        # Read back the predictions written by the predictor.
        predictions_df = pd.read_parquet("models/predictions_2028.parquet")
        
        summary_report = predictor.generate_summary_report(predictions_df, historical_data)
        
        # The report contains non-ASCII text, so the encoding is fixed explicitly
        # instead of relying on the platform default.
        with open("models/prediction_summary_report.txt", "w", encoding='utf-8') as f:
            f.write(summary_report)
        
        console.print("\n[bold cyan]预测评估摘要:[/bold cyan]")
        console.print(summary_report)
        console.print("\n[bold green]预测完成！结果已保存到 models 目录[/bold green]")
        
    except Exception as e:
        console.print(f"[bold red]错误: {str(e)}[/bold red]")
        import traceback
        console.print(traceback.format_exc())
        # Bare raise keeps the original traceback intact.
        raise

# Script entry point: call main() only when this code is executed directly.
if __name__ == "__main__":
    main()
