In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
import os
import joblib
import time
from datetime import datetime, timedelta
import requests
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure produced below
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 12})

# Output folders: root, plots, saved models and comparison artefacts
for _results_dir in (
    'solar_analysis_results',
    'solar_analysis_results/plots',
    'solar_analysis_results/models',
    'solar_analysis_results/comparisons',
):
    os.makedirs(_results_dir, exist_ok=True)

class ComprehensiveSolarForecaster:

    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.models = {}
        self.features = None
        self.target = None
        self.scaler = None
        self.best_params = {}
        self.weather_features = ['cloud_cover', 'relative_humidity', 'wind_speed',
                               'air_pressure', 'visibility', 'precipitation', 'temperature']

    def load_and_preprocess(self):
        """Load and preprocess solar irradiance data with enhanced features"""
        print("Loading and preprocessing solar irradiance data...")

        self.df = pd.read_csv(self.data_path)

        # Create datetime column
        self.df['datetime'] = pd.to_datetime({
            'year': self.df['YEAR'],
            'month': self.df['MO'],
            'day': self.df['DY'],
            'hour': self.df['HR']
        })

        self.df = self.df.sort_values('datetime').reset_index(drop=True)
        self._create_comprehensive_features()
        self._display_dataset_info()

        return self.df

    def _create_comprehensive_features(self):
        """Create comprehensive temporal and cyclical features"""
        print("Creating comprehensive temporal features...")

        # Basic temporal features
        self.df['hour'] = self.df['datetime'].dt.hour
        self.df['day'] = self.df['datetime'].dt.day
        self.df['day_of_week'] = self.df['datetime'].dt.dayofweek
        self.df['month'] = self.df['datetime'].dt.month
        self.df['year'] = self.df['datetime'].dt.year
        self.df['day_of_year'] = self.df['datetime'].dt.dayofyear

        # Cyclical encoding
        self.df['hour_sin'] = np.sin(2 * np.pi * self.df['hour']/24)
        self.df['hour_cos'] = np.cos(2 * np.pi * self.df['hour']/24)
        self.df['month_sin'] = np.sin(2 * np.pi * self.df['month']/12)
        self.df['month_cos'] = np.cos(2 * np.pi * self.df['month']/12)
        self.df['day_of_year_sin'] = np.sin(2 * np.pi * self.df['day_of_year']/365)
        self.df['day_of_year_cos'] = np.cos(2 * np.pi * self.df['day_of_year']/365)
        self.df['day_of_week_sin'] = np.sin(2 * np.pi * self.df['day_of_week']/7)
        self.df['day_of_week_cos'] = np.cos(2 * np.pi * self.df['day_of_week']/7)

        # Season features (as numeric to avoid categorical interpolation issues)
        self.df['season_numeric'] = ((self.df['month'] % 12) // 3)  # 0=Winter, 1=Spring, 2=Summer, 3=Fall

        # Create season dummies directly
        self.df['is_spring'] = (self.df['month'].isin([3, 4, 5])).astype(int)
        self.df['is_summer'] = (self.df['month'].isin([6, 7, 8])).astype(int)
        self.df['is_fall'] = (self.df['month'].isin([9, 10, 11])).astype(int)
        # Winter is the baseline (when all other seasons are 0)

        # Binary indicators
        self.df['is_weekend'] = self.df['day_of_week'].isin([5, 6]).astype(int)
        self.df['is_winter'] = self.df['month'].isin([12, 1, 2]).astype(int)

        # Lag features
        self.df['prev_hour_irradiance'] = self.df['ALLSKY_SFC_SW_DWN'].shift(1)
        self.df['prev_day_irradiance'] = self.df['ALLSKY_SFC_SW_DWN'].shift(24)
        self.df['prev_week_irradiance'] = self.df['ALLSKY_SFC_SW_DWN'].shift(24*7)

        # Rolling features
        self.df['24h_rolling_avg'] = self.df['ALLSKY_SFC_SW_DWN'].rolling(window=24, center=True).mean()
        self.df['7d_rolling_avg'] = self.df['ALLSKY_SFC_SW_DWN'].rolling(window=24*7, center=True).mean()

        # Fill NaN values
        self.df = self.df.fillna(method='bfill')
        self.df = self.df.fillna(method='ffill')

    def _display_dataset_info(self):
        print(f"Dataset shape: {self.df.shape}")
        print(f"Date range: {self.df['datetime'].min()} to {self.df['datetime'].max()}")
        print(f"Total duration: {(self.df['datetime'].max() - self.df['datetime'].min()).days} days")
        print(f"Years covered: {sorted(self.df['year'].unique())}")
        print(f"Mean solar irradiance: {self.df['ALLSKY_SFC_SW_DWN'].mean():.2f} W/m²")
        print(f"Max solar irradiance: {self.df['ALLSKY_SFC_SW_DWN'].max():.2f} W/m²")
        print(f"{'='*60}")

    def prepare_features(self, target='ALLSKY_SFC_SW_DWN', include_weather=True):
        """Prepare features for modeling"""
        print("Preparing features for modeling...")

        base_features = [
            'hour', 'day', 'day_of_week', 'month', 'day_of_year',
            'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
            'day_of_year_sin', 'day_of_year_cos', 'day_of_week_sin', 'day_of_week_cos',
            'is_weekend', 'is_summer', 'is_winter', 'season_numeric',
            'is_spring', 'is_fall'
        ]

        lag_features = ['prev_hour_irradiance', 'prev_day_irradiance',
                       'prev_week_irradiance', '24h_rolling_avg', '7d_rolling_avg']

        self.features = base_features + lag_features

        # Add weather features if available and requested
        if include_weather:
            available_weather = [f for f in self.weather_features if f in self.df.columns]
            self.features.extend(available_weather)
            print(f"Added weather features: {available_weather}")

        self.target = target
        print(f"Total features: {len(self.features)}")
        return self.features, self.target

    def split_data_temporal(self, test_size=0.2, validation_size=0.2):
        """Split the data chronologically and standardise the features.

        Rows are taken in their current order: the first block is training
        data, the next ``validation_size`` fraction is validation data and the
        final block is test data. Missing feature values are replaced by 0.
        The scaler is fitted on the training block only and stored on
        ``self.scaler``.

        Args:
            test_size: Fraction of rows used for the test block.
            validation_size: Fraction of rows used for the validation block.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the X
            parts are scaled arrays and the y parts are slices of the target
            column.
        """
        feature_frame = self.df[self.features].copy().fillna(0)
        target_series = self.df[self.target].copy()

        total_rows = len(feature_frame)
        test_rows = int(total_rows * test_size)
        val_rows = int(total_rows * validation_size)
        train_end = total_rows - test_rows - val_rows
        val_end = train_end + val_rows

        X_train = feature_frame.iloc[:train_end]
        X_val = feature_frame.iloc[train_end:val_end]
        X_test = feature_frame.iloc[val_end:]

        y_train = target_series.iloc[:train_end]
        y_val = target_series.iloc[train_end:val_end]
        y_test = target_series.iloc[val_end:]

        # Fit on training rows only so later rows do not influence the scaling
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_val_scaled = self.scaler.transform(X_val)
        X_test_scaled = self.scaler.transform(X_test)

        print(f"Training: {len(X_train)} | Validation: {len(X_val)} | Test: {len(X_test)}")
        return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test

    def compare_models(self, models_to_compare=None):
        """Compare multiple ML models comprehensively"""
        print("\n" + "="*60)
        print("COMPREHENSIVE MODEL COMPARISON")
        print("="*60)

        if models_to_compare is None:
            models_to_compare = ['random_forest', 'xgboost', 'svm', 'gradient_boosting', 'neural_network']

        # Prepare data
        self.prepare_features()
        X_train, X_val, X_test, y_train, y_val, y_test = self.split_data_temporal()

        results = []

        for model_name in models_to_compare:
            print(f"\nTraining {model_name.upper()} model...")
            start_time = time.time()

            if model_name == 'random_forest':
                model = self._build_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)
            elif model_name == 'xgboost':
                model = self._build_xgboost(X_train, y_train, X_val, y_val, X_test, y_test)
            elif model_name == 'svm':
                model = self._build_svm(X_train, y_train, X_val, y_val, X_test, y_test)
            elif model_name == 'gradient_boosting':
                model = self._build_gradient_boosting(X_train, y_train, X_val, y_val, X_test, y_test)
            elif model_name == 'neural_network':
                model = self._build_neural_network(X_train, y_train, X_val, y_val, X_test, y_test)

            training_time = time.time() - start_time
            metrics = self.models[model_name]['metrics']
            metrics['training_time'] = training_time

            results.append({
                'model': model_name,
                'train_rmse': metrics['train_rmse'],
                'test_rmse': metrics['test_rmse'],
                'test_mae': metrics['test_mae'],
                'test_r2': metrics['test_r2'],
                'training_time': training_time
            })

        results_df = pd.DataFrame(results)
        self._plot_model_comparison(results_df)
        self._generate_model_comparison_report(results_df)

        return results_df

    def _build_random_forest(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Tune and fit a random forest, then score it on train and test data.

        A grid search with time-ordered cross-validation folds picks the
        hyperparameters (scored by negative RMSE). The validation arguments
        are accepted for signature parity with the other builders but are not
        used here.

        Side effects:
            Stores the chosen parameters in ``self.best_params['random_forest']``
            and ``{'model', 'metrics'}`` in ``self.models['random_forest']``.

        Returns:
            The fitted best estimator.
        """
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        rf = RandomForestRegressor(random_state=42, n_jobs=-1)

        # TimeSeriesSplit needs more samples than splits; shrink the number of
        # folds for very short series (e.g. weekly resampling) instead of
        # failing. Ordinary dataset sizes still get 5 folds.
        n_splits = max(2, min(5, len(X_train) - 1))
        tscv = TimeSeriesSplit(n_splits=n_splits)

        grid_search = GridSearchCV(rf, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # With the default refit=True, best_estimator_ has already been
        # refitted on the whole training set; fitting it again would only
        # repeat that work.
        best_model = grid_search.best_estimator_
        self.best_params['random_forest'] = grid_search.best_params_

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Calculate metrics
        metrics = self._calculate_metrics(y_train, y_train_pred, y_test, y_test_pred)

        self.models['random_forest'] = {'model': best_model, 'metrics': metrics}
        return best_model

    def _build_xgboost(self, X_train, y_train, X_val, y_val, X_test, y_test):
        try:
            from xgboost import XGBRegressor

            model = XGBRegressor(
                n_estimators=200, learning_rate=0.1, max_depth=6,
                subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
            )

            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            metrics = self._calculate_metrics(y_train, y_train_pred, y_test, y_test_pred)
            self.models['xgboost'] = {'model': model, 'metrics': metrics}

            return model
        except ImportError:
            print("XGBoost not available. Install with: pip install xgboost")
            return None

    def _build_svm(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Fit an RBF-kernel support vector regressor and record its metrics.

        The validation arguments are not used. The fitted model and its metrics
        are stored in ``self.models['svm']``; the model is also returned.
        """
        svr = SVR(gamma='scale', epsilon=0.1, C=100, kernel='rbf')
        svr.fit(X_train, y_train)

        train_pred = svr.predict(X_train)
        test_pred = svr.predict(X_test)

        self.models['svm'] = {
            'model': svr,
            'metrics': self._calculate_metrics(y_train, train_pred, y_test, test_pred),
        }
        return svr

    def _build_gradient_boosting(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Fit a gradient boosting regressor with fixed settings and score it.

        The validation arguments are not used. The fitted model and its metrics
        are stored in ``self.models['gradient_boosting']``; the model is also
        returned.
        """
        settings = {
            'n_estimators': 200,
            'learning_rate': 0.1,
            'max_depth': 6,
            'random_state': 42,
        }
        booster = GradientBoostingRegressor(**settings)
        booster.fit(X_train, y_train)

        train_pred = booster.predict(X_train)
        test_pred = booster.predict(X_test)
        scores = self._calculate_metrics(y_train, train_pred, y_test, test_pred)

        self.models['gradient_boosting'] = {'model': booster, 'metrics': scores}
        return booster

    def _build_neural_network(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Fit a two-hidden-layer MLP regressor (100 and 50 units) and score it.

        The validation arguments are not used. The fitted model and its metrics
        are stored in ``self.models['neural_network']``; the model is also
        returned.
        """
        network_config = dict(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.001,
            batch_size='auto',
            learning_rate='constant',
            learning_rate_init=0.001,
            max_iter=1000,
            random_state=42,
        )
        network = MLPRegressor(**network_config)
        network.fit(X_train, y_train)

        train_pred = network.predict(X_train)
        test_pred = network.predict(X_test)
        scores = self._calculate_metrics(y_train, train_pred, y_test, test_pred)

        self.models['neural_network'] = {'model': network, 'metrics': scores}
        return network

    def _calculate_metrics(self, y_train, y_train_pred, y_test, y_test_pred):
        """Compute train RMSE and test RMSE / MAE / R² / MAPE.

        Args:
            y_train, y_train_pred: Actual and predicted training targets.
            y_test, y_test_pred: Actual and predicted test targets.

        Returns:
            Dict with the keys ``train_rmse``, ``test_rmse``, ``test_mae``,
            ``test_r2`` and ``test_mape`` (in percent).

        ``test_mape`` is averaged over samples whose actual value is non-zero.
        Dividing a zero actual by a tiny epsilon instead would let every such
        sample contribute an error of roughly ``|prediction| / 1e-8`` and swamp
        the mean. It is NaN when no actual value is non-zero.
        """
        actual = np.asarray(y_test, dtype=float)
        predicted = np.asarray(y_test_pred, dtype=float)

        nonzero = np.abs(actual) > 1e-8
        if nonzero.any():
            relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
            test_mape = float(np.mean(relative_error) * 100)
        else:
            test_mape = float('nan')

        return {
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred)),
            'test_mae': mean_absolute_error(y_test, y_test_pred),
            'test_r2': r2_score(y_test, y_test_pred),
            'test_mape': test_mape
        }

    def test_geographic_transferability(self, locations_list, base_model_name='random_forest'):
        results = []

        # Train base model on first location
        base_location = locations_list[0]
        print(f"Training base model on {base_location['name']} data...")

        self.data_path = base_location['data_path']
        self.load_and_preprocess()
        self.prepare_features()
        X_train, X_val, X_test, y_train, y_val, y_test = self.split_data_temporal()

        if base_model_name == 'random_forest':
            base_model = self._build_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)

        base_metrics = self.models[base_model_name]['metrics']
        results.append({
            'location': base_location['name'],
            'lat': base_location['lat'],
            'lon': base_location['lon'],
            'rmse': base_metrics['test_rmse'],
            'r2': base_metrics['test_r2'],
            'training_type': 'base'
        })

        # Test on other locations
        for location in locations_list[1:]:
            print(f"Testing on {location['name']} data...")

            test_analyzer = ComprehensiveSolarForecaster(location['data_path'])
            test_analyzer.load_and_preprocess()
            test_analyzer.prepare_features()

            X = test_analyzer.df[test_analyzer.features].fillna(0)
            y = test_analyzer.df[test_analyzer.target]

            X_scaled = self.scaler.transform(X)
            y_pred = base_model.predict(X_scaled)

            rmse = np.sqrt(mean_squared_error(y, y_pred))
            r2 = r2_score(y, y_pred)

            results.append({
                'location': location['name'],
                'lat': location['lat'],
                'lon': location['lon'],
                'rmse': rmse,
                'r2': r2,
                'training_type': 'transfer'
            })

        results_df = pd.DataFrame(results)
        self._plot_geographic_transferability(results_df)

        return results_df

    def analyze_temporal_resolution(self, resolutions=None):
        """Fixed temporal resolution analysis"""

        if resolutions is None:
            resolutions = [
                {'name': '30min', 'freq': '30min'},
                {'name': 'hourly', 'freq': 'H'},
                {'name': 'daily', 'freq': 'D'},
                {'name': 'weekly', 'freq': 'W'}
            ]

        results = []
        original_df = self.df.copy()

        for resolution in resolutions:
            print(f"Analyzing {resolution['name']} resolution...")

            if resolution['freq'] == 'H':
                self.df = original_df.copy()
            elif resolution['freq'] == '30min':
                self.df = self._interpolate_to_30min_fixed(original_df)
            else:
                self.df = self._resample_data(original_df, resolution['freq'])

            self.prepare_features()
            X_train, X_val, X_test, y_train, y_val, y_test = self.split_data_temporal()

            model = self._build_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)
            metrics = self.models['random_forest']['metrics']

            results.append({
                'resolution': resolution['name'],
                'freq': resolution['freq'],
                'test_rmse': metrics['test_rmse'],
                'test_r2': metrics['test_r2'],
                'sample_count': len(self.df)
            })

        self.df = original_df
        results_df = pd.DataFrame(results)
        self._plot_temporal_resolution_performance(results_df)

        return results_df

    def _interpolate_to_30min_fixed(self, df):
        """Fixed version: Convert hourly data to 30-minute resolution"""
        start_date = df['datetime'].min()
        end_date = df['datetime'].max()
        new_index = pd.date_range(start=start_date, end=end_date, freq='30min')

        df_30min = pd.DataFrame(index=new_index)
        df_30min.index.name = 'datetime'

        df_hourly = df.set_index('datetime')

        # Separate numeric and non-numeric columns
        numeric_cols = df_hourly.select_dtypes(include=[np.number]).columns

        # Only interpolate numeric columns
        df_numeric = df_hourly[numeric_cols]
        df_combined = df_30min.join(df_numeric, how='left')
        df_combined = df_combined.interpolate(method='time')
        df_combined = df_combined.reset_index()

        # Recreate all temporal features using the new datetime index
        self._recreate_temporal_features_for_resampled(df_combined)

        return df_combined

    def _resample_data(self, df, freq):
        """Resample data to different frequency"""
        df_temp = df.copy().set_index('datetime')

        # Only resample numeric columns
        numeric_cols = df_temp.select_dtypes(include=[np.number]).columns
        resampled = df_temp[numeric_cols].resample(freq).mean()
        resampled = resampled.reset_index()

        # Recreate temporal features
        self._recreate_temporal_features_for_resampled(resampled)

        return resampled

    def _recreate_temporal_features_for_resampled(self, df):
        """Recreate temporal features for resampled data"""
        # Basic temporal features
        df['hour'] = df['datetime'].dt.hour + df['datetime'].dt.minute/60  # Handle sub-hourly data
        df['day'] = df['datetime'].dt.day
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['month'] = df['datetime'].dt.month
        df['year'] = df['datetime'].dt.year
        df['day_of_year'] = df['datetime'].dt.dayofyear

        # Cyclical encoding
        df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
        df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
        df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
        df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year']/365)
        df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year']/365)
        df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week']/7)
        df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week']/7)

        # Season features
        df['season_numeric'] = ((df['month'] % 12) // 3)
        df['is_spring'] = (df['month'].isin([3, 4, 5])).astype(int)
        df['is_summer'] = (df['month'].isin([6, 7, 8])).astype(int)
        df['is_fall'] = (df['month'].isin([9, 10, 11])).astype(int)

        # Binary indicators
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['is_winter'] = df['month'].isin([12, 1, 2]).astype(int)

        return df

    def analyze_climate_zones(self, climate_data_paths):
        """Compare a base model against zone-specific models per climate zone.

        A random forest is trained on this forecaster's data (the base model).
        For every zone a separate forecaster loads the zone's CSV, trains its
        own random forest and both models are scored on the zone's test block.

        Args:
            climate_data_paths: Mapping of zone name to CSV path.

        Returns:
            DataFrame with zone-specific and base-model RMSE/R² per zone plus
            ``performance_gap_rmse`` (base - zone) and ``performance_gap_r2``
            (zone - base).

        Raises:
            ValueError: If a zone's data lacks a feature column the base model
                was trained on.
        """
        results = []

        # Train base model
        self.prepare_features()
        X_train, X_val, X_test, y_train, y_val, y_test = self.split_data_temporal()
        base_model = self._build_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)

        for climate_zone, data_path in climate_data_paths.items():
            print(f"Analyzing {climate_zone} climate zone...")

            zone_analyzer = ComprehensiveSolarForecaster(data_path)
            zone_analyzer.load_and_preprocess()
            zone_analyzer.prepare_features()

            X_zone_train, X_zone_val, X_zone_test, y_zone_train, y_zone_val, y_zone_test = zone_analyzer.split_data_temporal()

            # Train zone-specific model
            zone_analyzer._build_random_forest(X_zone_train, y_zone_train, X_zone_val, y_zone_val, X_zone_test, y_zone_test)
            zone_metrics = zone_analyzer.models['random_forest']['metrics']

            # Test base model on zone data. X_zone_test was standardised with
            # the *zone's* scaler, but the base model learned from features
            # standardised with self.scaler. Take the zone's raw test rows
            # (same rows as y_zone_test) and scale them with the base scaler.
            missing = [col for col in self.features if col not in zone_analyzer.df.columns]
            if missing:
                raise ValueError(
                    f"Climate zone '{climate_zone}' lacks feature columns: {missing}"
                )
            zone_test_raw = zone_analyzer.df.loc[y_zone_test.index, self.features].fillna(0)
            base_pred = base_model.predict(self.scaler.transform(zone_test_raw))
            base_rmse = np.sqrt(mean_squared_error(y_zone_test, base_pred))
            base_r2 = r2_score(y_zone_test, base_pred)

            results.append({
                'climate_zone': climate_zone,
                'zone_specific_rmse': zone_metrics['test_rmse'],
                'zone_specific_r2': zone_metrics['test_r2'],
                'base_model_rmse': base_rmse,
                'base_model_r2': base_r2,
                'performance_gap_rmse': base_rmse - zone_metrics['test_rmse'],
                'performance_gap_r2': zone_metrics['test_r2'] - base_r2
            })

        results_df = pd.DataFrame(results)
        self._plot_climate_zone_performance(results_df)

        return results_df

    def incorporate_weather_parameters(self, weather_data_path):
        # Load weather data
        weather_df = pd.read_csv(weather_data_path)
        weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

        # Merge with solar data
        merged_df = pd.merge(self.df, weather_df, on='datetime', how='inner')
        original_df = self.df.copy()
        self.df = merged_df

        # Train baseline model (without weather)
        self.prepare_features(include_weather=False)
        X_train_base, X_val_base, X_test_base, y_train_base, y_val_base, y_test_base = self.split_data_temporal()
        base_model = self._build_random_forest(X_train_base, y_train_base, X_val_base, y_val_base, X_test_base, y_test_base)
        base_metrics = self.models['random_forest']['metrics']

        # Train enhanced model (with weather)
        self.prepare_features(include_weather=True)
        X_train_enh, X_val_enh, X_test_enh, y_train_enh, y_val_enh, y_test_enh = self.split_data_temporal()
        enhanced_model = self._build_random_forest(X_train_enh, y_train_enh, X_val_enh, y_val_enh, X_test_enh, y_test_enh)
        enhanced_metrics = self.models['random_forest']['metrics']

        # Compare performance
        comparison = {
            'baseline': base_metrics,
            'enhanced': enhanced_metrics,
            'improvement': {
                'rmse': base_metrics['test_rmse'] - enhanced_metrics['test_rmse'],
                'rmse_percent': (base_metrics['test_rmse'] - enhanced_metrics['test_rmse']) / base_metrics['test_rmse'] * 100,
                'r2': enhanced_metrics['test_r2'] - base_metrics['test_r2']
            }
        }

        print(f"\nPerformance Comparison:")
        print(f"Baseline RMSE: {base_metrics['test_rmse']:.4f}")
        print(f"Enhanced RMSE: {enhanced_metrics['test_rmse']:.4f}")
        print(f"Improvement: {comparison['improvement']['rmse']:.4f} ({comparison['improvement']['rmse_percent']:.2f}%)")
        print(f"Baseline R²: {base_metrics['test_r2']:.4f}")
        print(f"Enhanced R²: {enhanced_metrics['test_r2']:.4f}")
        print(f"R² Improvement: {comparison['improvement']['r2']:.4f}")

        self.df = original_df
        self._plot_weather_impact_comparison(comparison)

        return comparison

    def _plot_model_comparison(self, results_df):
        """Save a 2x2 grid of bar charts comparing the trained models.

        Panels show test RMSE, test R², test MAE and training time per model.
        The figure is written to
        ``solar_analysis_results/comparisons/model_comparison.png``.
        """
        # (column, title, y-axis label, bar colour) in row-major panel order
        panels = [
            ('test_rmse', 'Test RMSE Comparison', 'RMSE', 'lightcoral'),
            ('test_r2', 'Test R² Comparison', 'R² Score', 'lightblue'),
            ('test_mae', 'Test MAE Comparison', 'MAE', 'lightgreen'),
            ('training_time', 'Training Time Comparison', 'Time (seconds)', 'gold'),
        ]

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        for ax, (column, title, ylabel, colour) in zip(axes.flat, panels):
            ax.bar(results_df['model'], results_df[column], color=colour)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('solar_analysis_results/comparisons/model_comparison.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _plot_geographic_transferability(self, results_df):
        """Save two lon/lat scatter plots sized by RMSE and by R².

        Points are coloured by ``training_type``. The figure is written to
        ``solar_analysis_results/comparisons/geographic_transferability.png``.
        """
        # (column that drives marker size, panel title)
        panels = (
            ('rmse', 'Geographic Performance - RMSE'),
            ('r2', 'Geographic Performance - R²'),
        )

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        for ax, (size_column, title) in zip(axes, panels):
            sns.scatterplot(data=results_df, x='lon', y='lat', size=size_column,
                            hue='training_type', ax=ax)
            ax.set_title(title)
            ax.set_xlabel('Longitude')
            ax.set_ylabel('Latitude')

        plt.tight_layout()
        plt.savefig('solar_analysis_results/comparisons/geographic_transferability.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _plot_temporal_resolution_performance(self, results_df):
        """Save line plots of test RMSE and test R² per temporal resolution.

        The figure is written to
        ``solar_analysis_results/comparisons/temporal_resolution.png``.
        """
        # (column, title, y-axis label, extra plot kwargs)
        panels = (
            ('test_rmse', 'RMSE vs Temporal Resolution', 'Test RMSE', {}),
            ('test_r2', 'R² vs Temporal Resolution', 'Test R²', {'color': 'green'}),
        )

        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        for ax, (column, title, ylabel, extra) in zip(axes, panels):
            ax.plot(results_df['resolution'], results_df[column], 'o-',
                    linewidth=2, markersize=8, **extra)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('solar_analysis_results/comparisons/temporal_resolution.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _plot_climate_zone_performance(self, results_df):
        """Save a 2x2 figure comparing zone-specific and base models per zone.

        Top row: grouped bars of RMSE and R² for both models. Bottom row: the
        RMSE and R² performance gaps. The figure is written to
        ``solar_analysis_results/comparisons/climate_zones.png``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        positions = np.arange(len(results_df))
        bar_width = 0.35
        zone_names = results_df['climate_zone']

        # Top row: zone-specific vs base model, side by side per zone
        grouped_panels = (
            (axes[0, 0], 'zone_specific_rmse', 'base_model_rmse',
             'RMSE Comparison by Climate Zone', 'RMSE'),
            (axes[0, 1], 'zone_specific_r2', 'base_model_r2',
             'R² Comparison by Climate Zone', 'R² Score'),
        )
        for ax, zone_column, base_column, title, ylabel in grouped_panels:
            ax.bar(positions - bar_width/2, results_df[zone_column], bar_width,
                   label='Zone-specific', alpha=0.8)
            ax.bar(positions + bar_width/2, results_df[base_column], bar_width,
                   label='Base model', alpha=0.8)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.set_xticks(positions)
            ax.set_xticklabels(zone_names, rotation=45)
            ax.legend()

        # Bottom row: performance gaps
        gap_panels = (
            (axes[1, 0], 'performance_gap_rmse', 'RMSE Performance Gap',
             'Gap (Base - Zone-specific)'),
            (axes[1, 1], 'performance_gap_r2', 'R² Performance Gap',
             'Gap (Zone-specific - Base)'),
        )
        for ax, gap_column, title, ylabel in gap_panels:
            ax.bar(zone_names, results_df[gap_column])
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('solar_analysis_results/comparisons/climate_zones.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _plot_weather_impact_comparison(self, comparison):
        """Save grouped bars of baseline vs weather-enhanced test metrics.

        Args:
            comparison: Dict with ``baseline`` and ``enhanced`` metric dicts
                that contain ``test_rmse``, ``test_r2`` and ``test_mae``.

        The figure is written to
        ``solar_analysis_results/comparisons/weather_impact.png``.
        """
        metric_keys = ['test_rmse', 'test_r2', 'test_mae']
        tick_labels = ['RMSE', 'R²', 'MAE']

        positions = np.arange(len(metric_keys))
        bar_width = 0.35

        fig, ax = plt.subplots(figsize=(10, 6))

        # Baseline bars sit left of each tick, enhanced bars right of it
        series = (
            ('baseline', 'Baseline', -bar_width/2),
            ('enhanced', 'With Weather', bar_width/2),
        )
        for variant, legend_label, offset in series:
            heights = [comparison[variant][key] for key in metric_keys]
            ax.bar(positions + offset, heights, bar_width, label=legend_label, alpha=0.8)

        ax.set_title('Impact of Weather Parameters on Model Performance')
        ax.set_ylabel('Metric Value')
        ax.set_xticks(positions)
        ax.set_xticklabels(tick_labels)
        ax.legend()

        plt.tight_layout()
        plt.savefig('solar_analysis_results/comparisons/weather_impact.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _generate_model_comparison_report(self, results_df):
        """Generate comprehensive model comparison report"""
        report_content = []

        report_content.extend([
            "="*80,
            "COMPREHENSIVE MODEL COMPARISON REPORT",
            "="*80,
            f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            ""
        ])

        # Best performing model
        best_r2_model = results_df.loc[results_df['test_r2'].idxmax()]
        best_rmse_model = results_df.loc[results_df['test_rmse'].idxmin()]

        report_content.extend([
            "PERFORMANCE RANKING",
            "-"*40,
            f"Best R² Score: {best_r2_model['model']} ({best_r2_model['test_r2']:.4f})",
            f"Best RMSE: {best_rmse_model['model']} ({best_rmse_model['test_rmse']:.4f})",
            ""
        ])

        # Detailed comparison
        report_content.extend([
            "DETAILED COMPARISON",
            "-"*40
        ])

        for _, row in results_df.iterrows():
            report_content.extend([
                f"{row['model'].upper()}:",
                f"  RMSE: {row['test_rmse']:.4f}",
                f"  R²: {row['test_r2']:.4f}",
                f"  MAE: {row['test_mae']:.4f}",
                f"  Training Time: {row['training_time']:.2f}s",
                ""
            ])

        # Save report
        with open('solar_analysis_results/comparisons/model_comparison_report.txt', 'w') as f:
            f.write('\n'.join(report_content))

        print('\n'.join(report_content))

    def generate_comprehensive_report(self):
        print("\nGenerating comprehensive framework report...")

        report_content = [
            "COMPREHENSIVE SOLAR FORECASTING FRAMEWORK REPORT",
            "="*80,
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        ]

        with open('solar_analysis_results/comprehensive_framework_report.txt', 'w') as f:
            f.write('\n'.join(report_content))

        print('\n'.join(report_content))

def main():
    """Run the pipeline: load data, compare models, analyse resolution, report.

    Reads ``solar_data.csv`` from the working directory. Any exception raised
    by a pipeline step is reported with its type and traceback instead of
    propagating, so a notebook session keeps running; the data-path hint is
    printed as well because a wrong path is a common cause.
    """
    import traceback  # only needed for failure diagnostics

    print("="*80)
    print("COMPREHENSIVE SOLAR FORECASTING FRAMEWORK")
    print("="*80)

    # Initialize framework
    data_path = 'solar_data.csv'  # Update path as needed
    forecaster = ComprehensiveSolarForecaster(data_path)

    try:
        print("\nSTEP 1: DATA LOADING AND PREPROCESSING")
        forecaster.load_and_preprocess()

        print("\nSTEP 2: COMPREHENSIVE MODEL COMPARISON")
        forecaster.compare_models()

        # Step 3: Test geographic transferability (example)
        # locations = [
        #     {'name': 'New Delhi', 'lat': 28.6, 'lon': 77.2, 'data_path': 'delhi_data.csv'},
        #     {'name': 'Mumbai', 'lat': 19.1, 'lon': 72.9, 'data_path': 'mumbai_data.csv'},
        # ]

        print("\nSTEP 4: TEMPORAL RESOLUTION ANALYSIS")
        forecaster.analyze_temporal_resolution()

        print("\nSTEP 6: COMPREHENSIVE REPORT GENERATION")
        forecaster.generate_comprehensive_report()

        print("\n" + "="*80)
        print("FRAMEWORK ANALYSIS COMPLETED SUCCESSFULLY!")
        print("="*80)
        print("Results saved in 'solar_analysis_results/' directory")

    except Exception as e:
        # Top-level boundary: report the failure in full rather than hiding
        # its type and origin behind a generic message
        print(f"\nError: {type(e).__name__}: {e}")
        traceback.print_exc()
        print("Please check your data files and paths.")

if __name__ == "__main__":
    main()

COMPREHENSIVE SOLAR FORECASTING FRAMEWORK

STEP 1: DATA LOADING AND PREPROCESSING
Loading and preprocessing solar irradiance data...
Creating comprehensive temporal features...
Dataset shape: (8760, 44)
Date range: 2019-01-01 00:00:00 to 2019-12-31 23:00:00
Total duration: 364 days
Years covered: [np.int32(2019)]
Mean solar irradiance: 172.66 W/m²
Max solar irradiance: 1003.02 W/m²

STEP 2: COMPREHENSIVE MODEL COMPARISON

COMPREHENSIVE MODEL COMPARISON
Preparing features for modeling...
Added weather features: ['cloud_cover', 'relative_humidity', 'wind_speed', 'air_pressure', 'visibility', 'precipitation', 'temperature']
Total features: 31
Training: 5256 | Validation: 1752 | Test: 1752

Training RANDOM_FOREST model...

Training XGBOOST model...

Training SVM model...

Training GRADIENT_BOOSTING model...

Training NEURAL_NETWORK model...
COMPREHENSIVE MODEL COMPARISON REPORT
Analysis Date: 2025-06-08 07:30:11

PERFORMANCE RANKING
----------------------------------------
Best R² Score: r