## Cleaning Data

### Display raw data

In [None]:
import pandas as pd

# Load the raw fish-stock table from the local data folder; the trailing
# expression makes the notebook display its first rows.
data = pd.read_csv('data/total-fish-stocks.csv')
data.head()

In [None]:
data.tail()

In [None]:
data.columns

### Basic cleaning

In [None]:
# Keep a column only when more than 30% of its rows hold usable data,
# i.e. values that are neither NaN nor 0.
threshold = 0.3 * len(data)

# `data != 0` evaluates to True for NaN, so missing values must be excluded
# explicitly; otherwise NaN-heavy or all-zero columns pass the filter.
valid_counts = (data.notna() & (data != 0)).sum()

# Filter columns based on the threshold
data_cleaned = data.loc[:, valid_counts > threshold]

In [None]:
# NOTE(review): this lists the columns of the unfiltered `data`; the filtered
# frame produced by the previous cell is `data_cleaned` — confirm which one
# was meant to be inspected here.
data.columns

In [None]:
# Step 1: normalise the one column name that contains a hyphen instead of an
# underscore so the '_relative_to_' pattern below matches it.
data = data.rename(columns={
    'biomass_relative_to-preferred_management_rate': 'biomass_relative_to_preferred_management_rate'
})

# Step 3: among the "relative to" metrics keep only the MSY baseline.
columns_to_keep = [
    'biomass_relative_to_msy', 'total_biomass_relative_to_msy', 'spawning_stock_relative_to_msy',
    'catch_relative_to_msy', 'fishing_mortality_relative_to_msy'
]
relative_columns = [col for col in data.columns if '_relative_to_' in col]
columns_to_drop = [col for col in relative_columns if col not in columns_to_keep]
data = data.drop(columns=columns_to_drop, errors='ignore')

# Steps 2, 4 and 5: drop the remaining columns judged redundant. Missing
# names are ignored, so the list can be applied in a single call.
redundant_columns = [
    # Step 2: "general" aggregates of more detailed metrics
    'general_total_biomass', 'general_total_catch', 'general_exploitation_rate',
    # Step 4: assumed to be covered by other columns or too specific
    'total_landings',                 # Assuming `total_catch` is enough
    'catch_relative_to_mean_catch',   # Likely less relevant
    'recruits',                       # Highly specific, might not be necessary
    # Step 5: effort / biomass metrics with a preferred alternative
    'survey_biomass',                 # If less relevant than `total_biomass`
    'catch_per_unit_effort',          # If `fishing_effort` suffices
]
data = data.drop(columns=redundant_columns, errors='ignore')

# Print final columns for review
print("Modified Columns:")
print(data.columns.tolist())

In [None]:
# Collapse the table to one row per year by summing every numeric column.
yearly_totals = data.groupby('Year').sum(numeric_only=True)

# Bring 'Year' back as a regular column and order the rows chronologically.
data_grouped = yearly_totals.reset_index().sort_values(by='Year', ascending=True)

# Persist the aggregated table without the positional index.
data_grouped.to_csv('clean1.csv', index=False)


In [None]:
import pandas as pd

# Read back the yearly aggregate written by the grouping step.
file_path = 'clean1.csv'
df = pd.read_csv(file_path)

# Share of rows (in percent) that are exactly zero, per column.
zero_percentage = df.eq(0).sum() / len(df) * 100

# A column survives when at most 70% of its values are zero.
columns_to_keep = zero_percentage.index[zero_percentage <= 70].tolist()

# Restrict the frame to the surviving columns.
df_cleaned = df.loc[:, columns_to_keep]

# Report which columns were kept.
print("Columns retained in the cleaned DataFrame:")
print(columns_to_keep)


In [None]:
# Save the zero-filtered table; index=False keeps the positional index out of the file.
df_cleaned.to_csv('clean2.csv',index=False)

In [None]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'clean2.csv'
df = pd.read_csv(file_path)

# Define a function to determine if a column is discrete
def is_discrete(column):
    """Return True when *column* should be treated as a discrete variable.

    A column counts as discrete when any of the following holds:

    * it has an integer dtype,
    * it has fewer than 20 distinct non-missing values,
    * it has a float dtype and every non-missing value is a whole number
      (e.g. 1.0, 2.0).

    Missing values are ignored by the whole-number check, and infinite
    values make it fail, so the function never raises on NaN/inf the way a
    direct ``astype(int)`` comparison does.

    Args:
        column: A pandas Series.

    Returns:
        bool: True for discrete columns, False otherwise.
    """
    if pd.api.types.is_integer_dtype(column):
        return True
    if column.nunique() < 20:
        return True
    if pd.api.types.is_float_dtype(column):
        present = column.dropna()
        # `x % 1 == 0` is the whole-number test; inf % 1 is NaN, so inf is
        # (correctly) reported as not integer-like.
        return bool(len(present)) and bool(present.mod(1).eq(0).all())
    return False

# Impute every column except 'Year' (the time axis): zeros are treated as
# missing, then gaps are filled with the median (discrete columns) or the
# mean (continuous columns).
value_columns = [name for name in df.columns if name != 'Year']

for col in value_columns:
    # Classify on the untouched column, before zeros are turned into NaN.
    discrete = is_discrete(df[col])

    # Zeros are treated as missing in both branches.
    df[col] = df[col].replace(0, np.nan)

    if not discrete:
        df[col] = df[col].fillna(df[col].mean())
        print(f'float: {col}')
        continue

    df[col] = df[col].fillna(df[col].median())
    # Cast back to int when the filled column still holds only whole numbers.
    filled = df[col]
    if filled.dtype == 'float' and (filled == filled.astype(int)).all():
        df[col] = filled.astype(int)
    print(f'int: {col}')

# Save or display the cleaned DataFrame
print("Cleaned DataFrame with missing values filled:")
df.head()

In [None]:
df.tail()

In [None]:
# Save the imputed table; index=False keeps the positional index out of the file.
df.to_csv('clean3.csv',index=False)

In [None]:
# Positional 80/20 split: the first 80% of the rows become the training set
# and the remainder the test set (no shuffling, so row order is preserved).
threshold = int(len(df)*0.8)
train = df.iloc[:threshold]
test = df.iloc[threshold:]

# index=False matches the other exports in this notebook. Without it the
# positional index is written as an unnamed first column, which read_csv
# loads back as 'Unnamed: 0' and the forecasters below would then treat as
# a feature.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

## Prepare Features and Train Model

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import cross_val_score, train_test_split, KFold
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt

# # Load the train and test data
# train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')

# # Check the structure of the training data
# print(train_df.head())

# # Preprocess the data
# # Let's assume 'Year' is the time series index, and we want to predict 'total_biomass'
# X_train = train_df.drop(columns=['total_biomass'])
# y_train = train_df['total_biomass']

# X_test = test_df.drop(columns=['total_biomass'])
# y_test = test_df['total_biomass']

# # Optionally, create lag features or other time series features
# # For simplicity, we will use the original features, but you can engineer new ones if necessary

# # Train a RandomForest model with 5-fold cross-validation
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# # 5-fold cross-validation
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # Perform cross-validation and print the mean score (negative mean squared error)
# cv_scores = cross_val_score(rf_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
# print(f"5-Fold Cross-Validation MSE: {-cv_scores.mean()}")

# # Train the model on the full training data
# rf_model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = rf_model.predict(X_test)

# # Calculate the Mean Squared Error (MSE) for the predictions
# mse = mean_squared_error(y_test, y_pred)
# print(f"Test Set Mean Squared Error: {mse}")

# # Plot the actual vs predicted values for the test set
# plt.figure(figsize=(10, 6))
# plt.plot(test_df['Year'], y_test, label='Actual')
# plt.plot(test_df['Year'], y_pred, label='Predicted')
# plt.xlabel('Year')
# plt.ylabel('Total Biomass')
# plt.legend()
# plt.title('Total Biomass Prediction (Actual vs Predicted)')
# plt.show()

# # Future predictions (for example, predicting the next 5 years)
# future_years = np.array([2017, 2018, 2019, 2020, 2021]).reshape(-1, 1)
# # We need to create the features for these future years (e.g., using the last year data)
# # Assuming you have an extrapolation method, here I'm using simple dummy values for future predictions
# future_data = np.hstack([future_years, np.zeros((future_years.shape[0], X_train.shape[1] - 1))])

# # Predict future biomass values
# future_predictions = rf_model.predict(future_data)
# print("Predicted Total Biomass for Future Years:")
# for year, prediction in zip(future_years.flatten(), future_predictions):
#     print(f"Year {year}: {prediction}")


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error, r2_score
# import lightgbm as lgb
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# from sklearn.model_selection import TimeSeriesSplit
# import matplotlib.pyplot as plt
# from datetime import datetime
# import warnings
# warnings.filterwarnings('ignore')

# class TimeSeriesForecaster:
#     def __init__(self, seasonality=12):
#         self.seasonality = seasonality
#         self.scaler = StandardScaler()
#         self.models = {}
        
#     def create_features(self, df, target_col):
#         """Create time series features from datetime index."""
#         df = df.copy()
#         df['year'] = df['Year'].astype(int)
        
#         # Lag features
#         for lag in range(1, 4):
#             df[f'lag_{lag}'] = df[target_col].shift(lag)
            
#         # Rolling mean features
#         for window in [3, 6, 12]:
#             df[f'rolling_mean_{window}'] = df[target_col].rolling(
#                 window=window, min_periods=1).mean()
#             df[f'rolling_std_{window}'] = df[target_col].rolling(
#                 window=window, min_periods=1).std()
            
#         # Trend feature
#         df['trend'] = np.arange(len(df))
        
#         return df
    
#     def prepare_data(self, train_df, test_df, target_col):
#         """Prepare data for modeling using separate train and test files."""
#         # Create features for both train and test
#         train_featured = self.create_features(train_df, target_col)
#         test_featured = self.create_features(test_df, target_col)
        
#         # Prepare features and target
#         feature_cols = [col for col in train_featured.columns 
#                        if col not in [target_col, 'Year']]
        
#         X_train = train_featured[feature_cols].fillna(method='ffill')
#         y_train = train_featured[target_col]
#         X_test = test_featured[feature_cols].fillna(method='ffill')
#         y_test = test_featured[target_col]
        
#         # Scale features
#         X_train_scaled = self.scaler.fit_transform(X_train)
#         X_test_scaled = self.scaler.transform(X_test)
        
#         return (X_train_scaled, y_train, X_test_scaled, y_test,
#                 train_featured, test_featured, feature_cols)
    
#     def train_lightgbm(self, X_train, y_train):
#         """Train LightGBM model with time series specific parameters."""
#         params = {
#             'objective': 'regression',
#             'metric': 'rmse',
#             'boosting_type': 'gbdt',
#             'num_leaves': 31,
#             'learning_rate': 0.005,
#             'feature_fraction': 0.9,
#             'num_iterations': 10000,
#             'verbose': -1
#         }
        
#         # Create dataset
#         lgb_train = lgb.Dataset(X_train, y_train)
        
#         # Train model
#         self.models['lgb'] = lgb.train(
#             params,
#             lgb_train,
#             valid_sets=[lgb_train]
#         )
    
#     def train_sarima(self, y_train):
#         """Train SARIMA model."""
#         self.models['sarima'] = SARIMAX(
#             y_train,
#             order=(1, 1, 1),
#             seasonal_order=(1, 1, 1, self.seasonality)
#         ).fit(disp=False)
    
#     def ensemble_predict(self, X_test):
#         """Make predictions using ensemble of models."""
#         # LightGBM predictions
#         lgb_preds = self.models['lgb'].predict(X_test)
        
#         # SARIMA predictions
#         sarima_preds = self.models['sarima'].forecast(len(X_test))
        
#         # Ensemble predictions (simple average)
#         ensemble_preds = (lgb_preds + sarima_preds) / 2
        
#         return ensemble_preds
    
#     def plot_results(self, train_data, test_data, predictions, target_col):
#         """Plot actual vs predicted values."""
#         plt.figure(figsize=(12, 6))
#         plt.plot(train_data['Year'], train_data[target_col], 
#                 label='Training Data', color='blue')
#         plt.plot(test_data['Year'], test_data[target_col], 
#                 label='Actual Test Data', color='green')
#         plt.plot(test_data['Year'], predictions, 
#                 label='Predictions', color='red', linestyle='--')
#         plt.title('Time Series Forecasting Results')
#         plt.xlabel('Year')
#         plt.ylabel(target_col)
#         plt.legend()
#         plt.grid(True)
#         plt.show()
    
#     def fit_predict(self, train_df, test_df, target_col):
#         """Full training and prediction pipeline."""
#         # Prepare data
#         (X_train_scaled, y_train, X_test_scaled, y_test,
#          train_data, test_data, feature_cols) = self.prepare_data(
#             train_df, test_df, target_col)
        
#         # Train models
#         self.train_lightgbm(X_train_scaled, y_train)
#         self.train_sarima(y_train)
        
#         # Make predictions
#         predictions = self.ensemble_predict(X_test_scaled)
        
#         # Calculate metrics
#         mse = mean_squared_error(y_test, predictions)
#         r2 = r2_score(y_test, predictions)
        
#         print(f"Mean Squared Error: {mse:.2f}")
#         print(f"R² Score: {r2:.2f}")
        
#         # Plot results
#         self.plot_results(train_data, test_data, predictions, target_col)
        
#         return predictions
    
#     def forecast_future(self, train_df, periods=5):
#         """Forecast future values."""
#         last_data = train_df.iloc[-1:]
#         future_dates = pd.DataFrame({
#             'Year': range(
#                 int(last_data['Year'].values[0]) + 1,
#                 int(last_data['Year'].values[0]) + periods + 1
#             )
#         })
        
#         # Add target column to future dates for feature creation
#         future_dates['total_biomass'] = np.nan
        
#         # Combine historical and future data
#         full_data = pd.concat([train_df, future_dates], ignore_index=True)
        
#         # Create features for all data
#         featured_data = self.create_features(full_data, 'total_biomass')
        
#         # Get predictions for future dates
#         future_features = featured_data.iloc[-periods:].copy()
#         feature_cols = [col for col in featured_data.columns 
#                        if col not in ['total_biomass', 'Year']]
        
#         X_future = future_features[feature_cols].fillna(method='ffill')
#         X_future_scaled = self.scaler.transform(X_future)
        
#         predictions = self.ensemble_predict(X_future_scaled)
        
#         return future_dates['Year'], predictions

# # Example usage:
# if __name__ == "__main__":
#     # Load data
#     train_df = pd.read_csv('train.csv')
#     test_df = pd.read_csv('test.csv')
    
#     # Initialize and train the model
#     forecaster = TimeSeriesForecaster(seasonality=12)
#     predictions = forecaster.fit_predict(train_df, test_df, 'total_biomass')
    
#     # Make future predictions
#     future_years, future_predictions = forecaster.forecast_future(train_df, periods=5)
#     print("\nFuture Predictions:")
#     for year, pred in zip(future_years, future_predictions):
#         print(f"Year {year}: {pred:.2f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class TimeSeriesForecaster:
    """Ensemble forecaster: LightGBM on engineered features plus SARIMA.

    The input frames must contain a 'Year' column and the target column.
    Every feature derived from the target uses only values from earlier
    rows, so a row's features never contain the value being predicted.
    """

    def __init__(self, seasonality=12):
        """Store the seasonal period and create the feature scaler.

        Args:
            seasonality: Period used for the SARIMA seasonal order and for
                the ``year_mod`` feature.
        """
        self.seasonality = seasonality
        self.scaler = RobustScaler()
        self.models = {}

    def plot_results(self, train_data, test_data, predictions, target_col):
        """Plot the training series, the test series and the predictions.

        Args:
            train_data: Frame with 'Year' and ``target_col`` for training rows.
            test_data: Frame with 'Year' and ``target_col`` for test rows.
            predictions: One predicted value per row of ``test_data``.
            target_col: Name of the target column (also the y-axis label).
        """
        plt.figure(figsize=(12, 6))

        # Plot training data
        plt.plot(train_data['Year'], train_data[target_col],
                 label='Training Data', color='blue', alpha=0.7)

        # Plot test data
        plt.plot(test_data['Year'], test_data[target_col],
                 label='Actual Test Data', color='green', alpha=0.7)

        # Plot predictions
        plt.plot(test_data['Year'], predictions,
                 label='Predictions', color='red',
                 linestyle='--', alpha=0.8)

        # Customize plot
        plt.title('Time Series Forecasting Results', fontsize=14, pad=20)
        plt.xlabel('Year', fontsize=12)
        plt.ylabel(target_col, fontsize=12)
        plt.legend(fontsize=10)
        plt.grid(True, alpha=0.3)

        # Rotate x-axis labels for better readability
        plt.xticks(rotation=45)

        # Adjust layout to prevent label cutoff
        plt.tight_layout()

        plt.show()

    def create_features(self, df, target_col):
        """Return a copy of *df* with lag, rolling, EMA, trend and diff features.

        All target-derived statistics are computed on the target shifted by
        one row, so the features of row ``t`` depend only on target values
        from rows ``< t``. Computing them on the unshifted series would let
        e.g. ``lag_1 + diff_1`` reproduce the target exactly.

        Args:
            df: Frame with a 'Year' column and ``target_col``.
            target_col: Name of the column to forecast.

        Returns:
            A new DataFrame containing the original columns plus the
            engineered feature columns.
        """
        df = df.copy()
        df['year'] = df['Year'].astype(int)

        target = df[target_col]
        # Target as known *before* each row; base for every statistic below.
        history = target.shift(1)

        # Lag features
        for lag in range(1, 7):
            df[f'lag_{lag}'] = target.shift(lag)

        # Rolling statistics over the preceding rows
        for window in [3, 6, 12, 24]:
            rolling = history.rolling(window=window, min_periods=1)
            df[f'rolling_mean_{window}'] = rolling.mean()
            df[f'rolling_std_{window}'] = rolling.std()
            df[f'rolling_min_{window}'] = rolling.min()
            df[f'rolling_max_{window}'] = rolling.max()

        # Exponential moving averages over the preceding rows
        for span in [3, 6, 12]:
            df[f'ema_{span}'] = history.ewm(span=span, adjust=False).mean()

        # Trend and cyclical features
        df['trend'] = np.arange(len(df))
        df['trend_squared'] = df['trend'] ** 2
        df['year_mod'] = df['year'] % self.seasonality

        # First and second differences of the already-observed target
        df['diff_1'] = history.diff()
        df['diff_2'] = history.diff().diff()

        return df

    def prepare_data(self, train_df, test_df, target_col):
        """Build scaled feature matrices for the training and test periods.

        Features are computed on the concatenation of both frames so that
        the test rows continue the training series: ``trend`` keeps counting
        and lags/rolling statistics at the start of the test period see the
        end of the training period. Test-row features therefore use actual
        target values of earlier rows (one-step-ahead setting).

        Missing feature values in both sets are filled with the training
        medians, and the scaler is fitted on the training features only.

        Args:
            train_df: Training rows, in time order.
            test_df: Test rows that follow ``train_df`` in time.
            target_col: Name of the column to forecast.

        Returns:
            Tuple ``(X_train_scaled, y_train, X_test_scaled, y_test,
            train_featured, test_featured, feature_cols)``.
        """
        n_train = len(train_df)
        combined = pd.concat([train_df, test_df], ignore_index=True)
        featured = self.create_features(combined, target_col)
        train_featured = featured.iloc[:n_train].copy()
        test_featured = featured.iloc[n_train:].copy()

        # Everything except the target and the raw 'Year' column is a feature.
        feature_cols = [col for col in featured.columns
                        if col not in (target_col, 'Year')]

        X_train = train_featured[feature_cols].copy()
        y_train = train_featured[target_col]
        X_test = test_featured[feature_cols].copy()
        y_test = test_featured[target_col]

        # Impute with statistics from the training period only.
        train_medians = X_train.median()
        X_train = X_train.fillna(train_medians)
        X_test = X_test.fillna(train_medians)

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return (X_train_scaled, y_train, X_test_scaled, y_test,
                train_featured, test_featured, feature_cols)

    def train_lightgbm(self, X_train, y_train):
        """Train LightGBM, validating on the last ``TimeSeriesSplit`` fold.

        Only the final (largest) of the five splits is used: the model is
        fitted on its training part and early-stopped on its validation part.

        Args:
            X_train: 2-D NumPy array of scaled features.
            y_train: pandas Series of targets aligned with ``X_train``.
        """
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 20,
            'learning_rate': 0.0001,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'num_iterations': 5000,
            'early_stopping_rounds': 100,
            'verbose': -1,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'min_child_samples': 20
        }

        # Take the last split: the longest history with the most recent
        # block held out for validation.
        tscv = TimeSeriesSplit(n_splits=5)
        train_idx, val_idx = list(tscv.split(X_train))[-1]
        X_train_split, X_val = X_train[train_idx], X_train[val_idx]
        y_train_split, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train_split, y_train_split)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        # Train model
        self.models['lgb'] = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            valid_names=['train', 'valid']
        )

    def train_sarima(self, y_train):
        """Fit SARIMA, choosing the orders by lowest AIC over a small grid.

        Each of p, d, q, P, D, Q takes the values 0 and 1. Combinations whose
        fit raises are skipped. If no combination can be fitted, a
        (1, 1, 1) x (1, 1, 1, seasonality) model is fitted instead.

        Args:
            y_train: Training target series.
        """
        from itertools import product

        best_aic = float('inf')
        best_fit = None

        for p, d, q, P, D, Q in product(range(2), repeat=6):
            try:
                candidate = SARIMAX(
                    y_train,
                    order=(p, d, q),
                    seasonal_order=(P, D, Q, self.seasonality)
                ).fit(disp=False)
            except Exception:
                # Some order combinations cannot be estimated; skip them but
                # let KeyboardInterrupt/SystemExit propagate.
                continue
            if candidate.aic < best_aic:
                best_aic = candidate.aic
                best_fit = candidate

        if best_fit is None:
            best_fit = SARIMAX(
                y_train,
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, self.seasonality)
            ).fit(disp=False)

        # Keep the winning fit instead of fitting the same orders again.
        self.models['sarima'] = best_fit

    def ensemble_predict(self, X_test):
        """Return the weighted ensemble forecast for *X_test*.

        LightGBM predicts from the feature rows; SARIMA forecasts
        ``len(X_test)`` steps past the end of its training series. The two
        are combined as ``0.7 * lightgbm + 0.3 * sarima``.

        Args:
            X_test: 2-D array of scaled features, one row per forecast step.
        """
        lgb_preds = self.models['lgb'].predict(X_test)
        sarima_preds = self.models['sarima'].forecast(len(X_test))

        ensemble_preds = 0.7 * lgb_preds + 0.3 * sarima_preds

        return ensemble_preds

    def fit_predict(self, train_df, test_df, target_col):
        """Run the full pipeline: features, training, prediction, metrics, plot.

        Prints the mean squared error and R² on the test set and shows the
        result plot.

        Args:
            train_df: Training rows, in time order.
            test_df: Test rows that follow ``train_df`` in time.
            target_col: Name of the column to forecast.

        Returns:
            The ensemble predictions for the test rows.

        Raises:
            Exception: Any error from the pipeline is reported with ``print``
                and then re-raised unchanged.
        """
        try:
            # Prepare data
            (X_train_scaled, y_train, X_test_scaled, y_test,
             train_data, test_data, feature_cols) = self.prepare_data(
                train_df, test_df, target_col)

            # Train models
            self.train_lightgbm(X_train_scaled, y_train)
            self.train_sarima(y_train)

            # Make predictions
            predictions = self.ensemble_predict(X_test_scaled)

            # Calculate metrics
            mse = mean_squared_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)

            print(f"Mean Squared Error: {mse:.2f}")
            print(f"R² Score: {r2:.2f}")

            # Plot results
            self.plot_results(train_data, test_data, predictions, target_col)

            return predictions

        except Exception as e:
            print(f"Error in fit_predict: {str(e)}")
            raise

In [None]:
# Load the train/test files written by the split cell above.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Fit the LightGBM + SARIMA ensemble on the training rows and evaluate it on
# the test rows, forecasting the 'total_biomass' column. fit_predict prints
# the metrics and shows the plot; the returned predictions are kept here.
forecaster = TimeSeriesForecaster(seasonality=12)
predictions = forecaster.fit_predict(train_df, test_df, 'total_biomass')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the fully imputed dataset.
df = pd.read_csv('clean3.csv')

# 'Year' is the time axis, not a measurement, so leave it out of the matrix.
df_without_year = df.drop(columns=['Year'])

# Pairwise correlation between the remaining columns.
correlation_matrix = df_without_year.corr()

# Draw the annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Matrix Excluding Year')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class TimeSeriesForecaster:
    """Ensemble forecaster: LightGBM, SARIMA and an LSTM.

    The input frames must contain a 'Year' column and the target column.
    Every feature derived from the target uses only values from earlier
    rows, so a row's features never contain the value being predicted.
    """

    def __init__(self, seasonality=12):
        """Store the seasonal period and create the scalers.

        Args:
            seasonality: Period used for the SARIMA seasonal order and for
                the ``year_mod`` feature.
        """
        self.seasonality = seasonality
        self.robust_scaler = RobustScaler()
        # Separate scalers for LSTM inputs and the LSTM target: reusing one
        # instance would overwrite the feature fit with the target fit.
        self.lstm_scaler = MinMaxScaler(feature_range=(0, 1))
        self.lstm_target_scaler = MinMaxScaler(feature_range=(0, 1))
        self.models = {}

    def plot_results(self, train_data, test_data, predictions, target_col):
        """Plot the training series, the test series and the predictions.

        Args:
            train_data: Frame with 'Year' and ``target_col`` for training rows.
            test_data: Frame with 'Year' and ``target_col`` for test rows.
            predictions: One predicted value per row of ``test_data``.
            target_col: Name of the target column (also the y-axis label).
        """
        plt.figure(figsize=(12, 6))
        plt.plot(train_data['Year'], train_data[target_col], label='Training Data', color='blue', alpha=0.7)
        plt.plot(test_data['Year'], test_data[target_col], label='Actual Test Data', color='green', alpha=0.7)
        plt.plot(test_data['Year'], predictions, label='Predictions', color='red', linestyle='--', alpha=0.8)
        plt.title('Time Series Forecasting Results', fontsize=14, pad=20)
        plt.xlabel('Year', fontsize=12)
        plt.ylabel(target_col, fontsize=12)
        plt.legend(fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    def create_features(self, df, target_col):
        """Return a copy of *df* with lag, rolling, trend and diff features.

        Rolling statistics and the difference are computed on the target
        shifted by one row, so the features of row ``t`` depend only on
        target values from rows ``< t``.

        Args:
            df: Frame with a 'Year' column and ``target_col``.
            target_col: Name of the column to forecast.

        Returns:
            A new DataFrame containing the original columns plus the
            engineered feature columns.
        """
        df = df.copy()
        df['year'] = df['Year'].astype(int)

        target = df[target_col]
        # Target as known *before* each row; base for the statistics below.
        history = target.shift(1)

        for lag in range(1, 7):
            df[f'lag_{lag}'] = target.shift(lag)
        for window in [3, 6, 12]:
            rolling = history.rolling(window=window, min_periods=1)
            df[f'rolling_mean_{window}'] = rolling.mean()
            df[f'rolling_std_{window}'] = rolling.std()
        df['trend'] = np.arange(len(df))
        df['year_mod'] = df['year'] % self.seasonality
        df['diff_1'] = history.diff()
        return df

    def prepare_data(self, train_df, test_df, target_col):
        """Build scaled feature matrices for the training and test periods.

        Features are computed on the concatenation of both frames so that
        the test rows continue the training series (``trend`` keeps counting
        and lags at the start of the test period see the end of the training
        period). Missing feature values in both sets are filled with the
        training medians, and the scaler is fitted on training features only.

        Args:
            train_df: Training rows, in time order.
            test_df: Test rows that follow ``train_df`` in time.
            target_col: Name of the column to forecast.

        Returns:
            Tuple ``(X_train_scaled, y_train, X_test_scaled, y_test,
            train_featured, test_featured, feature_cols)``.
        """
        n_train = len(train_df)
        combined = pd.concat([train_df, test_df], ignore_index=True)
        featured = self.create_features(combined, target_col)
        train_featured = featured.iloc[:n_train].copy()
        test_featured = featured.iloc[n_train:].copy()

        feature_cols = [col for col in featured.columns if col not in (target_col, 'Year')]
        train_medians = train_featured[feature_cols].median()
        X_train = train_featured[feature_cols].fillna(train_medians)
        y_train = train_featured[target_col]
        X_test = test_featured[feature_cols].fillna(train_medians)
        y_test = test_featured[target_col]

        X_train_scaled = self.robust_scaler.fit_transform(X_train)
        X_test_scaled = self.robust_scaler.transform(X_test)
        return X_train_scaled, y_train, X_test_scaled, y_test, train_featured, test_featured, feature_cols

    def prepare_lstm_data(self, X, y, time_steps=3):
        """Turn a feature matrix into sliding windows for the LSTM.

        Sample ``k`` consists of rows ``k .. k + time_steps - 1`` of *X* and
        is paired with ``y[k + time_steps]``, i.e. each window predicts the
        value of the row that follows it.

        Args:
            X: 2-D array of shape (n_rows, n_features).
            y: 1-D array-like with at least ``len(X)`` values.
            time_steps: Window length.

        Returns:
            Tuple ``(X_lstm, y_lstm)`` with ``len(X) - time_steps`` samples.
            Both arrays are empty when ``len(X) <= time_steps``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        ends = range(time_steps, len(X))
        X_lstm = [X[end - time_steps:end, :] for end in ends]
        y_lstm = [y[end] for end in ends]
        return np.array(X_lstm), np.array(y_lstm)

    def train_lightgbm(self, X_train, y_train):
        """Train LightGBM, validating on the last ``TimeSeriesSplit`` fold.

        Only the final (largest) of the five splits is used: the model is
        fitted on its training part and early-stopped on its validation part.

        Args:
            X_train: 2-D NumPy array of scaled features.
            y_train: pandas Series of targets aligned with ``X_train``.
        """
        params = {
            'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt',
            'num_leaves': 20, 'learning_rate': 0.01, 'feature_fraction': 0.8,
            'bagging_fraction': 0.8, 'bagging_freq': 5, 'num_iterations': 5000,
            'early_stopping_rounds': 100, 'verbose': -1
        }
        tscv = TimeSeriesSplit(n_splits=5)
        train_idx, val_idx = list(tscv.split(X_train))[-1]
        X_train_split, X_val = X_train[train_idx], X_train[val_idx]
        y_train_split, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        lgb_train = lgb.Dataset(X_train_split, y_train_split)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
        self.models['lgb'] = lgb.train(
            params, lgb_train, valid_sets=[lgb_train, lgb_val], valid_names=['train', 'valid']
        )

    def train_sarima(self, y_train):
        """Fit SARIMA, choosing the orders by lowest AIC over a small grid.

        Each of p, d, q, P, D, Q takes the values 0 and 1. Combinations whose
        fit raises are skipped. If no combination can be fitted, a
        (1, 1, 1) x (1, 1, 1, seasonality) model is fitted instead.

        Args:
            y_train: Training target series.
        """
        from itertools import product

        best_aic = float('inf')
        best_fit = None
        for p, d, q, P, D, Q in product(range(2), repeat=6):
            try:
                candidate = SARIMAX(
                    y_train, order=(p, d, q),
                    seasonal_order=(P, D, Q, self.seasonality)
                ).fit(disp=False)
            except Exception:
                # Some order combinations cannot be estimated; skip them but
                # let KeyboardInterrupt/SystemExit propagate.
                continue
            if candidate.aic < best_aic:
                best_aic = candidate.aic
                best_fit = candidate

        if best_fit is None:
            best_fit = SARIMAX(
                y_train, order=(1, 1, 1),
                seasonal_order=(1, 1, 1, self.seasonality)
            ).fit(disp=False)

        # Keep the winning fit instead of fitting the same orders again.
        self.models['sarima'] = best_fit

    def train_lstm(self, X_train, y_train, time_steps=3):
        """Train the LSTM on sliding windows of the training features.

        Args:
            X_train: 2-D array of LSTM-scaled features.
            y_train: 1-D array of LSTM-scaled targets aligned with ``X_train``.
            time_steps: Window length passed to ``prepare_lstm_data``.
        """
        X_train_lstm, y_train_lstm = self.prepare_lstm_data(X_train, y_train, time_steps)
        model = Sequential([
            LSTM(20, return_sequences=False, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
            Dropout(0.2), Dense(10, activation='relu'), Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
        model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, verbose=1, callbacks=[early_stopping])
        self.models['lstm'] = model

    def predict_lstm(self, X_test, time_steps=3, history=None):
        """Predict with the LSTM and return values in original target units.

        Args:
            X_test: 2-D array of LSTM-scaled test features.
            time_steps: Window length; must match the value used in training.
            history: Optional 2-D array of LSTM-scaled rows that immediately
                precede ``X_test`` (typically the training features). Its
                last ``time_steps`` rows are prepended so that every test row
                gets a prediction. Without it, the first ``time_steps`` test
                rows have no full window and the result is that much shorter.

        Returns:
            1-D array of predictions, inverse-transformed with the target
            scaler fitted in ``fit_predict``. Empty when no full window fits.
        """
        X_full = np.asarray(X_test)
        if history is not None and time_steps > 0:
            context = np.asarray(history)[-time_steps:]
            X_full = np.vstack([context, X_full])

        if len(X_full) <= time_steps:
            return np.array([])

        X_test_lstm, _ = self.prepare_lstm_data(X_full, np.zeros(len(X_full)), time_steps)
        scaled_preds = self.models['lstm'].predict(X_test_lstm)
        return self.lstm_target_scaler.inverse_transform(
            np.asarray(scaled_preds).reshape(-1, 1)).flatten()

    def fit_predict(self, train_df, test_df, target_col):
        """Run the full pipeline: features, training, prediction, metrics, plot.

        The three model outputs are combined as
        ``0.5 * lightgbm + 0.3 * sarima + 0.2 * lstm``. Prints the mean
        squared error and R² on the test set and shows the result plot.

        Args:
            train_df: Training rows, in time order.
            test_df: Test rows that follow ``train_df`` in time.
            target_col: Name of the column to forecast.

        Returns:
            NumPy array with one ensemble prediction per test row.

        Raises:
            Exception: Any error from the pipeline is reported with ``print``
                and then re-raised unchanged.
        """
        try:
            time_steps = 3
            X_train_scaled, y_train, X_test_scaled, y_test, train_data, test_data, _ = \
                self.prepare_data(train_df, test_df, target_col)

            # Features and target get their own scalers so that the feature
            # fit is still valid when the test features are transformed.
            X_train_lstm = self.lstm_scaler.fit_transform(X_train_scaled)
            X_test_lstm = self.lstm_scaler.transform(X_test_scaled)
            y_train_lstm = self.lstm_target_scaler.fit_transform(
                y_train.values.reshape(-1, 1)).flatten()

            self.train_lightgbm(X_train_scaled, y_train)
            self.train_sarima(y_train)
            self.train_lstm(X_train_lstm, y_train_lstm, time_steps)

            lgb_preds = self.models['lgb'].predict(X_test_scaled)
            sarima_preds = np.asarray(self.models['sarima'].forecast(len(y_test)))
            # Passing the training rows as history yields one LSTM prediction
            # per test row, matching the other two models in length.
            lstm_preds = self.predict_lstm(X_test_lstm, time_steps, history=X_train_lstm)

            predictions = 0.5 * lgb_preds + 0.3 * sarima_preds + 0.2 * lstm_preds
            mse = mean_squared_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)
            print(f"Mean Squared Error: {mse:.2f}")
            print(f"R² Score: {r2:.2f}")
            self.plot_results(train_data, test_data, predictions, target_col)
            return predictions
        except Exception as e:
            print(f"Error in fit_predict: {str(e)}")
            raise

In [None]:
# Example usage: load the train/test files written by the split cell above
# and run the LightGBM + SARIMA + LSTM ensemble on them.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

forecaster = TimeSeriesForecaster(seasonality=12)
# NOTE(review): 'target_column' looks like a placeholder — the earlier run in
# this notebook forecasts 'total_biomass'. Confirm the column name exists in
# train.csv before executing this cell.
predictions = forecaster.fit_predict(train_df, test_df, target_col='target_column')