<a href="https://colab.research.google.com/github/hellomada/RayField-Systems-Internship-Week-2/blob/notebooks/main_pipeline_for_use.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil

# Create the working folders if they don't exist yet.
for folder in ("data", "models", "diagrams"):
    os.makedirs(folder, exist_ok=True)

# Move the uploaded artifacts into their folders.
# The existence check keeps this cell idempotent: after the first run the
# files are no longer in the working directory, and an unconditional move
# would fail on every later run (e.g. Restart Kernel -> Run All).
# An explicit destination file path is used so an older copy already sitting
# in the folder is replaced, matching what `mv` does.
for filename, folder in (
    ("cleaned_data.csv", "data"),
    ("ensemble_model.pkl", "models"),
):
    if os.path.exists(filename):
        shutil.move(filename, os.path.join(folder, filename))


In [2]:
from final_ai_module import load_data, feature_engineering, evaluate_model, load_model
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [3]:
# Re-execute final_ai_module so edits made to the file on disk are picked up
# without restarting the kernel.
# Note: names previously bound with `from final_ai_module import ...` keep
# pointing at the pre-reload objects until that import statement is run again.
import importlib
import final_ai_module
importlib.reload(final_ai_module)


<module 'final_ai_module' from '/content/final_ai_module.py'>

In [13]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from final_ai_module import load_data, feature_engineering, evaluate_model, load_model
from sklearn.model_selection import train_test_split
from final_ai_module import train_model



ImportError: cannot import name 'train_model' from 'final_ai_module' (/content/final_ai_module.py)

In [20]:
"""
SOLAR POWER FORECASTING PIPELINE
Delivers AC power output predictions with visualization and automated reporting
"""

# -*- coding: utf-8 -*-
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ========== Deliverable 7: Reusable Functions (ai_module.py) ==========
def load_clean_data(filepath):
    """Deliverable 1.1: read a CSV file and keep only complete rows.

    Args:
        filepath: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table with every row that contains at
        least one missing value removed. The original row index labels of
        the surviving rows are kept (the index is not reset).
    """
    raw_frame = pd.read_csv(filepath)
    complete_rows = raw_frame.dropna()
    return complete_rows

def engineer_features(df):
    """Deliverable 2.1/3: build the model inputs and the target from ``df``.

    Args:
        df: DataFrame that contains the columns ``hour``, ``dc_power``,
            ``daily_yield``, ``total_yield`` and ``ac_power``. It is not
            modified.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a new DataFrame holding the four
        base columns followed by ``rolling_4h_power`` and ``yield_ratio``,
        and ``y`` is a copy of the ``ac_power`` column.
    """
    base_columns = ['hour', 'dc_power', 'daily_yield', 'total_yield']

    # Mean of dc_power over the current row and the three rows before it.
    # The first three rows have no full window and are filled with 0.
    # NOTE(review): the "4h" in the feature name assumes one row per hour -
    # confirm against the sampling interval of the data.
    smoothed_dc = df['dc_power'].rolling(4).mean().fillna(0)

    # A zero denominator is swapped for a tiny positive value so the
    # division never produces inf/NaN.
    safe_total = df['total_yield'].replace(0, 1e-6)
    ratio = df['daily_yield'] / safe_total

    # NOTE(review): dc_power is used to predict ac_power - confirm that it
    # is actually available at prediction time.
    features = df.loc[:, base_columns].assign(
        rolling_4h_power=smoothed_dc,
        yield_ratio=ratio,
    )
    target = df['ac_power'].copy()
    return features, target

def train_ensemble(X_train, y_train):
    """Deliverable 2.2/5: tune one member, then fit the voting ensemble.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``VotingRegressor`` combining a random forest, the tuned
        gradient-boosting model and a linear regression.
    """
    # Deliverable 4: grid-search the number of boosting stages with 3-fold CV
    # and keep the best configuration found.
    search = GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        {'n_estimators': [50, 100, 150]},
        cv=3,
    )
    search.fit(X_train, y_train)
    tuned_gbr = search.best_estimator_

    # Deliverable 2: assemble the ensemble members. The voting regressor is
    # fitted on the full training set below.
    members = [
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('gbr', tuned_gbr),
        ('lr', LinearRegression()),
    ]
    voter = VotingRegressor(members)
    voter.fit(X_train, y_train)
    return voter

def generate_summary(model, X_test, y_test):
    """Deliverable 10/11: render a plain-text performance report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        str: Multi-line report containing the MSE and R² of the predictions,
        the mean/max/min predicted value, and a fixed list of recommended
        actions.
    """
    predicted = model.predict(X_test)

    # Error metrics of the predictions against the held-out truth.
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    # Descriptive statistics of the predictions themselves.
    avg_out = predicted.mean()
    max_out = predicted.max()
    min_out = predicted.min()

    return f"""SOLAR FARM PERFORMANCE SUMMARY (Generated)
----------------------------------------
Model Performance:
- Mean Squared Error: {mse:.2f}
- R² Score: {r2:.2f}

Key Statistics:
- Avg Predicted Output: {avg_out:.2f} kW
- Max Predicted Output: {max_out:.2f} kW
- Min Predicted Output: {min_out:.2f} kW

Recommended Actions:
- Review periods with output < 50% of max
- Check sensor calibration when predictions diverge from actuals
"""

# ========== Deliverable 12: Main Pipeline ==========
def _mean_feature_importance(model, feature_names):
    """Return per-feature importances for ``model`` as a Series, or None.

    Uses ``model.feature_importances_`` when the model exposes it directly.
    Otherwise (e.g. a voting ensemble) averages the importances of the fitted
    sub-estimators in ``model.estimators_`` that expose the attribute.
    """
    if hasattr(model, 'feature_importances_'):
        return pd.Series(model.feature_importances_, index=feature_names)

    per_member = [
        member.feature_importances_
        for member in getattr(model, 'estimators_', [])
        if hasattr(member, 'feature_importances_')
    ]
    if not per_member:
        return None
    # One row per sub-estimator; the column-wise mean is the ensemble view.
    return pd.DataFrame(per_member, columns=feature_names).mean()


def main():
    """Deliverable 12: run the full forecasting pipeline end to end.

    Steps: load ``data/cleaned_data.csv``, engineer features, split 80/20,
    train and persist the ensemble, print MSE/R², save the forecast and
    feature-importance plots, write ``predictions.csv`` and
    ``weekly_summary.txt``, then list the artifacts that were actually
    written.

    Returns:
        None. All results are written to disk or printed.
    """
    print("=== SOLAR POWER FORECASTING PIPELINE ===")

    # 1. Data Loading (Deliverable 1)
    print("\n🔍 Loading data...")
    df = load_clean_data("data/cleaned_data.csv")
    print(f"✅ Loaded {len(df)} records")

    # 2. Feature Engineering (Deliverable 3)
    print("\n🛠 Feature engineering...")
    X, y = engineer_features(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 3. Model Training (Deliverable 2/4)
    print("\n🤖 Training ensemble model...")
    os.makedirs("models", exist_ok=True)  # Ensure models directory exists
    model = train_ensemble(X_train, y_train)
    joblib.dump(model, "models/ensemble_model.pkl")

    # 4. Evaluation (Deliverable 2)
    print("\n📊 Evaluating model...")
    preds = model.predict(X_test)
    print(f"MSE: {mean_squared_error(y_test, preds):.2f}")
    print(f"R²: {r2_score(y_test, preds):.2f}")

    # 5. Visualization (Deliverable 6)
    print("\n🎨 Generating visualizations...")
    os.makedirs("diagrams", exist_ok=True)

    # Forecast plot
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.values[:100], label='Actual', color='blue')
    plt.plot(preds[:100], label='Predicted', color='orange', linestyle='--')
    plt.title("AC Power Forecast (First 100 Samples)")
    plt.xlabel("Time Index")
    plt.ylabel("Power Output (kW)")
    plt.legend()
    plt.savefig("diagrams/forecast_plot.png")
    plt.close()

    # Feature importance.
    # The voting ensemble itself has no `feature_importances_`, so a plain
    # hasattr(model, ...) check would silently skip this plot; derive the
    # importances from the fitted sub-estimators instead.
    importance = _mean_feature_importance(model, X.columns)
    importance_saved = importance is not None
    if importance_saved:
        plt.figure(figsize=(10, 5))
        importance.sort_values().plot.barh()
        plt.title("Feature Importance")
        plt.savefig("diagrams/feature_importance.png")
        plt.close()

    # 6. Save Outputs (Deliverable 8)
    print("\n💾 Saving outputs...")
    pd.DataFrame({
        'actual': y_test,
        'predicted': preds
    }).to_csv("predictions.csv", index=False)

    # 7. Generate Summary (Deliverable 9-11)
    with open("weekly_summary.txt", "w") as f:
        f.write(generate_summary(model, X_test, y_test))

    # Report only the artifacts that were really produced in this run.
    outputs = ["predictions.csv", "diagrams/forecast_plot.png"]
    if importance_saved:
        outputs.append("diagrams/feature_importance.png")
    outputs.extend(["weekly_summary.txt", "models/ensemble_model.pkl"])

    print("\n✅ Pipeline completed successfully!")
    print("Outputs generated:")
    for path in outputs:
        print(f"- {path}")

if __name__ == "__main__":
    main()

=== SOLAR POWER FORECASTING PIPELINE ===

🔍 Loading data...
✅ Loaded 67698 records

🛠 Feature engineering...

🤖 Training ensemble model...

📊 Evaluating model...
MSE: 0.53
R²: 1.00

🎨 Generating visualizations...

💾 Saving outputs...

✅ Pipeline completed successfully!
Outputs generated:
- predictions.csv
- diagrams/forecast_plot.png
- diagrams/feature_importance.png
- weekly_summary.txt
- models/ensemble_model.pkl
