# WINDTWIN-AI: Data Exploration

This notebook explores the synthetic data generated by WINDTWIN-AI.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Setup plotting style
# Global matplotlib style sheet applied to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably only exist in
# newer matplotlib releases — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Select seaborn's 'husl' palette for subsequent plots.
sns.set_palette('husl')
# IPython magic (not plain Python): show figures inline in the notebook.
%matplotlib inline

## 1. Load Data

In [None]:
# Find latest data files.
# Path.glob() yields entries in an arbitrary, filesystem-dependent order, so
# the candidates are sorted explicitly; otherwise "[-1]" is just "whatever the
# OS listed last" rather than the most recent file.
def _mtime_then_name(path):
    """Sort key: modification time, with the file name as a stable tie-break."""
    return (path.stat().st_mtime, path.name)


weather_files = sorted(Path('../data/weather').glob('*.csv'), key=_mtime_then_name)
scada_files = sorted(Path('../data/scada').glob('*.csv'), key=_mtime_then_name)

if not weather_files or not scada_files:
    # Nothing to load: tell the reader how to produce the data. The frames
    # below stay undefined, so later cells cannot run until data exists.
    print("No data found! Generate data first with:")
    print("python main.py generate --days 7")
else:
    # Load latest files (last element after the sort above), parsing the
    # 'timestamp' column as datetimes so the .dt accessor and time-axis
    # plots work in later cells.
    weather_df = pd.read_csv(weather_files[-1], parse_dates=['timestamp'])
    scada_df = pd.read_csv(scada_files[-1], parse_dates=['timestamp'])

    print(f"Loaded weather data: {len(weather_df):,} samples")
    print(f"Loaded SCADA data: {len(scada_df):,} samples")

## 2. Weather Data Overview

In [None]:
# Descriptive statistics for the weather frame, followed by a preview of its
# leading rows (the bare last expression is rendered as the cell output).
weather_summary = weather_df.describe()
print("Weather Data Summary:", weather_summary, sep="\n")
print("\nFirst few rows:")
weather_df.head()

In [None]:
# Plot weather parameters: one stacked panel per variable.
# Each entry is (column, y-axis label, extra line styling); an empty styling
# dict leaves the line colour to the active style's default.
weather_panels = [
    ('wind_speed_ms', 'Wind Speed (m/s)', {}),
    ('temperature_c', 'Temperature (°C)', {'color': 'orange'}),
    ('pressure_hpa', 'Pressure (hPa)', {'color': 'green'}),
]

fig, axes = plt.subplots(len(weather_panels), 1, figsize=(15, 10))

for panel_ax, (column, y_label, line_style) in zip(axes, weather_panels):
    panel_ax.plot(weather_df['timestamp'], weather_df[column], linewidth=0.5, **line_style)
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True, alpha=0.3)

# The figure title sits on the top panel; only the bottom panel labels the x-axis.
axes[0].set_title('Weather Conditions Over Time')
axes[-1].set_xlabel('Time')

plt.tight_layout()
plt.show()

## 3. SCADA Data Overview

In [None]:
# Descriptive statistics for the SCADA frame, followed by a preview of its
# leading rows (the bare last expression is rendered as the cell output).
scada_summary = scada_df.describe()
print("SCADA Data Summary:", scada_summary, sep="\n")
print("\nFirst few rows:")
scada_df.head()

In [None]:
# Plot turbine parameters: power, component temperatures and vibration,
# stacked in three panels.
fig, axes = plt.subplots(3, 1, figsize=(15, 10))
power_ax, temp_ax, vibration_ax = axes
timestamps = scada_df['timestamp']

# Power output, with a dashed reference line drawn at 2500 kW.
power_ax.plot(timestamps, scada_df['power_kw'], linewidth=0.5)
power_ax.axhline(y=2500, color='r', linestyle='--', label='Rated Power', alpha=0.7)
power_ax.set_ylabel('Power Output (kW)')
power_ax.set_title('Turbine Performance Over Time')
power_ax.legend()

# Generator and gearbox temperatures share one panel.
for series_label, column in (('Generator', 'generator_temp_c'), ('Gearbox', 'gearbox_temp_c')):
    temp_ax.plot(timestamps, scada_df[column], linewidth=0.5, label=series_label)
temp_ax.set_ylabel('Temperature (°C)')
temp_ax.legend()

# Vibration on the bottom panel, which also carries the x-axis label.
vibration_ax.plot(timestamps, scada_df['vibration_mms'], linewidth=0.5, color='red')
vibration_ax.set_ylabel('Vibration (mm/s)')
vibration_ax.set_xlabel('Time')

# Same light grid on every panel.
for panel_ax in axes:
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Power Curve Analysis

In [None]:
# Join weather and SCADA rows on their shared 'timestamp'; any other column
# present in both frames is disambiguated with the given suffixes.
merged_df = weather_df.merge(scada_df, on='timestamp', suffixes=('_weather', '_scada'))

# Power curve: measured power against wind speed, one small dot per sample.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(merged_df['wind_speed_ms'], merged_df['power_kw'], alpha=0.3, s=1)
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('Power Output (kW)')
ax.set_title('Wind Turbine Power Curve')

# Dashed reference lines: one horizontal power level, then three wind speeds
# given as (x position, colour, legend label).
ax.axhline(y=2500, color='r', linestyle='--', label='Rated Power')
reference_speeds = [
    (3, 'g', 'Cut-in Speed'),
    (12, 'b', 'Rated Speed'),
    (25, 'orange', 'Cut-out Speed'),
]
for speed, colour, legend_label in reference_speeds:
    ax.axvline(x=speed, color=colour, linestyle='--', alpha=0.5, label=legend_label)

ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 5. Operational Status Distribution

In [None]:
# Share of samples per operational status code, shown as a pie chart.
status_map = {
    0: 'Stopped',
    1: 'Starting',
    2: 'Running',
    3: 'Stopping',
    4: 'Fault',
}
status_counts = scada_df['status'].value_counts()

# Translate each observed code to its label; codes missing from the map are
# kept visible as "Unknown (<code>)" rather than dropped.
status_labels = []
for code in status_counts.index:
    if code in status_map:
        status_labels.append(status_map[code])
    else:
        status_labels.append(f'Unknown ({code})')

fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(status_counts.values, labels=status_labels, autopct='%1.1f%%', startangle=90)
ax.set_title('Operational Status Distribution')
ax.axis('equal')
plt.show()

## 6. Energy Production Analysis

In [None]:
# Calculate daily energy production.
#
# Energy per day = sum(power samples) * sampling interval. The interval is
# derived from the spacing between distinct timestamps rather than hard-coded,
# so the result stays correct if the data was generated at another cadence.
# The median spacing is used so occasional gaps do not skew it. When it
# cannot be inferred (fewer than two distinct timestamps) the previous
# assumption of 10-minute samples is used.
timestamp_steps = scada_df['timestamp'].drop_duplicates().sort_values().diff().dropna()
if timestamp_steps.empty:
    sample_interval_hours = 10 / 60
else:
    sample_interval_hours = timestamp_steps.median() / pd.Timedelta(hours=1)

# Group on a derived date key instead of writing a 'date' column into
# scada_df: the shared frame is left untouched, so re-running earlier cells
# shows the same data they showed before.
daily_power_sum_kw = (
    scada_df.groupby(scada_df['timestamp'].dt.date)['power_kw'].sum().rename_axis('date')
)
daily_energy = daily_power_sum_kw * sample_interval_hours / 1000  # kWh -> MWh

plt.figure(figsize=(12, 6))
plt.bar(range(len(daily_energy)), daily_energy.values)
plt.xlabel('Day')
plt.ylabel('Energy Production (MWh)')
plt.title('Daily Energy Production')
plt.grid(True, alpha=0.3, axis='y')
plt.show()

print(f"\nTotal Energy: {daily_energy.sum():.1f} MWh")
print(f"Average Daily: {daily_energy.mean():.1f} MWh")
print(f"Best Day: {daily_energy.max():.1f} MWh")
print(f"Worst Day: {daily_energy.min():.1f} MWh")

## 7. Correlation Analysis

In [None]:
# Columns from the merged frame that enter the correlation matrix.
corr_columns = [
    'wind_speed_ms',
    'temperature_c',
    'pressure_hpa',
    'power_kw',
    'rotor_speed_rpm',
    'generator_temp_c',
    'vibration_mms',
]
corr_df = merged_df[corr_columns]

# Pairwise correlation between the selected columns.
corr_matrix = corr_df.corr()

# Annotated heatmap; the colour scale is centred on zero.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Parameter Correlation Matrix')
fig.tight_layout()
plt.show()

## 8. Next Steps

- Implement digital twin simulation
- Build forecasting models
- Develop anomaly detection
- Create interactive dashboard