# Notebook: Data Generation and Exploration

"""
# IoT Predictive Maintenance - Data Generation

This notebook demonstrates the data generation process and initial exploration.
"""

# %% Cell 1: Setup and Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.config import get_config
from src.data.sensor_simulator import SensorSimulator
from src.data.data_validator import DataValidator
from src.data.data_loader import DataLoader

# Visualization settings
# The style sheet is applied first and the seaborn palette second,
# preserving the original call order.
plot_style = 'seaborn-v0_8-darkgrid'
plot_palette = "husl"
plt.style.use(plot_style)
sns.set_palette(plot_palette)

print("✓ Imports successful")

# %% Cell 2: Load Configuration
# Fetch the project configuration object and echo the data-generation
# settings that the rest of the notebook relies on.
config = get_config()

print("Configuration Loaded:")
# (display label, dotted config key, unit suffix) for each echoed setting
for label, key, unit in (
    ("Sampling Rate", 'data_generation.sampling_rate', " Hz"),
    ("Duration", 'data_generation.duration_days', " days"),
    ("Number of Machines", 'data_generation.num_machines', ""),
):
    print(f"  {label}: {config.get(key)}{unit}")

# %% Cell 3: Generate Data for One Machine
# Build a simulator from the loaded configuration and ask it for one
# machine's data; the resulting frame `df` is what every later cell reads.
machine_id = "machine_001"
simulator = SensorSimulator(config)

print(f"Generating data for {machine_id}...")
df = simulator.generate_data(machine_id)

sample_count = len(df)
column_names = df.columns.tolist()
print(f"\nGenerated {sample_count:,} samples")
print(f"Columns: {column_names}")
print("\nFirst few rows:")
print(df.head())

# %% Cell 4: Data Overview
# Print a quick structural summary of the generated frame: size, memory,
# dtypes, missing values, anomaly share and failure-type counts.
banner = "=" * 80
print("\n" + banner)
print("DATA OVERVIEW")
print(banner)

print(f"\nShape: {df.shape}")
# deep=True also counts the contents of object columns; bytes -> MiB
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory Usage: {memory_mb:.2f} MB")

print("\nData Types:")
print(df.dtypes)

print("\nMissing Values:")
print(df.isnull().sum())

# The flag column is summed for the count and averaged for the percentage
anomaly_flags = df['is_anomaly']
anomaly_count = anomaly_flags.sum()
anomaly_pct = anomaly_flags.mean() * 100
print(f"\nAnomalies: {anomaly_count} ({anomaly_pct:.2f}%)")

print("\nFailure Types:")
failure_type_counts = df['failure_type'].value_counts()
print(failure_type_counts)

# %% Cell 5: Sensor Statistics
# Sensor columns of `df`; the plotting and correlation cells below reuse
# this list, so its name and contents are kept as-is.
sensor_cols = ['vibration_rms', 'temperature', 'pressure', 'current']

print("\nSensor Statistics:")
sensor_stats = df[sensor_cols].describe()
print(sensor_stats)

# %% Cell 6: Visualize Sensor Data
# One stacked time-series panel per entry in sensor_cols: rows with
# is_anomaly == 0 are drawn as a blue line, rows with is_anomaly == 1 are
# overlaid as red points. The figure is saved as a PNG and then shown.

# savefig does not create missing parent folders, so make sure the target
# directory exists before any plotting work is done.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# The masks depend only on the is_anomaly column, so build them once
# instead of once per sensor.
normal_mask = df['is_anomaly'] == 0
anomaly_mask = df['is_anomaly'] == 1

# The panel count follows sensor_cols rather than a hard-coded 4.
# squeeze=False keeps `axes` a 2-D array even for a single sensor, so the
# column slice below always yields a 1-D sequence of Axes.
n_sensors = len(sensor_cols)
fig, axes = plt.subplots(n_sensors, 1, figsize=(16, 3 * n_sensors), squeeze=False)
axes = axes[:, 0]

# Plot each sensor
for ax, col in zip(axes, sensor_cols):
    # Plot normal data
    ax.plot(df.loc[normal_mask, 'timestamp'],
            df.loc[normal_mask, col],
            color='blue', alpha=0.5, linewidth=0.5, label='Normal')

    # Plot anomalies
    ax.scatter(df.loc[anomaly_mask, 'timestamp'],
               df.loc[anomaly_mask, col],
               color='red', s=1, alpha=0.8, label='Anomaly')

    ax.set_ylabel(col.replace('_', ' ').title())
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
axes[-1].set_xlabel('Timestamp')
plt.suptitle('Sensor Data Over Time', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_data_overview.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 7: Failure Event Analysis
# Print the simulator's failure summary and draw it as a horizontal timeline:
# one bar per row, starting at start_time and spanning to end_time.
failure_summary = simulator.get_failure_summary()

print("\nFailure Events:")
print(failure_summary)

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Plot failure timeline
fig, ax = plt.subplots(figsize=(16, 6))

# failure_type -> colour. Assigning the colour explicitly keeps every bar of
# one failure type the same colour, so the single legend entry per type is
# accurate. It also replaces the per-iteration legend lookup with a dict
# membership test.
type_colors = {}

for idx, row in failure_summary.iterrows():
    start = row['start_time']
    end = row['end_time']
    failure_type = row['failure_type']

    first_of_type = failure_type not in type_colors
    if first_of_type:
        # "C<n>" indexes into the active colour cycle
        type_colors[failure_type] = f"C{len(type_colors)}"

    # Bar width in fractional days. `.days` alone would truncate events
    # shorter than one day to zero width.
    duration_days = (end - start).total_seconds() / 86400

    ax.barh(idx, duration_days, left=start, height=0.8,
            color=type_colors[failure_type],
            label=failure_type if first_of_type else "")

ax.set_xlabel('Date')
ax.set_ylabel('Failure Event #')
ax.set_title('Failure Event Timeline', fontsize=16, fontweight='bold')
# Skip the legend when there are no events; an empty legend only emits a warning
if type_colors:
    ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(figures_dir / 'failure_timeline.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 8: Correlation Analysis
# Pairwise correlation between the sensor columns, rendered as an annotated
# heatmap, saved as a PNG and then shown.
correlation = df[sensor_cols].corr()

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 8))
# vmin/vmax pin the colour scale to the full correlation range, so the
# midpoint of the diverging 'coolwarm' map always means zero correlation.
# Without them the scale stretches to whatever min/max the data has.
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            vmin=-1, vmax=1,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Sensor Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 9: Distribution Analysis
# Per-sensor density histograms, with rows where is_anomaly == 0 in blue and
# rows where is_anomaly == 1 in red, laid out two panels per row.

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# The masks depend only on is_anomaly; build them once for all sensors
normal_mask = df['is_anomaly'] == 0
anomaly_mask = df['is_anomaly'] == 1

# Grid size follows sensor_cols (2x2 for the four default sensors) instead of
# being fixed, so a longer sensor list no longer indexes past the axes array.
n_cols = 2
n_rows = -(-len(sensor_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, sensor_cols):
    # Normal distribution
    # NaNs are dropped because hist cannot derive a bin range from them
    normal_data = df.loc[normal_mask, col].dropna()
    if not normal_data.empty:
        ax.hist(normal_data, bins=50, alpha=0.7, label='Normal', color='blue', density=True)

    # Anomaly distribution
    anomaly_data = df.loc[anomaly_mask, col].dropna()
    if not anomaly_data.empty:
        ax.hist(anomaly_data, bins=50, alpha=0.7, label='Anomaly', color='red', density=True)

    ax.set_xlabel(col.replace('_', ' ').title())
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide any trailing panel left over when the sensor count is odd
for spare_ax in axes[len(sensor_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Sensor Data Distributions', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 10: Data Validation
# Run the project's DataValidator over the generated frame and print its
# report. `results` is kept because the summary cell reads
# results['summary']['data_quality_score'] from it.
validator = DataValidator()
results = validator.validate(df)
# NOTE(review): print_report() takes no arguments, so it presumably reports on
# the validate() call above — confirm against DataValidator.
validator.print_report()

# %% Cell 11: Save Data
# Write the generated frame to <paths.data_synthetic>/machine_001_data.csv.
# The destination is built with pathlib rather than string concatenation, so
# a trailing slash in the configured directory cannot produce a doubled
# separator. The directory is created first because to_csv does not create
# missing parent folders.
output_dir = Path(config.get('paths.data_synthetic'))
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / 'machine_001_data.csv'
df.to_csv(output_path, index=False)
print(f"\n✓ Data saved to: {output_path}")

# %% Cell 12: Summary
# Closing recap: sample count, number of failure events, the validator's
# quality score, where the CSV went, and the follow-up steps.
rule = "=" * 80
print("\n" + rule)
print("GENERATION COMPLETE")
print(rule)

print(f"\n✓ Generated {len(df):,} samples")
print(f"✓ {len(failure_summary)} failure events created")
quality_score = results['summary']['data_quality_score']
print(f"✓ Data quality score: {quality_score:.2%}")
print(f"✓ Saved to: {output_path}")

print("\nNext Steps:")
next_steps = ("Feature Engineering", "Model Development", "Evaluation")
for step_no, step_name in enumerate(next_steps, start=1):
    print(f"  {step_no}. {step_name}")

# In [None]:
# Notebook: Data Generation and Exploration

"""
# IoT Predictive Maintenance - Data Generation

This notebook demonstrates the data generation process and initial exploration.
"""

# %% Cell 1: Setup and Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.config import get_config
from src.data.sensor_simulator import SensorSimulator
from src.data.data_validator import DataValidator
from src.data.data_loader import DataLoader

# Visualization settings
# The style sheet is applied first and the seaborn palette second,
# preserving the original call order.
plot_style = 'seaborn-v0_8-darkgrid'
plot_palette = "husl"
plt.style.use(plot_style)
sns.set_palette(plot_palette)

print("✓ Imports successful")

# %% Cell 2: Load Configuration
# Fetch the project configuration object and echo the data-generation
# settings that the rest of the notebook relies on.
config = get_config()

print("Configuration Loaded:")
# (display label, dotted config key, unit suffix) for each echoed setting
for label, key, unit in (
    ("Sampling Rate", 'data_generation.sampling_rate', " Hz"),
    ("Duration", 'data_generation.duration_days', " days"),
    ("Number of Machines", 'data_generation.num_machines', ""),
):
    print(f"  {label}: {config.get(key)}{unit}")

# %% Cell 3: Generate Data for One Machine
# Build a simulator from the loaded configuration and ask it for one
# machine's data; the resulting frame `df` is what every later cell reads.
machine_id = "machine_001"
simulator = SensorSimulator(config)

print(f"Generating data for {machine_id}...")
df = simulator.generate_data(machine_id)

sample_count = len(df)
column_names = df.columns.tolist()
print(f"\nGenerated {sample_count:,} samples")
print(f"Columns: {column_names}")
print("\nFirst few rows:")
print(df.head())

# %% Cell 4: Data Overview
# Print a quick structural summary of the generated frame: size, memory,
# dtypes, missing values, anomaly share and failure-type counts.
banner = "=" * 80
print("\n" + banner)
print("DATA OVERVIEW")
print(banner)

print(f"\nShape: {df.shape}")
# deep=True also counts the contents of object columns; bytes -> MiB
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory Usage: {memory_mb:.2f} MB")

print("\nData Types:")
print(df.dtypes)

print("\nMissing Values:")
print(df.isnull().sum())

# The flag column is summed for the count and averaged for the percentage
anomaly_flags = df['is_anomaly']
anomaly_count = anomaly_flags.sum()
anomaly_pct = anomaly_flags.mean() * 100
print(f"\nAnomalies: {anomaly_count} ({anomaly_pct:.2f}%)")

print("\nFailure Types:")
failure_type_counts = df['failure_type'].value_counts()
print(failure_type_counts)

# %% Cell 5: Sensor Statistics
# Sensor columns of `df`; the plotting and correlation cells below reuse
# this list, so its name and contents are kept as-is.
sensor_cols = ['vibration_rms', 'temperature', 'pressure', 'current']

print("\nSensor Statistics:")
sensor_stats = df[sensor_cols].describe()
print(sensor_stats)

# %% Cell 6: Visualize Sensor Data
# One stacked time-series panel per entry in sensor_cols: rows with
# is_anomaly == 0 are drawn as a blue line, rows with is_anomaly == 1 are
# overlaid as red points. The figure is saved as a PNG and then shown.

# savefig does not create missing parent folders, so make sure the target
# directory exists before any plotting work is done.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# The masks depend only on the is_anomaly column, so build them once
# instead of once per sensor.
normal_mask = df['is_anomaly'] == 0
anomaly_mask = df['is_anomaly'] == 1

# The panel count follows sensor_cols rather than a hard-coded 4.
# squeeze=False keeps `axes` a 2-D array even for a single sensor, so the
# column slice below always yields a 1-D sequence of Axes.
n_sensors = len(sensor_cols)
fig, axes = plt.subplots(n_sensors, 1, figsize=(16, 3 * n_sensors), squeeze=False)
axes = axes[:, 0]

# Plot each sensor
for ax, col in zip(axes, sensor_cols):
    # Plot normal data
    ax.plot(df.loc[normal_mask, 'timestamp'],
            df.loc[normal_mask, col],
            color='blue', alpha=0.5, linewidth=0.5, label='Normal')

    # Plot anomalies
    ax.scatter(df.loc[anomaly_mask, 'timestamp'],
               df.loc[anomaly_mask, col],
               color='red', s=1, alpha=0.8, label='Anomaly')

    ax.set_ylabel(col.replace('_', ' ').title())
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
axes[-1].set_xlabel('Timestamp')
plt.suptitle('Sensor Data Over Time', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_data_overview.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 7: Failure Event Analysis
# Print the simulator's failure summary and draw it as a horizontal timeline:
# one bar per row, starting at start_time and spanning to end_time.
failure_summary = simulator.get_failure_summary()

print("\nFailure Events:")
print(failure_summary)

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Plot failure timeline
fig, ax = plt.subplots(figsize=(16, 6))

# failure_type -> colour. Assigning the colour explicitly keeps every bar of
# one failure type the same colour, so the single legend entry per type is
# accurate. It also replaces the per-iteration legend lookup with a dict
# membership test.
type_colors = {}

for idx, row in failure_summary.iterrows():
    start = row['start_time']
    end = row['end_time']
    failure_type = row['failure_type']

    first_of_type = failure_type not in type_colors
    if first_of_type:
        # "C<n>" indexes into the active colour cycle
        type_colors[failure_type] = f"C{len(type_colors)}"

    # Bar width in fractional days. `.days` alone would truncate events
    # shorter than one day to zero width.
    duration_days = (end - start).total_seconds() / 86400

    ax.barh(idx, duration_days, left=start, height=0.8,
            color=type_colors[failure_type],
            label=failure_type if first_of_type else "")

ax.set_xlabel('Date')
ax.set_ylabel('Failure Event #')
ax.set_title('Failure Event Timeline', fontsize=16, fontweight='bold')
# Skip the legend when there are no events; an empty legend only emits a warning
if type_colors:
    ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(figures_dir / 'failure_timeline.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 8: Correlation Analysis
# Pairwise correlation between the sensor columns, rendered as an annotated
# heatmap, saved as a PNG and then shown.
correlation = df[sensor_cols].corr()

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 8))
# vmin/vmax pin the colour scale to the full correlation range, so the
# midpoint of the diverging 'coolwarm' map always means zero correlation.
# Without them the scale stretches to whatever min/max the data has.
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            vmin=-1, vmax=1,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Sensor Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 9: Distribution Analysis
# Per-sensor density histograms, with rows where is_anomaly == 0 in blue and
# rows where is_anomaly == 1 in red, laid out two panels per row.

# savefig does not create missing parent folders
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# The masks depend only on is_anomaly; build them once for all sensors
normal_mask = df['is_anomaly'] == 0
anomaly_mask = df['is_anomaly'] == 1

# Grid size follows sensor_cols (2x2 for the four default sensors) instead of
# being fixed, so a longer sensor list no longer indexes past the axes array.
n_cols = 2
n_rows = -(-len(sensor_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, sensor_cols):
    # Normal distribution
    # NaNs are dropped because hist cannot derive a bin range from them
    normal_data = df.loc[normal_mask, col].dropna()
    if not normal_data.empty:
        ax.hist(normal_data, bins=50, alpha=0.7, label='Normal', color='blue', density=True)

    # Anomaly distribution
    anomaly_data = df.loc[anomaly_mask, col].dropna()
    if not anomaly_data.empty:
        ax.hist(anomaly_data, bins=50, alpha=0.7, label='Anomaly', color='red', density=True)

    ax.set_xlabel(col.replace('_', ' ').title())
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide any trailing panel left over when the sensor count is odd
for spare_ax in axes[len(sensor_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Sensor Data Distributions', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(figures_dir / 'sensor_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 10: Data Validation
# Run the project's DataValidator over the generated frame and print its
# report. `results` is kept because the summary cell reads
# results['summary']['data_quality_score'] from it.
validator = DataValidator()
results = validator.validate(df)
# NOTE(review): print_report() takes no arguments, so it presumably reports on
# the validate() call above — confirm against DataValidator.
validator.print_report()

# %% Cell 11: Save Data
# Write the generated frame to <paths.data_synthetic>/machine_001_data.csv.
# The destination is built with pathlib rather than string concatenation, so
# a trailing slash in the configured directory cannot produce a doubled
# separator. The directory is created first because to_csv does not create
# missing parent folders.
output_dir = Path(config.get('paths.data_synthetic'))
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / 'machine_001_data.csv'
df.to_csv(output_path, index=False)
print(f"\n✓ Data saved to: {output_path}")

# %% Cell 12: Summary
# Closing recap: sample count, number of failure events, the validator's
# quality score, where the CSV went, and the follow-up steps.
rule = "=" * 80
print("\n" + rule)
print("GENERATION COMPLETE")
print(rule)

print(f"\n✓ Generated {len(df):,} samples")
print(f"✓ {len(failure_summary)} failure events created")
quality_score = results['summary']['data_quality_score']
print(f"✓ Data quality score: {quality_score:.2%}")
print(f"✓ Saved to: {output_path}")

print("\nNext Steps:")
next_steps = ("Feature Engineering", "Model Development", "Evaluation")
for step_no, step_name in enumerate(next_steps, start=1):
    print(f"  {step_no}. {step_name}")