In [1]:
from pathlib import Path

import pandas as pd
import sys
import os

# Treat the parent of the current working directory as the project root and
# register it on sys.path (once) so that the `src` package can be imported.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from src.data.loader import (
    get_dataset_summary,
    load_generation_data,
    load_solar_data,
    load_weather_data,
    validate_data_integrity,
)

In [3]:
# Read the raw generation and weather tables from ../data/raw through the
# project loader; the call returns the two frames as a pair.
generation_df, weather_df = load_solar_data(Path('../data/raw'))

INFO:src.data.loader:Loaded generation data: 68778 rows, 7 columns
  df["DATE_TIME"] = pd.to_datetime(df["DATE_TIME"])
INFO:src.data.loader:Loaded weather data: 3182 rows, 6 columns


In [4]:
# Preview the first three generation rows to check the column layout.
generation_df.head(3)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,2020-05-15,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,2020-05-15,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,2020-05-15,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0


In [5]:
# Preview the first three weather-sensor rows to check the column layout.
weather_df.head(3)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
1,2020-05-15 00:15:00,4135001,HmiyD2TTLFNqkNe,25.084589,22.761668,0.0
2,2020-05-15 00:30:00,4135001,HmiyD2TTLFNqkNe,24.935753,22.592306,0.0


In [6]:
# Show the loader's static description of the dataset (source, expected
# shapes, target column and the column lists for both tables).
get_dataset_summary()

{'dataset_source': 'Kaggle - Solar Power Generation Data',
 'focus': 'Plant 1 only',
 'time_period': '34 days continuous data',
 'frequency': 'Every 15 minutes',
 'expected_shapes': {'generation': (68778, 7), 'weather': (3182, 6)},
 'primary_target': 'AC_POWER (solar power generation in kW)',
 'generation_columns': ['DATE_TIME',
  'PLANT_ID',
  'SOURCE_KEY',
  'DC_POWER',
  'AC_POWER',
  'DAILY_YIELD',
  'TOTAL_YIELD'],
 'weather_columns': ['DATE_TIME',
  'PLANT_ID',
  'SOURCE_KEY',
  'AMBIENT_TEMPERATURE',
  'MODULE_TEMPERATURE',
  'IRRADIATION']}

In [7]:
# Run the loader's integrity checks on both frames. The returned dictionary
# reports row counts, date ranges, per-column missing values and a list of
# data-quality issues.
# NOTE(review): the recorded run flags plant-ID problems for both frames even
# though the previewed rows only show PLANT_ID 4135001 — confirm which plant ID
# the validator expects before trusting these two findings.
validate_data_integrity(generation_df, weather_df)

INFO:src.data.loader:Data validation completed. Found 2 issues.


{'row_counts': {'generation': 68778, 'weather': 3182},
 'date_ranges': {'generation': {'start': Timestamp('2020-05-15 00:00:00'),
   'end': Timestamp('2020-06-17 23:45:00')},
  'weather': {'start': Timestamp('2020-05-15 00:00:00'),
   'end': Timestamp('2020-06-17 23:45:00')}},
 'missing_values': {'generation': {'DATE_TIME': 0,
   'PLANT_ID': 0,
   'SOURCE_KEY': 0,
   'DC_POWER': 0,
   'AC_POWER': 0,
   'DAILY_YIELD': 0,
   'TOTAL_YIELD': 0},
  'weather': {'DATE_TIME': 0,
   'PLANT_ID': 0,
   'SOURCE_KEY': 0,
   'AMBIENT_TEMPERATURE': 0,
   'MODULE_TEMPERATURE': 0,
   'IRRADIATION': 0}},
 'data_quality_issues': ['Generation data contains multiple or incorrect plant IDs',
  'Weather data contains multiple or incorrect plant IDs']}

In [8]:
from src.data.preprocessor import SolarDataPreprocessor

In [9]:
# Configure the preprocessor with an hourly ('1H') target frequency,
# lag_hours of 24, 48 and 72, and the 'standard' scaling method.
# NOTE(review): recent pandas releases deprecate the uppercase 'H' offset alias
# in favour of 'h' — confirm how SolarDataPreprocessor consumes this string
# before changing it.
preprocessor = SolarDataPreprocessor(target_frequency='1H', lag_hours=[24, 48, 72], scaling_method='standard')

INFO:src.data.preprocessor:Initialized SolarDataPreprocessor with frequency=1H


In [10]:
# Before fit_transform has been called the fitted flag is expected to be False.
preprocessor.is_fitted

False

In [11]:
# Run the complete preprocessing pipeline on the Plant 1 generation and
# weather CSV files. The call returns the engineered feature frame together
# with a metadata dictionary; the frame's shape is printed as a sanity check.
features_df, metadata = preprocessor.fit_transform(
    "../data/raw/Plant_1_Generation_Data.csv",
    "../data/raw/Plant_1_Weather_Sensor_Data.csv"
)
print(f"Features shape: {features_df.shape}")

INFO:src.data.preprocessor:Starting complete preprocessing pipeline...
INFO:src.data.preprocessor:Loading generation and weather data...
INFO:src.data.preprocessor:Successfully merged data: 68,774 records
INFO:src.data.preprocessor:Creating temporal features...
INFO:src.data.preprocessor:Temporal features created successfully
INFO:src.data.preprocessor:Creating weather-derived features...
INFO:src.data.preprocessor:Weather features created successfully
INFO:src.data.preprocessor:Creating lag features...
INFO:src.data.preprocessor:Lag features created successfully
INFO:src.data.preprocessor:Resampling to 1H frequency...
INFO:src.data.preprocessor:Resampled to 816 records
INFO:src.data.preprocessor:Validating data quality...
INFO:src.data.preprocessor:Data quality score: 1.000
INFO:src.data.preprocessor:Preprocessing complete! Final shape: (796, 45)
INFO:src.data.preprocessor:Total features: 44


Features shape: (796, 45)


In [12]:
# After fit_transform the same flag is expected to read True.
preprocessor.is_fitted

True

In [13]:
# Inspect the metadata returned by fit_transform: load/merge statistics and
# the data-quality report.
# NOTE(review): the recorded quality report lists 347 missing values for
# irradiation_category against 20 for the other feature columns — possibly
# zero-irradiation rows falling outside the category bins; confirm the binning.
metadata

{'load_metadata': {'generation_records': 68778,
  'weather_records': 3182,
  'merged_records': 68774,
  'date_range': {'start': Timestamp('2020-05-15 00:00:00'),
   'end': Timestamp('2020-06-17 23:45:00')},
  'merge_success_rate': 0.9999418418680392},
 'quality_report': {'total_records': 816,
  'missing_values': {'DATE_TIME': 0,
   'AC_POWER': 20,
   'DC_POWER': 20,
   'DAILY_YIELD': 20,
   'AMBIENT_TEMPERATURE': 20,
   'MODULE_TEMPERATURE': 20,
   'IRRADIATION': 20,
   'hour': 20,
   'day_of_year': 20,
   'month': 20,
   'weekday': 20,
   'is_daylight': 20,
   'solar_elevation_proxy': 20,
   'hour_sin': 20,
   'hour_cos': 20,
   'day_sin': 20,
   'day_cos': 20,
   'season': 20,
   'temp_difference': 20,
   'temp_ratio': 20,
   'irradiation_per_temp': 20,
   'power_efficiency': 20,
   'temp_irradiation_interaction': 20,
   'temp_category': 20,
   'irradiation_category': 347,
   'ac_power_lag_24h': 20,
   'irradiation_lag_24h': 20,
   'ac_power_lag_48h': 20,
   'irradiation_lag_48h': 20

In [14]:
# Display the engineered feature frame (the notebook shows a truncated view).
# NOTE(review): in the recorded output AC_POWER reads 0.0 on rows where
# DC_POWER reads -0.795368, which suggests the target column is left unscaled
# while the other numeric columns are standardized — confirm this is intended.
features_df

Unnamed: 0,DATE_TIME,AC_POWER,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,hour,day_of_year,month,...,season_spring,season_summer,temp_category_cool,temp_category_optimal,temp_category_warm,temp_category_hot,irradiation_category_low,irradiation_category_medium,irradiation_category_high,irradiation_category_peak
0,2020-05-15 00:00:00,0.0,-0.795368,-1.074052,-0.161791,-0.704072,-0.78339,-1.694059,-1.702831,-1.025449,...,True,False,False,True,False,False,False,False,False,False
1,2020-05-15 01:00:00,0.0,-0.795368,-1.074052,-0.264930,-0.722684,-0.78339,-1.548372,-1.702831,-1.025449,...,True,False,False,True,False,False,False,False,False,False
2,2020-05-15 02:00:00,0.0,-0.795368,-1.074052,-0.169556,-0.632283,-0.78339,-1.402686,-1.702831,-1.025449,...,True,False,False,True,False,False,False,False,False,False
3,2020-05-15 03:00:00,0.0,-0.795368,-1.074052,-0.179238,-0.581528,-0.78339,-1.257000,-1.702831,-1.025449,...,True,False,False,True,False,False,False,False,False,False
4,2020-05-15 04:00:00,0.0,-0.795368,-1.074052,-0.384172,-0.746637,-0.78339,-1.111314,-1.702831,-1.025449,...,True,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,2020-06-17 19:00:00,0.0,-0.795368,0.843577,-0.655679,-0.722664,-0.78339,1.073978,1.659566,0.975182,...,False,True,False,True,False,False,False,False,False,False
812,2020-06-17 20:00:00,0.0,-0.795368,0.806543,-0.790648,-0.837760,-0.78339,1.219664,1.659566,0.975182,...,False,True,False,True,False,False,False,False,False,False
813,2020-06-17 21:00:00,0.0,-0.795368,0.806543,-0.771649,-0.779050,-0.78339,1.365350,1.659566,0.975182,...,False,True,False,True,False,False,False,False,False,False
814,2020-06-17 22:00:00,0.0,-0.795368,0.768567,-0.984933,-0.834831,-0.78339,1.511036,1.659566,0.975182,...,False,True,False,True,False,False,False,False,False,False


In [16]:
# Persist the fitted preprocessor to preprocessor.pkl; the relative path means
# the file lands in the current working directory.
preprocessor.save_preprocessor('preprocessor.pkl')

INFO:src.data.preprocessor:Preprocessor saved to: preprocessor.pkl


In [17]:
# Reload the file written by save_preprocessor to check the save/load round trip.
# NOTE(review): the .pkl extension suggests a pickle-based format — if so, only
# load files produced by a trusted run.
loaded_preprocessor = SolarDataPreprocessor.load_preprocessor('preprocessor.pkl')

INFO:src.data.preprocessor:Preprocessor loaded from: preprocessor.pkl


In [18]:
# The reloaded instance is expected to still report itself as fitted.
loaded_preprocessor.is_fitted

True