In [7]:
from google.colab import files
uploaded = files.upload()

Saving Solar_Energy_DataSet.zip to Solar_Energy_DataSet.zip


In [8]:
!unzip /content/Solar_Energy_DataSet.zip

Archive:  /content/Solar_Energy_DataSet.zip
  inflating: Plant_1_Generation_Data.csv  
  inflating: Plant_1_Weather_Sensor_Data.csv  
  inflating: Plant_2_Generation_Data.csv  
  inflating: Plant_2_Weather_Sensor_Data.csv  


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [11]:
# Loading generation and weather data
gen_df = pd.read_csv('Plant_1_Generation_Data.csv')
weather_df = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')

print("Generation data shape:", gen_df.shape)
print("Weather data shape:", weather_df.shape)

Generation data shape: (68778, 7)
Weather data shape: (3182, 6)


In [12]:
print("Missing values in generation data:")
print(gen_df.isnull().sum())
print("\nMissing values in weather data:")
print(weather_df.isnull().sum())

# Droping duplicates if any
gen_df.drop_duplicates(inplace=True)
weather_df.drop_duplicates(inplace=True)

# Converting DATE_TIME columns to datetime
gen_df['DATE_TIME'] = pd.to_datetime(gen_df['DATE_TIME'])
weather_df['DATE_TIME'] = pd.to_datetime(weather_df['DATE_TIME'])

Missing values in generation data:
DATE_TIME      0
PLANT_ID       0
SOURCE_KEY     0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64

Missing values in weather data:
DATE_TIME              0
PLANT_ID               0
SOURCE_KEY             0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64


  gen_df['DATE_TIME'] = pd.to_datetime(gen_df['DATE_TIME'])


In [13]:
# Merging on 'DATE_TIME' and 'SOURCE_KEY'
merged_df = pd.merge(gen_df, weather_df, on=['DATE_TIME', 'SOURCE_KEY'])
print("Merged dataset shape:", merged_df.shape)
merged_df.head()

Merged dataset shape: (0, 11)


Unnamed: 0,DATE_TIME,PLANT_ID_x,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION


In [14]:
# Droping columns not needed for prediction
merged_df = merged_df.drop(columns=['PLANT_ID_x', 'PLANT_ID_y', 'DAILY_YIELD', 'TOTAL_YIELD'])

# Renaming columns for clarity
merged_df.rename(columns={
    'DC_POWER': 'dc_power',
    'AC_POWER': 'ac_power',
    'AMBIENT_TEMPERATURE': 'ambient_temp',
    'MODULE_TEMPERATURE': 'module_temp',
    'IRRADIATION': 'irradiation'
}, inplace=True)

# Extracting date-time features
merged_df['hour'] = merged_df['DATE_TIME'].dt.hour
merged_df['day'] = merged_df['DATE_TIME'].dt.day
merged_df['month'] = merged_df['DATE_TIME'].dt.month

In [15]:
# handling missing zero values

merged_df = merged_df.dropna()

In [16]:
# Feature Scaling

scaler = MinMaxScaler()
cols_to_scale = ['ambient_temp', 'module_temp', 'irradiation', 'hour', 'day', 'month']
merged_df[cols_to_scale] = scaler.fit_transform(merged_df[cols_to_scale])


ValueError: Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required by MinMaxScaler.

In [17]:
gen_df['DATE_TIME'] = pd.to_datetime(gen_df['DATE_TIME']).dt.round('15min')
weather_df['DATE_TIME'] = pd.to_datetime(weather_df['DATE_TIME']).dt.round('15min')

merged_df = pd.merge(gen_df, weather_df, on=['DATE_TIME', 'SOURCE_KEY'])
print("Merged shape:", merged_df.shape)


Merged shape: (0, 11)


In [18]:
print(gen_df.columns)
print(weather_df.columns)

Index(['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'DC_POWER', 'AC_POWER',
       'DAILY_YIELD', 'TOTAL_YIELD'],
      dtype='object')
Index(['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'AMBIENT_TEMPERATURE',
       'MODULE_TEMPERATURE', 'IRRADIATION'],
      dtype='object')


In [19]:
gen_df['DATE_TIME'] = pd.to_datetime(gen_df['DATE_TIME'], errors='coerce')
weather_df['DATE_TIME'] = pd.to_datetime(weather_df['DATE_TIME'], errors='coerce')

gen_df = gen_df.dropna(subset=['DATE_TIME'])
weather_df = weather_df.dropna(subset=['DATE_TIME'])


In [20]:
print(gen_df['DATE_TIME'].head(3))
print(weather_df['DATE_TIME'].head(3))

0   2020-05-15
1   2020-05-15
2   2020-05-15
Name: DATE_TIME, dtype: datetime64[ns]
0   2020-05-15 00:00:00
1   2020-05-15 00:15:00
2   2020-05-15 00:30:00
Name: DATE_TIME, dtype: datetime64[ns]


In [21]:
merged_df = pd.merge(gen_df, weather_df, on=['DATE_TIME', 'SOURCE_KEY'], how='inner')
print("Merged shape:", merged_df.shape)

Merged shape: (0, 11)


In [22]:
print("Generation file times:")
print(gen_df['DATE_TIME'].head(5).tolist())

print("\nWeather file times:")
print(weather_df['DATE_TIME'].head(5).tolist())


Generation file times:
[Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-15 00:00:00')]

Weather file times:
[Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-15 00:15:00'), Timestamp('2020-05-15 00:30:00'), Timestamp('2020-05-15 00:45:00'), Timestamp('2020-05-15 01:00:00')]


In [23]:
print("Common SOURCE_KEYs:", set(gen_df['SOURCE_KEY']).intersection(set(weather_df['SOURCE_KEY'])))


Common SOURCE_KEYs: set()


In [24]:
merged_df = pd.merge(gen_df, weather_df, on='DATE_TIME', how='inner')
print("Merged shape:", merged_df.shape)

Merged shape: (68774, 12)


In [25]:
merged_df.columns = merged_df.columns.str.lower().str.replace(' ', '_')


In [26]:
merged_df['hour'] = merged_df['date_time'].dt.hour
merged_df['day'] = merged_df['date_time'].dt.day
merged_df['month'] = merged_df['date_time'].dt.month


In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
cols_to_scale = ['ambient_temperature', 'module_temperature', 'irradiation', 'hour', 'day', 'month']

merged_df[cols_to_scale] = scaler.fit_transform(merged_df[cols_to_scale])
print("Scaled features added successfully")


Scaled features added successfully


In [28]:
from sklearn.model_selection import train_test_split

X = merged_df[['ambient_temperature', 'module_temperature', 'irradiation', 'hour', 'day', 'month']]
y = merged_df['dc_power']  # predicting DC power

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (55019, 6)
Test shape: (13755, 6)


In [29]:
merged_df.to_csv('/content/cleaned_solar_data.csv', index=False)
print("Cleaned dataset saved successfully")

Cleaned dataset saved successfully
