In [None]:
# DOWNLOAD AND UNZIP PARQUET FILES 
!pip install -q gdown
!gdown "https://drive.google.com/file/d/1JvN0-bv7frRf8BAxVrmtJoYV-vzjjt0d/view?usp=sharing"
!unzip -q solar_data.zip -d solar_data


In [None]:
#LOAD THE PARQUET FILES
import pandas as pd
import glob

# Collect the extracted parquet files in a deterministic (lexicographic) order.
parquet_files = sorted(glob.glob("solar_data/*.parquet"))

# Fail early with an actionable message: pd.concat on an empty list only
# reports a generic "No objects to concatenate" error.
if not parquet_files:
    raise FileNotFoundError(
        "No parquet files found under 'solar_data/'. "
        "Check that the download and unzip step completed."
    )

# Read every file and stack them into one frame with a fresh 0..n-1 row index.
df_all = pd.concat([pd.read_parquet(path) for path in parquet_files], ignore_index=True)

print(f"Loaded combined DataFrame with shape: {df_all.shape}")
print(df_all['TIMESTAMP'].min(), "→", df_all['TIMESTAMP'].max())


In [None]:
import pandas as pd
import numpy as np

# Work on a copy so the raw concatenated frame (df_all) stays untouched.
df = df_all.copy()

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], errors='coerce')

# Rows without a usable timestamp cannot be placed on the time axis, and a NaT
# in the index breaks the time-based interpolation applied later.
n_bad_timestamps = int(df['TIMESTAMP'].isna().sum())
if n_bad_timestamps:
    print(f"Dropping {n_bad_timestamps} rows with unparseable TIMESTAMP values")
    df = df.dropna(subset=['TIMESTAMP'])

# Index by time and order chronologically. The files were concatenated in
# file-name order, while the later shift/rolling features and the positional
# train/val/test split all assume rows run forward in time. A stable sort keeps
# the original relative order of rows that share a timestamp.
df = df.set_index('TIMESTAMP').sort_index(kind='stable')


In [None]:
# Count missing values per column, order the counts from worst to best,
# and print only the columns that actually contain gaps.
nan_counts = df.isna().sum()
missing_summary = nan_counts.sort_values(ascending=False)
print(missing_summary.loc[missing_summary > 0])


In [None]:
# DROP WindRef_V_Min AND FILL GAPS IN THE TIME SERIES
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['WindRef_V_Min'], errors='ignore')
# Interpolate using the spacing of the datetime index between observations.
df = df.interpolate(method='time')
# Fill whatever is still missing at the edges. bfill()/ffill() replace the
# deprecated fillna(method=...) spelling.
df = df.bfill().ffill()

In [None]:
# Total number of missing cells left after interpolation and edge filling.
print("Remaining NaNs:", df.isna().sum().sum())
print(df.columns.tolist())
# Keep the summary table as the LAST expression of the cell: a notebook only
# renders the value of the final expression, so placed mid-cell it was
# computed and then silently discarded.
df.describe().T



In [None]:
import pandas as pd

# Show the current index of the working frame.
print(df.index)

# Parse RECORD as datetimes; anything that cannot be parsed turns into NaT.
# NOTE(review): assumes RECORD holds date-like values rather than a plain
# record counter — confirm against the data source.
record_parsed = pd.to_datetime(df['RECORD'], errors='coerce')
df['RECORD'] = record_parsed

# Rows where the conversion failed (an empty frame means every value parsed).
failed_rows = df.loc[df['RECORD'].isna()]
print(failed_rows)

In [None]:
# Mark every index entry that repeats an earlier timestamp, then count them.
repeat_mask = df.index.duplicated()
num_duplicates = repeat_mask.sum()
print(f"Number of duplicate timestamp entries: {num_duplicates}")

# Occurrences per timestamp, restricted to those seen more than once.
timestamp_counts = df.index.value_counts()
print("Timestamps with counts > 1:")
print(timestamp_counts.loc[timestamp_counts > 1])

# keep=False marks ALL rows of a repeated timestamp (first occurrence
# included), so each original row is listed next to its duplicates.
shared_mask = df.index.duplicated(keep=False)
duplicate_rows = df.loc[shared_mask]
print("Comparing original and duplicate rows:")
# Only the first 20 rows after sorting, enough to eyeball a few pairs.
print(duplicate_rows.sort_index().head(20))

In [None]:
print(f"Original DataFrame length: {len(df)}")

# REMOVE DUPLICATES
# Keep the first row for every timestamp. The explicit .copy() makes df_clean
# an independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor depends on df.
df_clean = df.loc[~df.index.duplicated(keep='first')].copy()

print(f"Cleaned DataFrame length: {len(df_clean)}")

# VERIFY UNIQUENESS
is_unique = df_clean.index.is_unique
print(f"Is the cleaned index unique? {is_unique}")


In [None]:
# Extract calendar components from the datetime index.
# assign() builds a new frame instead of writing columns into df_clean in
# place, which avoids SettingWithCopyWarning when df_clean is a slice of df
# and keeps the cell safe to re-run.
df_clean = df_clean.assign(
    year=df_clean.index.year,
    month=df_clean.index.month,
    day=df_clean.index.day,
    hour=df_clean.index.hour,
    minute=df_clean.index.minute,
)

print(df_clean[['year', 'month', 'day', 'hour', 'minute']].head())

In [None]:
import numpy as np

# CYCLIC ENCODING
# Each calendar component is mapped onto the unit circle (sin/cos pair) using
# the period it is divided by below.
# The components are read from the datetime index rather than from the
# month/day/hour/minute columns, so the cell still works when it is re-run
# after those helper columns have already been dropped.
cycle_lengths = {'month': 12.0, 'day': 31.0, 'hour': 24.0, 'minute': 60.0}

cyclical_features = {}
for component, period in cycle_lengths.items():
    angle = 2 * np.pi * np.asarray(getattr(df_clean.index, component)) / period
    cyclical_features[f'{component}_sin'] = np.sin(angle)
    cyclical_features[f'{component}_cos'] = np.cos(angle)

# assign() returns a new frame, so nothing is written into a possible slice.
df_clean = df_clean.assign(**cyclical_features)

# Original time features
original_time_cols = ['year', 'month', 'day', 'hour', 'minute']
# Cyclical time features, in creation order:
# month_sin, month_cos, day_sin, day_cos, hour_sin, hour_cos, minute_sin, minute_cos
cyclical_time_cols = list(cyclical_features)

# Drop the raw components that now have a cyclical encoding ('year' is kept).
print("Dropping original time columns used for cyclical features...")
# Only drop what is actually present, so a re-run does not raise.
cols_to_drop_time = [col for col in cycle_lengths if col in df_clean.columns]
if cols_to_drop_time:
    df_clean = df_clean.drop(columns=cols_to_drop_time)
    print(f"Dropped: {cols_to_drop_time}")


In [None]:
import pandas as pd



# Column to predict: DC inverter power output in kW.
target = 'InvPDC_kW_Avg'

# Irradiance columns.
irradiance_features = [
    'RefCell1_Wm2_Avg', 'SEWSPOAIrrad_Wm2_Avg', 'Pyra1_Wm2_Avg', 'Pyra2_Wm2_Avg',
]

# Temperature columns: four named columns followed by RTD_C_Avg_1 .. RTD_C_Avg_10.
temperature_features = [
    'AmbTemp_C_Avg', 'SEWSModuleTemp_C_Avg', 'CR1000Temp_C_Avg', 'SEWSAmbientTemp_C_Avg',
] + [f'RTD_C_Avg_{sensor}' for sensor in range(1, 11)]

# Cyclical (sin/cos) time columns.
time_features = [
    'month_sin', 'month_cos',
    'day_sin', 'day_cos',
    'hour_sin', 'hour_cos',
    'minute_sin', 'minute_cos',
]

selected_features = [*irradiance_features, *temperature_features, *time_features]

# FINAL DATASET
X = df_clean.loc[:, selected_features]
y = df_clean.loc[:, target]

print("Selected features:", X.columns.tolist())
print("X shape:", X.shape, "| y shape:", y.shape)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

target_col = 'InvPDC_kW_Avg'

irradiance_cols = [
    'RefCell1_Wm2_Avg',
    'SEWSPOAIrrad_Wm2_Avg',
    'Pyra1_Wm2_Avg',
    'Pyra2_Wm2_Avg',
]
temperature_cols = [
    'AmbTemp_C_Avg',
    'SEWSModuleTemp_C_Avg',
    'CR1000Temp_C_Avg',
    'SEWSAmbientTemp_C_Avg',
    'RTD_C_Avg_1',
    'RTD_C_Avg_2',
    'RTD_C_Avg_3',
    'RTD_C_Avg_4',
    'RTD_C_Avg_5',
    'RTD_C_Avg_6',
    'RTD_C_Avg_7',
    'RTD_C_Avg_8',
    'RTD_C_Avg_9',
    'RTD_C_Avg_10',
]
# Both names are kept bound to the same list, as before.
electric_cols = electrical_features = [
    'InvVDVoltage_V_Avg', 'InvIDCin_Avg', 'InvVDCin_Avg', 'InvVPVin_Avg'
]

# New columns are collected here and attached with a single concat at the end.
# Inserting ~150 columns one at a time fragments the frame and triggers
# pandas' PerformanceWarning.
engineered = {}

# LAG FEATURES
# NOTE(review): shift() counts ROWS, so these lags equal minutes only if the
# data is sampled once per minute without gaps — confirm.
lags = [1, 5, 15, 60, 1440]

print("\nCreating lag features...")
# Lag target
if target_col in df_clean.columns:
    for lag in lags:
        engineered[f'{target_col}_lag_{lag}'] = df_clean[target_col].shift(lag)
else:
    print(f"Warning: Target column '{target_col}' not found for lagging.")

# Lag key predictors
for col in irradiance_cols:
    if col in df_clean.columns:
        for lag in lags:
            engineered[f'{col}_lag_{lag}'] = df_clean[col].shift(lag)
    else:
         print(f"Warning: Irradiance column '{col}' not found for lagging.")

# Electrical columns are lagged silently when present.
for col in electric_cols:
    if col in df_clean.columns:
        for lag in lags:
            engineered[f'{col}_lag_{lag}'] = df_clean[col].shift(lag)


# ROLLING WINDOW STATISTICS
windows = [5, 15, 60]
cols_for_rolling = irradiance_cols + temperature_cols

print("\nCreating rolling window features...")
for col in cols_for_rolling:
    if col in df_clean.columns:
        for window in windows:
            rolling = df_clean[col].rolling(window=window, min_periods=2)
            engineered[f'{col}_roll_mean_{window}'] = rolling.mean()
            engineered[f'{col}_roll_std_{window}'] = rolling.std()
    else:
        print(f"Warning: Column '{col}' for rolling features not found.")

# Rolling mean of the TARGET. The series is shifted by one row first so each
# window only covers past values. Without the shift the window contains the
# current row's target, i.e. the value being predicted leaks into the features.
for window in windows:
    engineered[f'{target_col}_roll_mean_{window}'] = (
        df_clean[target_col].shift(1).rolling(window=window, min_periods=1).mean()
    )

# Attach everything at once. Columns left over from a previous run of this
# cell are dropped first so a re-run does not create duplicate column names.
stale_cols = [col for col in engineered if col in df_clean.columns]
df_clean = pd.concat(
    [df_clean.drop(columns=stale_cols), pd.DataFrame(engineered)],
    axis=1,
)


# HANDLE NaNs
# Shifting and rolling leave NaNs in the leading rows; drop every row that has
# a NaN in any column.
print(f"\nDataFrame shape before handling NaNs: {df_clean.shape}")
rows_before = len(df_clean)

df_processed = df_clean.dropna()
rows_after = len(df_processed)
print(f"DataFrame shape after handling NaNs: {df_processed.shape}")
print(f"Number of rows dropped due to NaNs: {rows_before - rows_after}")


# FINAL FEATURE LIST
feature_cols = irradiance_cols + temperature_cols
feature_cols += [col for col in cyclical_time_cols if col in df_processed.columns]
if 'year' in df_processed.columns:
    feature_cols += ['year']
feature_cols += [col for col in df_processed.columns if '_lag_' in col]
feature_cols += [col for col in df_processed.columns if '_roll_' in col]

# The raw target must never be a feature; de-duplicate and sort for a stable order.
feature_cols = [col for col in feature_cols if col != target_col]
feature_cols = sorted(set(feature_cols))

print(f"\nFinal list of {len(feature_cols)} features selected for X:")





In [None]:
# Print the names of all selected feature columns.
print(f"{feature_cols}")

In [None]:
# Histogram of the target column in df, using 100 bins.
target_values = df[target_col]
target_values.hist(bins=100)

In [None]:
# Floor the target at zero in the working frame.
df[target_col] = df[target_col].clip(lower=0)

# X and y are taken from df_processed, which was derived before this cell, so
# clipping only df never reaches the modelling target. Apply the same floor
# there.
# NOTE(review): the target's lag and rolling features were computed from the
# unclipped values; move the clip ahead of feature engineering to make those
# consistent as well.
df_processed = df_processed.assign(**{target_col: df_processed[target_col].clip(lower=0)})


In [None]:

# Validate the requested columns up front and fail loudly. Catching the error
# and continuing is unsafe here: X and y already exist from an earlier cell,
# so a failed selection would silently fall through to those stale objects.
missing_cols = [col for col in [*feature_cols, target_col] if col not in df_processed.columns]
if missing_cols:
    raise KeyError(
        f"Error selecting features/target: {missing_cols}. Check column names in df_processed."
    )

X = df_processed[feature_cols]
y = df_processed[target_col]


# TIME SERIES DATA SPLIT
# Positional 70 / 15 / 15 split with no shuffling: train is the first block of
# rows, validation the next, test the remainder.
total_rows = len(df_processed)
if total_rows == 0:
    raise ValueError(
        "df_processed is empty; nothing to split. "
        "Check the NaN handling in the feature-engineering step."
    )
train_size = int(total_rows * 0.7)
val_size = int(total_rows * 0.15)
val_end = train_size + val_size

X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:val_end], y.iloc[train_size:val_end]
X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

print(f"\nData Split Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val:   {X_val.shape}, y_val:   {y_val.shape}")
print(f"X_test:  {X_test.shape}, y_test:  {y_test.shape}")



In [None]:

# FEATURE SCALING
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.
if 'X_train' not in locals():
    print("\nScaling skipped as data splitting failed.")
else:
    scaler = StandardScaler()

    print(f"\nApplying {type(scaler).__name__} scaling...")

    scaler.fit(X_train)

    X_train_scaled, X_val_scaled, X_test_scaled = (
        scaler.transform(split) for split in (X_train, X_val, X_test)
    )

    # Wrap the scaled training array in a labelled frame for inspection.
    X_train_scaled_df = pd.DataFrame(
        X_train_scaled,
        columns=feature_cols,
        index=X_train.index,
    )
    print("\nScaled Training Data Head (as DataFrame):")
    print(X_train_scaled_df.head())

    print("\nPreprocessing complete. Ready for model training.")

In [None]:
import cudf
import cuml
from cuml.ensemble import RandomForestRegressor

In [None]:
import pandas as pd
import cudf


# Strip any timezone from the feature-frame indexes before building the cudf
# frames.
for split_frame in (X_train, X_val, X_test):
    split_frame.index = split_frame.index.tz_localize(None)


def _scaled_to_cudf(scaled_values, reference):
    """Label a scaled array with `reference`'s columns and index, then convert it to cudf."""
    host_frame = pd.DataFrame(scaled_values, columns=reference.columns, index=reference.index)
    return cudf.DataFrame.from_pandas(host_frame)


X_train_scaled_cudf = _scaled_to_cudf(X_train_scaled, X_train)
X_val_scaled_cudf = _scaled_to_cudf(X_val_scaled, X_val)
X_test_scaled_cudf = _scaled_to_cudf(X_test_scaled, X_test)

# Targets are passed as plain values, so these series carry a default index.
y_train_cudf = cudf.Series(y_train.values)
y_val_cudf = cudf.Series(y_val.values)
y_test_cudf = cudf.Series(y_test.values)



In [None]:
# TESTING ON RANDOM FOREST REGRESSOR
# 100 trees with a fixed seed.
# NOTE(review): check the cuML documentation on whether random_state alone is
# enough for run-to-run reproducibility.
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model_cuml = RandomForestRegressor(**rf_params)

rf_model_cuml.fit(X_train_scaled_cudf, y_train_cudf)

In [None]:

# Predict on the validation and test splits with the fitted forest.
y_pred_val_cudf, y_pred_test_cudf = (
    rf_model_cuml.predict(split) for split in (X_val_scaled_cudf, X_test_scaled_cudf)
)

In [None]:
# Convert the predictions to pandas objects for the scikit-learn metrics.
y_pred_val, y_pred_test = y_pred_val_cudf.to_pandas(), y_pred_test_cudf.to_pandas()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def _regression_scores(actual, predicted):
    """Return (mean squared error, R-squared) for one split."""
    return mean_squared_error(actual, predicted), r2_score(actual, predicted)


mse_val, r2_val = _regression_scores(y_val, y_pred_val)
mse_test, r2_test = _regression_scores(y_test, y_pred_test)

print(f"Validation MSE: {mse_val}, R-squared: {r2_val}")
print(f"Test MSE: {mse_test}, R-squared: {r2_test}")

In [None]:
import matplotlib.pyplot as plt

# Plot both series against the SAME x values. y_val is indexed by timestamp
# while the predictions carry a positional index, so plotting each against its
# own index puts the two lines on unrelated x ranges.
val_times = y_val.index

plt.figure(figsize=(10, 6))
plt.plot(val_times, y_val.to_numpy(), label='Actual')
plt.plot(val_times, np.asarray(y_pred_val), label='Predicted')
plt.xlabel('Time')
plt.ylabel('InvPDC_kW_Avg')
plt.title('Actual vs. Predicted Values (Validation Set)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
pip install --no-build-isolation mamba-ssm

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch.nn import MSELoss
from mamba_ssm import Mamba
from torch.nn import Linear

In [None]:
# PREPARE DATA
# Fixed seed so weight initialisation and batch shuffling repeat across runs
# (same seed value as the random-forest baseline).
torch.manual_seed(42)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to("cuda")
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to("cuda")
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to("cuda")
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to("cuda")
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to("cuda")
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to("cuda")

# RESHAPE
# Insert a length-1 axis at position 1: (rows, features) -> (rows, 1, features).
X_train_tensor = X_train_tensor.unsqueeze(1)
X_val_tensor = X_val_tensor.unsqueeze(1)
X_test_tensor = X_test_tensor.unsqueeze(1)

# INITIALIZE
input_size = X_train_tensor.shape[2]
model0 = Mamba(d_model=input_size, d_state=16, d_conv=4, expand=2).to("cuda")

# Linear head mapping the Mamba output to a single regression value.
output_layer = Linear(input_size, 1).to("cuda")

optimizer = Adam(list(model0.parameters()) + list(output_layer.parameters()), lr=1e-3)
loss_fn = MSELoss()
num_epochs = 50

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

for epoch in range(num_epochs):
    model0.train()
    output_layer.train()
    train_loss_sum = torch.zeros((), device="cuda")
    # The loop variables are named batch_x / batch_y on purpose: calling the
    # label batch `target` would overwrite the notebook-level `target` string.
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        hidden = model0(batch_x)
        # squeeze(1) drops the length-1 axis; squeeze(-1) drops only the
        # output axis, so a final batch of size 1 keeps shape (1,) instead of
        # collapsing to a scalar that no longer matches batch_y.
        prediction = output_layer(hidden.squeeze(1)).squeeze(-1)
        loss = loss_fn(prediction, batch_y)
        loss.backward()
        optimizer.step()
        # Accumulate on the device; converting every batch loss to a Python
        # float would force a GPU sync per step.
        train_loss_sum += loss.detach() * batch_y.size(0)

    # Per-epoch validation pass so training progress is visible.
    model0.eval()
    output_layer.eval()
    val_loss_sum = torch.zeros((), device="cuda")
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            prediction = output_layer(model0(batch_x).squeeze(1)).squeeze(-1)
            val_loss_sum += loss_fn(prediction, batch_y) * batch_y.size(0)

    train_mse = train_loss_sum.item() / max(len(train_dataset), 1)
    val_mse = val_loss_sum.item() / max(len(val_dataset), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} | train MSE: {train_mse:.4f} | val MSE: {val_mse:.4f}")


In [None]:

# Put both modules into evaluation mode before scoring the test split.
for module in (model0, output_layer):
    module.eval()

# Forward pass without gradient tracking: Mamba block, drop the length-1 axis,
# linear head, then squeeze to a flat vector of predictions.
with torch.no_grad():
    hidden_test = model0(X_test_tensor)
    y_pred_test_tensor = output_layer(hidden_test.squeeze(1)).squeeze()
y_pred_test = y_pred_test_tensor.cpu().numpy()

mse_test, r2_test = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print(f"Test MSE: {mse_test}, R-squared: {r2_test}")


In [None]:
import matplotlib.pyplot as plt


# Inference only. Outside torch.no_grad() the output tracks gradients and
# calling .numpy() on it raises a RuntimeError.
model0.eval()
output_layer.eval()
with torch.no_grad():
    y_pred_test_tensor = model0(X_test_tensor)
    y_pred_test_tensor = output_layer(y_pred_test_tensor.squeeze(1))
    # squeeze(-1) drops only the output axis, keeping one value per test row.
    y_pred_test_tensor = y_pred_test_tensor.squeeze(-1)
y_pred_test = y_pred_test_tensor.cpu().numpy()

# Use a separate name for the array: rebinding y_test to an ndarray would
# break the earlier cell that reads y_test.values when it is re-run.
y_test_values = y_test_tensor.cpu().numpy()

# Real timestamps of the test rows. X_test_tensor[:, 0, 0] is the first scaled
# feature column, not time.
time_index = X_test.index

plt.figure(figsize=(10, 6))
plt.plot(time_index, y_test_values, label='Actual')
plt.plot(time_index, y_pred_test, label='Predicted (Mamba)')
plt.xlabel('Time')
plt.ylabel('InvPDC_kW_Avg')
plt.title('Actual vs. Predicted Values (Mamba Model - Test Set)')
plt.legend()
plt.grid(True)
plt.show()