In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the test set; 'test.csv' is a relative path, so it is resolved against
# the notebook's current working directory.
df_test = pd.read_csv('test.csv')

In [20]:
# Summary statistics of the raw frame. Counts below 12000 indicate missing
# values in that column. NOTE(review): humidity, wind_speed and pressure do not
# appear in this output, presumably because they are not numeric until the
# later pd.to_numeric cell runs.
df_test.describe()

Unnamed: 0,id,temperature,irradiance,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage
count,12000.0,11418.0,11385.0,11393.0,11391.0,11390.0,11453.0,11413.0,11420.0,11418.0
mean,5999.5,25.146375,503.680996,17.473594,3.991836,0.700468,16.520001,1.708212,30.038138,51.324166
std,3464.24595,12.278231,250.888332,10.095197,1.995215,0.172455,18.402507,1.147612,12.216873,45.723208
min,0.0,0.0,-564.252322,0.013553,0.0,0.400042,0.0,6.5e-05,0.0,0.001065
25%,2999.75,16.768567,334.595229,8.612788,3.0,0.553302,0.0,0.767381,21.478909,25.07972
50%,5999.5,24.887335,505.037516,17.402885,4.0,0.697455,12.966453,1.565075,29.952897,49.780073
75%,8999.25,33.143483,671.762708,26.134488,5.0,0.850781,27.020573,2.459744,38.315531,75.191692
max,11999.0,145.879677,1420.627376,34.989441,13.0,0.999976,417.682763,7.256391,65.0,1000.0


In [21]:
# Floor negative irradiance readings at 0. Missing values are left as NaN
# because the `< 0` comparison is False for them.
irradiance = df_test['irradiance']
df_test['irradiance'] = irradiance.mask(irradiance < 0, 0)

In [22]:
# Turn absent categorical values into an explicit 'missing' category.
for categorical_col in ('error_code', 'installation_type'):
    df_test[categorical_col] = df_test[categorical_col].fillna('missing')

In [23]:
cols_to_convert = ['humidity', 'wind_speed', 'pressure']

# Parse each listed column as numeric. Entries that cannot be parsed become NaN
# (errors='coerce'); no characters are stripped before parsing.
df_test = df_test.assign(
    **{name: pd.to_numeric(df_test[name], errors='coerce') for name in cols_to_convert}
)

In [24]:
# Work on an independent (deep) copy so the cleaned df_test stays available as-is.
filtered_df = df_test.copy(deep=True)

In [25]:
# Fill missing values in every numeric column with that column's median.
# NOTE(review): the medians are computed from this file itself - confirm that
# this matches how the training data was imputed.
numeric_cols = filtered_df.select_dtypes(include='number').columns
column_medians = filtered_df.median(numeric_only=True)
filtered_df[numeric_cols] = filtered_df[numeric_cols].fillna(column_medians)

In [26]:
# Set the id column aside (it is needed again for the submission file),
# then remove it from the feature frame.
ids = filtered_df['id']
filtered_df = filtered_df.drop(labels='id', axis='columns')

In [27]:
# Low-cardinality categorical columns to expand into indicator (dummy) columns.
cols_to_encode = ['string_id', 'error_code', 'installation_type']

# One indicator column per category value, named <column>_<value>. No level is
# dropped (drop_first=False), so every category keeps its own column; set it to
# True to avoid multicollinearity.
encoding_options = dict(
    columns=cols_to_encode,
    prefix=cols_to_encode,
    drop_first=False,
)
filtered_df_encoded = pd.get_dummies(filtered_df, **encoding_options)

# Preview the first rows of the encoded frame.
filtered_df_encoded.head()

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,...,string_id_C3,string_id_D4,error_code_E00,error_code_E01,error_code_E02,error_code_missing,installation_type_dual-axis,installation_type_fixed,installation_type_missing,installation_type_tracking
0,17.618379,85.449838,90.815423,13.910963,6.0,0.889765,6.370396,0.069101,19.517274,33.509889,...,True,False,False,True,False,False,False,False,False,True
1,34.826323,722.801748,20.982993,20.916528,4.0,0.590372,30.095867,1.713852,37.421443,32.32706,...,False,True,True,False,False,False,False,False,True,False
2,33.776934,485.491998,55.61405,1.446962,3.0,0.611425,28.42443,1.696936,32.147763,69.613333,...,False,True,False,True,False,False,True,False,False,False
3,18.584189,350.02272,49.044766,18.810133,5.0,0.697455,7.848038,0.787188,25.734118,42.86276,...,True,False,False,False,True,False,True,False,False,False
4,43.044908,437.295622,8.761571,17.402885,8.0,0.564938,12.300717,1.86762,29.952897,51.025763,...,False,False,False,False,False,True,False,True,False,False


In [28]:
# Derived features, appended after the existing columns in this order:
#   power              = voltage * current
#   temperature_delta  = module_temperature - temperature
#   maintenance_impact = panel_age / (maintenance_count + 1)   (+1 avoids division by zero)
#   cloud_impact       = cloud_coverage * irradiance
filtered_df_encoded = filtered_df_encoded.assign(
    power=filtered_df_encoded['voltage'] * filtered_df_encoded['current'],
    temperature_delta=(
        filtered_df_encoded['module_temperature'] - filtered_df_encoded['temperature']
    ),
    maintenance_impact=(
        filtered_df_encoded['panel_age'] / (filtered_df_encoded['maintenance_count'] + 1)
    ),
    cloud_impact=filtered_df_encoded['cloud_coverage'] * filtered_df_encoded['irradiance'],
)

In [29]:
import pickle

# Raw string literal: the Windows path contains backslash pairs (\L, \M, \Z, \l)
# that are invalid escape sequences in an ordinary string. Python currently
# warns about them and is slated to reject them; the raw form yields the same
# path text without relying on that leniency.
model_path = r"C:\Local Disk F\ML\Zelestra ML Challenge\Models\lightgbm_model.pkl"

# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# Only load model files that come from a trusted source.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [30]:
# Run the unpickled model on the engineered feature frame.
# NOTE(review): nothing in this notebook checks that the columns produced by
# get_dummies and the derived-feature cell match (in set and order) what the
# model was trained on - confirm against the training notebook.
predictions = loaded_model.predict(filtered_df_encoded)

In [31]:
# Two-column submission frame: the ids set aside earlier next to the model's
# predictions, under the headers 'id' and 'efficiency'.
submission_columns = {'id': ids, 'efficiency': predictions}
submission_df = pd.DataFrame(submission_columns)

In [32]:
import os, re

path = r'C:\Local Disk F\ML\Zelestra ML Challenge\Submission files'

# Gather the version numbers of the files already named sub<N>.csv in the
# submission directory.
sub_file_pattern = re.compile(r'sub\d+\.csv$')
files = []
for entry in os.listdir(path):
    if sub_file_pattern.match(entry):
        files.append(int(re.findall(r'\d+', entry)[0]))

# Next version is one past the highest existing number (1 if there are none),
# so earlier submissions are never overwritten.
next_ver = max(files, default=0) + 1
submission_df.to_csv(os.path.join(path, f'sub{next_ver}.csv'), index=False)