In [None]:
from google.colab import files
import zipfile
import os

# Ask for one or more files through the Colab upload widget. The returned
# mapping is keyed by the uploaded file names.
uploaded = files.upload()

# Unpack every uploaded ZIP archive into the current working directory;
# files with any other extension are left untouched.
for filename in uploaded:
    if not filename.endswith(".zip"):
        continue
    with zipfile.ZipFile(filename, 'r') as archive:
        archive.extractall()
    print(f"✅ Extracted: {filename}")


KeyboardInterrupt: 

In [None]:
!pip install -q catboost xgboost

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import matplotlib.pyplot as plt

In [None]:
# Locations of the competition CSVs, relative to the working directory.
train_path = "./dataset/train.csv"
test_path = "./dataset/test.csv"

# Read both files with pandas' default CSV settings.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Stack train and test so that encoding and the later preprocessing steps are
# applied to one frame. The test rows receive a NaN `efficiency` placeholder
# on a copy (`assign`), so the freshly loaded `test` frame is not modified.
# `ignore_index=True` gives the combined frame a unique 0..n-1 index instead
# of repeating the row labels of each source frame, which keeps any later
# label-based alignment unambiguous.
combined = pd.concat(
    [train, test.assign(efficiency=np.nan)],
    sort=False,
    ignore_index=True,
)

# Force the sensor readings to numeric; entries that cannot be parsed become NaN.
for col in ['humidity', 'wind_speed', 'pressure']:
    combined[col] = pd.to_numeric(combined[col], errors='coerce')

# Integer-encode the categorical columns. Values are cast to str first, so
# missing entries are encoded as a category of their own rather than raising.
categorical_cols = ['string_id', 'error_code', 'installation_type']
for col in categorical_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

def add_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Added columns:
        power: ``voltage * current``
        irradiance_temp_interaction: ``irradiance * temperature``
        humidity_pressure_interaction: ``humidity * pressure``
        log_irradiance: ``log1p(irradiance)``
        log_soiling_ratio: ``log1p(soiling_ratio)``

    Args:
        df: DataFrame holding the ``voltage``, ``current``, ``irradiance``,
            ``temperature``, ``humidity``, ``pressure`` and ``soiling_ratio``
            columns used in the expressions above.

    Returns:
        A new DataFrame. The input frame is left unmodified, so calling the
        function again on the same input produces the same result.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # `assign` builds a new frame instead of writing into the caller's object;
    # new columns are appended in the order given here.
    return df.assign(
        power=df['voltage'] * df['current'],
        irradiance_temp_interaction=df['irradiance'] * df['temperature'],
        humidity_pressure_interaction=df['humidity'] * df['pressure'],
        log_irradiance=np.log1p(df['irradiance']),
        log_soiling_ratio=np.log1p(df['soiling_ratio']),
    )

combined = add_features(combined)

# Every float64/int64 column except the row id and the target is imputed and
# scaled; this includes the label-encoded categoricals when they are int64.
# NOTE(review): the imputer and the scaler are fitted on train and test rows
# together, and before the validation split is made — confirm this is intended.
non_feature_cols = ('id', 'efficiency')
numerical_cols = [
    col
    for col in combined.select_dtypes(include=['float64', 'int64']).columns
    if col not in non_feature_cols
]

# Fill the missing values with the iterative imputer.
imputer = IterativeImputer(max_iter=10, random_state=42)
imputed_values = imputer.fit_transform(combined[numerical_cols])
combined[numerical_cols] = imputed_values

# Cap the upper tail of these columns at their own 99th percentile. This runs
# after the engineered features were computed, so those keep the uncapped inputs.
for col in ('irradiance', 'temperature', 'soiling_ratio'):
    upper_limit = combined[col].quantile(0.99)
    combined[col] = combined[col].clip(upper=upper_limit)

# Standardise the same set of columns.
scaler = StandardScaler()
combined[numerical_cols] = scaler.fit_transform(combined[numerical_cols])

# Split the combined frame back into its train and test parts by position.
# `combined` was built by concatenating `train` followed by `test`, so its
# first len(train) rows are the training rows. Splitting by position, rather
# than by "efficiency is NaN", keeps a training row with a missing target from
# being routed into the test set and ending up in the submission.
n_train_rows = len(train)  # `train` is still the frame that was concatenated
labelled_part = combined.iloc[:n_train_rows]
test = combined.iloc[n_train_rows:].copy()

# Training rows without a target cannot be used for fitting, so drop them.
train = labelled_part[labelled_part['efficiency'].notna()].copy()

# Model inputs exclude the row id and the target.
X = train.drop(columns=['id', 'efficiency'])
y = train['efficiency']
X_test = test.drop(columns=['id', 'efficiency'])

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model Training

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

# Fit a gradient-boosting model on all features; its importances drive the
# feature selection below.
gb_model = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=5,
    random_state=42,
)
gb_model.fit(X_train, y_train)

# Report the importances, largest first.
feature_names = X_train.columns
feature_importance = gb_model.feature_importances_
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
ranked_importances = importance_df.sort_values(by='importance', ascending=False)
print("Feature Importances:\n", ranked_importances)

# Keep the features whose importance is strictly above the 25th percentile.
importance_threshold = np.percentile(feature_importance, 25)
keep_mask = feature_importance > importance_threshold
important_features = feature_names[keep_mask]

# maintenance_count is always retained, even when it falls below the cut-off.
if 'maintenance_count' not in important_features:
    print("Warning: maintenance_count was excluded. Adding it back due to CatBoost expectation.")
    important_features = np.append(important_features, 'maintenance_count')

# Restrict every split to the selected columns.
X_train_selected, X_val_selected, X_test_selected = (
    frame[important_features] for frame in (X_train, X_val, X_test)
)

# Tree-based base learners, all configured with 1000 rounds, learning rate
# 0.03, depth 6 and the same seed.
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    verbose=0,
    random_state=42,
)
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    random_state=42,
)
lgb_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    random_state=42,
)

# RBF-kernel SVR tuned with a small randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is drawn
# from [0.1, 10.1] and gamma from [0.001, 0.101].
svr_model = SVR(kernel='rbf')
svr_params = {
    'C': sp_uniform(0.1, 10),
    'gamma': sp_uniform(0.001, 0.1),
}
svr_search = RandomizedSearchCV(
    estimator=svr_model,
    param_distributions=svr_params,
    n_iter=5,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
svr_search.fit(X_train_selected, y_train)

best_svr = svr_search.best_estimator_
print("Best SVR parameters:", svr_search.best_params_)

# The base learners are deliberately NOT fitted individually here.
# StackingRegressor clones every entry of `estimators` and fits the clones
# itself (per CV fold to build the meta-features, and on the full training
# data for prediction), so fitting cat_model / xgb_model / gb_model /
# lgb_model / best_svr beforehand only repeated the most expensive work in the
# notebook without influencing `stack`. The fitted base models are available
# afterwards through `stack.named_estimators_`.
estimators = [
    ('cat', cat_model),
    ('xgb', xgb_model),
    ('gb', gb_model),
    ('lgb', lgb_model),
    ('svr', best_svr)
]

# Ridge meta-learner on top of the base predictions. With passthrough=True the
# meta-learner also receives the original selected features, and cv=5 sets the
# number of folds used to produce the out-of-fold base predictions.
final_estimator = Ridge(alpha=1.0, random_state=42)
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    passthrough=True,
    cv=5
)
stack.fit(X_train_selected, y_train)

Feature Importances:
                           feature  importance
1                      irradiance    0.356282
18                 log_irradiance    0.140393
19              log_soiling_ratio    0.099588
3                       panel_age    0.084675
5                   soiling_ratio    0.073160
10                     wind_speed    0.026389
9                  cloud_coverage    0.026329
7                         current    0.023503
11                       pressure    0.023382
8              module_temperature    0.021903
17  humidity_pressure_interaction    0.021882
2                        humidity    0.021521
0                     temperature    0.017794
16    irradiance_temp_interaction    0.017628
15                          power    0.014530
6                         voltage    0.013115
4               maintenance_count    0.008764
12                      string_id    0.003287
13                     error_code    0.003225
14              installation_type    0.002650
Best SVR par

In [None]:
# Evaluating Model and Blending Predictions

print("Features in X_train_selected (training):", X_train_selected.columns.tolist())
print("Features in X_val_selected (validation):", X_val_selected.columns.tolist())

# Present the validation columns in the same order that was used for fitting.
X_val_selected = X_val_selected[X_train_selected.columns]

val_preds_stack = stack.predict(X_val_selected)

# Apply the same [0, 1] clipping that the submitted predictions receive, so
# the validation metric measures the pipeline that is actually submitted
# instead of the raw, unclipped model output.
val_preds = np.clip(val_preds_stack, 0, 1)

# Evaluate: RMSE on the hold-out split, reported together with the derived
# score 100 * (1 - RMSE).
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
score = 100 * (1 - rmse)
print(f"✅ Validation RMSE: {rmse:.4f} | Score: {score:.2f}")

# Select the chosen features for the full labelled set and for the test set,
# ordered exactly like the columns the stack was trained on.
training_columns = X_train_selected.columns
X_selected = X[important_features][training_columns]
X_test_selected = X_test[important_features][training_columns]

print("Features in X_test_selected (test):", X_test_selected.columns.tolist())

# Refit the stack on all labelled rows (training + validation), then predict
# the test set and clip the predictions to the [0, 1] range.
stack.fit(X_selected, y)
final_preds = np.clip(stack.predict(X_test_selected), 0, 1)

# Write the submission file and hand it to the browser via the Colab helper.
submission = pd.DataFrame({'id': test['id'], 'efficiency': final_preds})
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv is saved!")
files.download('submission.csv')

Features in X_train_selected (training): ['temperature', 'irradiance', 'humidity', 'panel_age', 'soiling_ratio', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'power', 'irradiance_temp_interaction', 'humidity_pressure_interaction', 'log_irradiance', 'log_soiling_ratio', 'maintenance_count']
Features in X_val_selected (validation): ['temperature', 'irradiance', 'humidity', 'panel_age', 'soiling_ratio', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'power', 'irradiance_temp_interaction', 'humidity_pressure_interaction', 'log_irradiance', 'log_soiling_ratio', 'maintenance_count']
✅ Validation RMSE: 0.1063 | Score: 89.37
Features in X_test_selected (test): ['temperature', 'irradiance', 'humidity', 'panel_age', 'soiling_ratio', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'power', 'irradiance_temp_interaction', 'humidity_pressure_interaction', 'log_irradiance', 'log_soiling_ratio', 'maintenance_cou

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Score not high enough. Consider further tuning or adding more features.
