In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
#np.random.seed(42)


In [4]:
# Load the data
train = pd.read_csv('/kaggle/input/zelestrea/train.csv')
test = pd.read_csv('/kaggle/input/zelestrea/test.csv')

# Function for outlier treatment using IQR method
def treat_outliers_iqr(df, factor=1.5):
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number) and col != 'efficiency':
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - factor * IQR
            upper_bound = Q3 + factor * IQR
            df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df

# Clean the data
df_clean = treat_outliers_iqr(train.copy())


In [5]:
def convert_to_numeric(df):
    df_numeric = df.copy()
    
    # Get all columns that should be numeric (excluding any ID or categorical columns)
    for col in df_numeric.columns:
        if df_numeric[col].dtype == 'object':
            df_numeric[col] = pd.to_numeric(df_numeric[col], errors='coerce')
    
    return df_numeric

# Apply to both datasets
df_clean = convert_to_numeric(df_clean)
test = convert_to_numeric(test)

In [6]:
# Feature Engineering
def engineer_features(df):
    df_eng = df.copy()
    # Convert temperatures to Kelvin for consistency
    df_eng['temperature'] = df_eng['temperature'] + 273.16
    df_eng['module_temperature'] = df_eng['module_temperature'] + 273.16

    # 1. Power-related features
    df_eng['power'] = df_eng['voltage'] * df_eng['current'] 
    df_eng['power_per_irradiance'] = (df_eng['power'] / (df_eng['irradiance'] + 1e-8))
    df_eng['voltage_current_ratio'] = df_eng['voltage'] / (df_eng['current'] + 1e-8)
    
    # 2. Temperature differential features
    df_eng['temp_difference'] = df_eng['module_temperature'] - df_eng['temperature']
    df_eng['temp_efficiency_loss'] = (df_eng['module_temperature'] - (25 + 273.16)) * 0.004
    df_eng['cooling_effect'] = df_eng['wind_speed'] / (df_eng['module_temperature'] + 1e-8)
    
    # 3. Environmental impact features
    df_eng['effective_irradiance'] = df_eng['irradiance'] * (1 - df_eng['cloud_coverage'] / 100)
    df_eng['temp_impact'] = df_eng['temperature'] * df_eng['soiling_ratio']
    df_eng['irradiance_temp_interaction'] = df_eng['irradiance'] * df_eng['module_temperature']
    df_eng['normalized_temp'] = (df_eng['module_temperature'] - 273.16) / 100
    df_eng['cleaned_irradiance'] = df_eng['irradiance'] * (1 - df_eng['soiling_ratio'])
    df_eng['irradiance_adjusted'] = df_eng['irradiance'] * ((100 - df_eng['cloud_coverage']) / 100)
    
    # 4. Panel degradation factors
    df_eng['degradation_factor'] = 1 - (df_eng['panel_age'] * 0.007)
    df_eng['maintenance_impact'] = df_eng['maintenance_count'] / (df_eng['panel_age'] + 1e-8)
    df_eng['normalized_age'] = df_eng['panel_age'] / df_eng['panel_age'].max()
    df_eng['age_maintenance_ratio'] = df_eng['panel_age'] / (df_eng['maintenance_count'] + 1e-8)
    df_eng['panel_efficiency'] = 1 - (0.005 * df_eng['panel_age']) - (0.01 * df_eng['maintenance_count'] / (df_eng['panel_age'] + 1e-8))
    
    # 5. Polynomial features for key variables
    df_eng['temp_squared'] = df_eng['module_temperature'] ** 2
    df_eng['irradiance_squared'] = df_eng['irradiance'] ** 2
    
    # 6. Weather interaction features
    df_eng['pressure_humidity_interaction'] = df_eng['pressure'] * df_eng['humidity']
    df_eng['wind_cloud_interaction'] = df_eng['wind_speed'] * df_eng['cloud_coverage']
    
    return df_eng

# Apply feature engineering
df_eng = engineer_features(df_clean)
test_eng = engineer_features(test)

# Ensure both dataframes have the same columns (except 'efficiency' and 'id')
train_cols = set(df_eng.columns)
test_cols = set(test_eng.columns)

print("Checking for column mismatches...")
if train_cols != test_cols:
    print("Missing in test:", train_cols - test_cols)
    print("Missing in train:", test_cols - train_cols)


Checking for column mismatches...
Missing in test: {'efficiency'}
Missing in train: set()


In [7]:
# One-hot encoding for categorical variables
def one_hot_enc(df):
    df_eng = df.copy()
    categorical_cols = ['string_id', 'error_code', 'installation_type']
    
    # Fill missing values
    for col in categorical_cols:
        df_eng[col] = df_eng[col].fillna(f'unknown_{col}')
    
    # One-hot encode
    df_eng = pd.get_dummies(df_eng, columns=categorical_cols, drop_first=False, dtype=int)
    
    # Drop 'unknown' dummy columns
    for col in categorical_cols:
        unknown_col = f"{col}_unknown_{col}"
        if unknown_col in df_eng.columns:
            df_eng.drop(columns=unknown_col, inplace=True)
            
    return df_eng

# Apply one-hot encoding
df_encoded = one_hot_enc(df_eng)
test_encoded = one_hot_enc(test_eng)


In [8]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split

X = df_encoded.drop(['id', 'efficiency'], axis=1)
y = df_encoded['efficiency']
X_test = test_encoded.drop(['id'], axis=1)

# Create train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure columns match
common_cols = list(set(X.columns) & set(X_test.columns))
X = X[common_cols]
X_test = X_test[common_cols]
X_train = X_train[common_cols]
X_valid = X_valid[common_cols]

# Define the objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 10000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        "random_state": 420,
        "early_stopping_rounds": 50,
    }
model = xgb.XGBRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

# Get predictions for both train and valid
train_preds = model.predict(X_train)
valid_preds = model.predict(X_valid)

# Calculate scores for both sets
train_score = 100 * (1 - np.sqrt(mean_squared_error(y_train, train_preds)))
valid_score = 100 * (1 - np.sqrt(mean_squared_error(y_valid, valid_preds)))

print(f"Train score: {train_score:.4f}")
print(f"Valid score: {valid_score:.4f}")

return valid_score  # Optimize based on validation score

# Run Optuna optimization
print("Starting hyperparameter optimization...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500, show_progress_bar=True)


[I 2025-06-09 08:05:17,332] A new study created in memory with name: no-name-c73d3396-002f-4b0a-9aac-b3ac59c8e51f


Starting hyperparameter optimization...


  0%|          | 0/500 [00:00<?, ?it/s]

Train score: 88.3950
Valid score: 88.0431
[I 2025-06-09 08:05:17,911] Trial 0 finished with value: 88.04307536475497 and parameters: {'n_estimators': 1890, 'max_depth': 4, 'learning_rate': 0.05496240333847489, 'subsample': 0.8127251856183283, 'colsample_bytree': 0.7257842128632948, 'colsample_bylevel': 0.5872121289069223, 'colsample_bynode': 0.9807002561649341, 'reg_alpha': 8.818076350785742e-08, 'reg_lambda': 0.3108321096709725, 'min_child_weight': 9, 'gamma': 7.889358953455919, 'max_delta_step': 0, 'scale_pos_weight': 1.1503102234808904}. Best is trial 0 with value: 88.04307536475497.
Train score: 89.1571
Valid score: 88.7619
[I 2025-06-09 08:05:18,435] Trial 1 finished with value: 88.761867849336 and parameters: {'n_estimators': 1074, 'max_depth': 10, 'learning_rate': 0.06277002568459063, 'subsample': 0.725526437368696, 'colsample_bytree': 0.9638449473810302, 'colsample_bylevel': 0.7766977703310667, 'colsample_bynode': 0.8161207205152401, 'reg_alpha': 4.8056407071674895e-06, 'reg_la

In [97]:
#Saved this from previous runs

best_params = {
    "n_estimators": 4836,
    "max_depth": 4,
    "learning_rate": 0.001497800105481533,
    "subsample": 0.5489752556227843,
    "colsample_bytree": 0.939047806124023,
    "colsample_bylevel": 0.9563869745192676,
    "colsample_bynode": 0.9660075556952306,
    "reg_alpha": 8.612196860634e-07,
    "reg_lambda": 9.234917498785065e-07,
    "min_child_weight": 20,
    "gamma": 0.003943004214979823,
    "max_delta_step": 4,
    "scale_pos_weight": 8.722845026253419,
    
}


In [99]:
# Train final model with best parameters
print("\nBest params:")
for key, value in best_params.items():
    print(f"{key}: {value}")

# Train final model
final_model = xgb.XGBRegressor(**best_params, random_state=8000)
final_model.fit(X, y, verbose=False)

# Make predictions using the same columns as training
predictions = final_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],            # Make sure 'test' is defined earlier with 'id' column
    'efficiency': predictions
})

submission.to_csv('submission1.csv', index=False)
print("\ndone 1")







Best params:
n_estimators: 4836
max_depth: 4
learning_rate: 0.001497800105481533
subsample: 0.5489752556227843
colsample_bytree: 0.939047806124023
colsample_bylevel: 0.9563869745192676
colsample_bynode: 0.9660075556952306
reg_alpha: 8.612196860634e-07
reg_lambda: 9.234917498785065e-07
min_child_weight: 20
gamma: 0.003943004214979823
max_delta_step: 4
scale_pos_weight: 8.722845026253419

done 1


In [113]:
import optuna
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

# Prepare data
X = df_encoded.drop(['id', 'efficiency'], axis=1)
y = df_encoded['efficiency']
X_test = test_encoded.drop(['id'], axis=1)

# Train/Validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=8000)

# Match columns
common_cols = list(set(X_train.columns) & set(X_test.columns))
X_train = X_train[common_cols]
X_valid = X_valid[common_cols]
X_test = X_test[common_cols]

# Define Optuna objective for Ridge
def ridge_objective(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 100.0, log=True)

    model = make_pipeline(
        SimpleImputer(strategy='mean'),
        Ridge(alpha=alpha)
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    score = 100 * (1 - rmse)

    return score  # Maximize this

# Run Optuna optimization
print("🔧 Starting Ridge hyperparameter optimization...")
ridge_study = optuna.create_study(direction='maximize')
ridge_study.optimize(ridge_objective, n_trials=500, show_progress_bar=True)

# Best parameters and score
best_alpha = ridge_study.best_params['alpha']
print("✅ Best alpha:", best_alpha)
print(f"📈 Best validation score: {ridge_study.best_value:.5f}")


[I 2025-06-09 13:11:23,227] A new study created in memory with name: no-name-b138d9ff-514b-4462-a2c1-e681c3466b90


🔧 Starting Ridge hyperparameter optimization...


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2025-06-09 13:11:23,296] Trial 0 finished with value: 89.53015253189437 and parameters: {'alpha': 0.0004842280367825412}. Best is trial 0 with value: 89.53015253189437.
[I 2025-06-09 13:11:23,359] Trial 1 finished with value: 89.53016594954858 and parameters: {'alpha': 0.0015692679740106949}. Best is trial 1 with value: 89.53016594954858.
[I 2025-06-09 13:11:23,408] Trial 2 finished with value: 89.53178756774146 and parameters: {'alpha': 0.9467724349433644}. Best is trial 2 with value: 89.53178756774146.
[I 2025-06-09 13:11:23,449] Trial 3 finished with value: 89.5315054677228 and parameters: {'alpha': 16.40235234193766}. Best is trial 2 with value: 89.53178756774146.
[I 2025-06-09 13:11:23,491] Trial 4 finished with value: 89.5317871272798 and parameters: {'alpha': 9.74561004704355}. Best is trial 2 with value: 89.53178756774146.
[I 2025-06-09 13:11:23,535] Trial 5 finished with value: 89.5319445532036 and parameters: {'alpha': 2.186438429825293}. Best is trial 5 with value: 89.531

In [137]:
# Sort columns in same order
X = X[sorted(X.columns)]
X_test = X_test[sorted(X_test.columns)]

# Fit model on full data
final_ridge_model = make_pipeline(
    SimpleImputer(strategy='mean'),
    Ridge(alpha=best_alpha)
)
final_ridge_model.fit(X, y)

# Predict on test set
ridge_predictions = final_ridge_model.predict(X_test)


# Create submission file
submission2 = pd.DataFrame({
    'id': test['id'],            # Make sure 'test' is defined earlier with 'id' column
    'efficiency': predictions
})

submission2.to_csv('submission2.csv', index=False)
print("\ndone 2")



done 2


In [None]:
X_test_xgb = X_test[final_model.get_booster().feature_names]

# Make predictions using the same columns as training
predictions = 0*final_model.predict(X_test_xgb)+1*final_ridge_model.predict(X_test)

# Create submission file
finalsubmission = pd.DataFrame({
    'id': test['id'],
    'efficiency': predictions
})
finalsubmission.to_csv('submission.csv', index=False)
print("\nSubmission file created successfully!")


Submission file created successfully!
