In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on, for the whole session.
warnings.simplefilter("ignore")

In [None]:
# Load the training table and discard the identifier columns.
# errors="ignore" keeps this working when either column is absent.
ID_LIKE_COLUMNS = ["id", "Row#"]
df = pd.read_csv("train.csv").drop(columns=ID_LIKE_COLUMNS, errors="ignore")

In [None]:
# Outlier removal on the target: keep only rows whose "yield" lies within
# 1.5 * IQR of the first/third quartile (both fences inclusive).
q1 = df["yield"].quantile(0.25)
q3 = df["yield"].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
df = df[df["yield"].between(lower_fence, upper_fence)]

In [None]:
# Derived features, appended in this order:
#   bee_activity      - sum of the four bee-count columns
#   mass_times_seeds  - product of fruitmass and seeds
#   rain_per_fruit    - AverageRainingDays divided by fruitset (+1e-3 guards
#                       against division by zero)
#   upper_range_diff  - spread between max and min of the upper T range
#   lower_range_diff  - spread between max and min of the lower T range
df = df.assign(
    bee_activity=df["honeybee"] + df["bumbles"] + df["andrena"] + df["osmia"],
    mass_times_seeds=df["fruitmass"] * df["seeds"],
    rain_per_fruit=df["AverageRainingDays"] / (df["fruitset"] + 1e-3),
    upper_range_diff=df["MaxOfUpperTRange"] - df["MinOfUpperTRange"],
    lower_range_diff=df["MaxOfLowerTRange"] - df["MinOfLowerTRange"],
)

In [None]:
# Feature selection: rank every column by the absolute value of its
# correlation with "yield" and keep the strongest ones.
N_TOP_FEATURES = 19  # same count the positional slice [1:20] used to select

target = df["yield"]
features = df.drop(columns="yield")

# `correlations` still contains the target's self-correlation, sorted
# descending.
correlations = (
    pd.concat([features, target], axis=1)
    .corr()["yield"]
    .abs()
    .sort_values(ascending=False)
)

# Exclude the target by label instead of assuming it sorts into position 0
# (a feature tied at |corr| == 1.0 could otherwise displace it), and drop NaN
# correlations so constant columns can never be picked as "top" features.
# NOTE(review): the ranking uses every row of df, including rows that a later
# cell holds out for validation - confirm that this is acceptable.
top_features = (
    correlations.drop(labels="yield")
    .dropna()
    .head(N_TOP_FEATURES)
    .index.tolist()
)


In [None]:
# Build the design matrix / target and create the hold-out split.
X = df[top_features]
y = df["yield"]

# Split BEFORE fitting the scaler so validation rows never contribute to the
# mean/std used for standardisation (avoids train/validation leakage).
# Same test_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned on train only
X_val = scaler.transform(X_val_raw)

# Full scaled matrix, kept under its original name in case another cell
# reads it; it now uses the train-only statistics.
X_scaled = scaler.transform(X)


In [None]:
# Base learners for the stacked ensemble. All share random_state=42 so runs
# are repeatable.
cat = CatBoostRegressor(
    iterations=1000,
    depth=9,
    learning_rate=0.02,
    l2_leaf_reg=5,
    random_state=42,
    verbose=0,
)
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    verbosity=0,
)
lgb = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=7,
    subsample=0.9,
    # LightGBM only performs row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.9 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.9,
    random_state=42,
    # Silence LightGBM's own logging, matching the quiet settings of the
    # CatBoost and XGBoost learners.
    verbose=-1,
)
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=5,
    random_state=42,
)


In [None]:
# Stacked ensemble: the four boosted-tree regressors feed a ridge meta-learner
# whose regularisation strength is chosen by RidgeCV from 50 log-spaced
# candidates between 1e-3 and 1e2. n_jobs=-1 lets the base learners fit in
# parallel.
base_learners = [("cat", cat), ("xgb", xgb), ("lgb", lgb), ("gbr", gbr)]
meta_learner = RidgeCV(alphas=np.logspace(-3, 2, 50))
stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    n_jobs=-1,
)


In [None]:
# Train the ensemble on the training split and score it on the hold-out split.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_val)

val_r2 = r2_score(y_val, y_pred)
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))  # root of the MSE
val_mae = mean_absolute_error(y_val, y_pred)

for label, score in (("R2:", val_r2), ("RMSE:", val_rmse), ("MAE:", val_mae)):
    print(label, score)


In [None]:
# Load the test set and rebuild exactly the derived features that were
# created for the training frame, then scale with the already-fitted scaler
# and predict with the trained ensemble.
test_df = pd.read_csv("test.csv")
test_df = test_df.assign(
    bee_activity=test_df["honeybee"] + test_df["bumbles"] + test_df["andrena"] + test_df["osmia"],
    mass_times_seeds=test_df["fruitmass"] * test_df["seeds"],
    rain_per_fruit=test_df["AverageRainingDays"] / (test_df["fruitset"] + 1e-3),
    upper_range_diff=test_df["MaxOfUpperTRange"] - test_df["MinOfUpperTRange"],
    lower_range_diff=test_df["MaxOfLowerTRange"] - test_df["MinOfLowerTRange"],
)

test_features = test_df[top_features]  # same columns, same order as training
test_scaled = scaler.transform(test_features)
predictions = stack.predict(test_scaled)


In [None]:
# Write the predictions next to their test ids.
# The confirmation message is derived from the path actually written; it
# previously named a different file than the one created.
SUBMISSION_PATH = "Submission.csv"

# NOTE(review): the prediction column is called "target" while the training
# label is "yield" - confirm against the required submission format.
submission = pd.DataFrame({"id": test_df["id"], "target": predictions})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"{SUBMISSION_PATH} created!")
