<a href="https://colab.research.google.com/github/javadan/DigitalGreenCropYieldEstimateChallenge/blob/main/Meta_digigreen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip --version
!pip install xgboost joblib catboost plotly matplotlib seaborn shap lime scikit-learn featuretools


pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting shap
  Downloading shap-0.43.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (532 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting featuretools
  Downloading featuretools-1.28.0-py3-none-any.whl (619 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.2/619.2 kB[0m [31m34.2 MB/s[0m e

In [None]:
# Standard Library
import os
import json
import sys
from collections import Counter
from typing import List
import traceback
from copy import deepcopy


# Third-party Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from scipy.stats import boxcox
from scipy.optimize import minimize_scalar
from joblib import dump, load
from scipy.stats import ks_2samp
import featuretools as ft
import woodwork as ww
from featuretools.primitives import Mean, Mode, Median
from scipy.stats import zscore

# Scikit-learn
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import RFE
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# XGBoost and CatBoost
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from catboost import Pool



class Visualization:
    def __init__(self):
        """Create the plotting helper; it keeps no state, so there is nothing to set up."""

    def plot_data(self, data: pd.DataFrame):
        """
        Show histograms of ``data`` via ``DataFrame.hist``.

        Any exception raised while plotting is caught and reported on stdout
        instead of being propagated.
        """
        hist_options = {'bins': 50, 'figsize': (200, 150)}
        try:
            data.hist(**hist_options)
            plt.show()
        except Exception as plot_error:
            print(f"An error occurred while plotting data: {plot_error}")


    def plot_model_performance(self, model: BaseEstimator, target: pd.Series, predictions: pd.Series):
        """
        Print the RMSE of ``predictions`` against ``target`` and show a scatter figure.

        The figure overlays two marker traces that share the x-axis of actual
        values: the residuals (actual - predicted, red) and the predictions
        themselves (blue).

        Args:
            model: Estimator whose class name labels the printout and the figure
                title, or ``None`` for precomputed predictions (labelled
                'Precomputed').
            target: Actual values.
            predictions: Predicted values, positionally matched with ``target``.

        Any exception is caught and reported on stdout rather than raised.
        """
        try:
            # Compare with None explicitly. Truth-testing an estimator invokes its
            # __len__ when it defines one (scikit-learn ensembles do, based on
            # their fitted sub-estimators), which can raise on an unfitted model.
            model_label = model.__class__.__name__ if model is not None else 'Precomputed'

            # Compute metrics
            mse = mean_squared_error(target, predictions)
            rmse = np.sqrt(mse)

            print(f"Model: {model_label}, RMSE: {rmse}")

            # Work on plain arrays so residuals are paired by position, the same
            # pairing mean_squared_error uses, instead of by pandas index label.
            actual = np.asarray(target)
            predicted = np.asarray(predictions)
            residuals = actual - predicted

            # Single figure holding both traces.
            fig = go.Figure()

            # Residual Analysis Plot
            fig.add_trace(go.Scatter(x=actual, y=residuals, mode='markers', name='Residuals',
                                    marker=dict(size=2, color='red')))

            # Prediction vs Actual Plot
            fig.add_trace(go.Scatter(x=actual, y=predicted, mode='markers', name='Predictions',
                                    marker=dict(size=2, color='blue')))

            # Layout
            fig.update_layout(
                title=f"{model_label} - Model Performance",
                xaxis_title="Actual Values",
                yaxis_title="Predicted Values / Residuals",
                legend_title="Legend",
                font=dict(
                    family="Courier New, monospace",
                    size=12,
                    color="RebeccaPurple"
                )
            )

            # Show plot
            fig.show()

        except Exception as e:
            print(f"An error occurred while plotting model performance: {e}")

    def plot_feature_importance(self, model: BaseEstimator, X_train: pd.DataFrame, model_name: str):
        """
        Print the ten most important features and show a bar chart of all of them.

        Args:
            model: Fitted model. When ``model_name`` is 'catboost'
                (case-insensitive) importances come from
                ``model.get_feature_importance`` on a ``Pool`` built from
                ``X_train``; otherwise from ``model.feature_importances_``.
            X_train: Training features. Column names are lower-cased for display
                and a column named 'id' is excluded. The frame itself is not
                modified.
            model_name: Label used in the chart title and messages.

        Any exception (including a mismatch between the number of importances
        and the number of remaining features) is caught and reported on stdout.
        """
        try:
            # Extract feature importances
            if model_name.lower() == 'catboost':
                train_data = Pool(data=X_train)
                feature_importances = model.get_feature_importance(train_data)
            else:
                feature_importances = model.feature_importances_

            # Lower-case the names on a local list. Assigning to X_train.columns
            # would rename the caller's DataFrame columns as a side effect.
            excluded_names = {'id'}  # Add any other columns that you don't need
            feature_names = [
                name for name in (col.lower() for col in X_train.columns)
                if name not in excluded_names
            ]

            # Validate that the number of features match
            if len(feature_importances) != len(feature_names):
                raise ValueError(f"Feature importances length {len(feature_importances)} does not match number of features {len(feature_names)}.")

            # Pair names with importances, most important first.
            features_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': feature_importances
            }).sort_values(by='Importance', ascending=False)

            # Print top 10 features for textual interpretation
            print("Top 10 Features:")
            print(features_df.head(10))

            # Plot using Plotly
            fig = go.Figure([go.Bar(x=features_df['Feature'], y=features_df['Importance'])])
            fig.update_layout(
                title=f'Feature Importances for {model_name}',
                xaxis=dict(title='Feature'),
                yaxis=dict(title='Importance')
            )
            fig.show()
        except Exception as e:
            print(f"An error occurred while plotting feature importances for {model_name}: {e}")



class Preprocessing:
    def __init__(self):
        """Create the scaler, the empty fitted-state dictionaries and the column groups."""
        self.scaler = StandardScaler()

        # State filled in by the fitting steps; all start empty.
        self.categorical_columns_dummies = {}
        self.multi_cat_map = {}
        self.date_columns_mode = {}
        self.numeric_columns_mean = {}

        # Column groups, one list per kind of column.
        self.date_columns = [
            'CropTillageDate',
            'RcNursEstDate',
            'SeedingSowingTransplanting',
            'Harv_date',
            'Threshing_date',
        ]

        self.multi_categorical_columns = [
            'LandPreparationMethod',
            'NursDetFactor',
            'TransDetFactor',
            'OrgFertilizers',
            'CropbasalFerts',
            'FirstTopDressFert',
        ]

        self.categorical_columns = [
            'District',
            'Block',
            'CropEstMethod',
            'PCropSolidOrgFertAppMethod',
            'TransplantingIrrigationSource',
            'TransplantingIrrigationPowerSource',
            'MineralFertAppMethod',
            'MineralFertAppMethod.1',
            'Harv_method',
            'Threshing_method',
            'Stubble_use',
        ]

        self.numeric_columns = [
            'CultLand',
            'CropCultLand',
            'CropTillageDepth',
            'SeedlingsPerPit',
            'TransplantingIrrigationHours',
            'TransIrriCost',
            'StandingWater',
            'Ganaura',
            'CropOrgFYM',
            'NoFertilizerAppln',
            'BasalDAP',
            'BasalUrea',
            '1tdUrea',
            '1appDaysUrea',
            '2tdUrea',
            '2appDaysUrea',
            'Harv_hand_rent',
            'Residue_length',
            'Residue_perc',
            'Acre',
        ]


    def fit_date_columns(self, data: pd.DataFrame, date_columns: List[str]):
        earliest_date = None
        for column in date_columns:
            data[column] = pd.to_datetime(data[column], errors='coerce')
            self.date_columns_mode[column] = data[column].mode()[0] if not data[column].mode().empty else None

            # Determine the earliest date in this column
            min_date = data[column].min()
            if earliest_date is None or min_date < earliest_date:
                earliest_date = min_date
        self.earliest_date = earliest_date

    def transform_date_columns(self, data: pd.DataFrame, date_columns: List[str]) -> pd.DataFrame:
        for column in date_columns:
            # Skip processing if the column has already been transformed
            if f'Days_from_earliest_to_{column}' in data.columns:
                continue

            # Convert to datetime, if not already
            if not pd.api.types.is_datetime64_any_dtype(data[column]):
                data[column] = pd.to_datetime(data[column], errors='coerce')

            # Fill missing values with mode
            mode = self.date_columns_mode[column]
            if mode:
                data[column].fillna(mode, inplace=True)

            # Create new features and ensure they are of type int
            data[f'Days_from_earliest_to_{column}'] = (data[column] - self.earliest_date).dt.days
            data[f'Days_from_earliest_to_{column}'] = data[f'Days_from_earliest_to_{column}'].astype(int)

            data[column+'_year'] = data[column].dt.year.astype(int)
            data[column+'_month'] = data[column].dt.month.astype(int)
            data[column+'_day'] = data[column].dt.day.astype(int)

            # Drop the original date column
            data.drop(columns=column, inplace=True)

        return data

    def basic_impute(self, data):
        """
        Fill missing values in ``data`` and report what is still missing.

        int64/float64 columns are filled with the column mean and
        object/category columns with the most frequent value, both through
        ``SimpleImputer`` fitted on ``data`` itself. The frame is modified in
        place and returned.
        """
        print("Starting basic imputation...")

        # Numeric columns: mean imputation.
        mean_filler = SimpleImputer(strategy='mean')
        number_like = data.select_dtypes(include=['int64', 'float64']).columns
        data[number_like] = mean_filler.fit_transform(data[number_like])

        # Categorical columns: most-frequent imputation.
        mode_filler = SimpleImputer(strategy='most_frequent')
        label_like = data.select_dtypes(include=['object', 'category']).columns
        data[label_like] = mode_filler.fit_transform(data[label_like])

        # Report any column that still contains NaN.
        missing_per_column = data.isna().sum()
        still_missing = missing_per_column.sum() > 0
        if not still_missing:
            print("No NaN values present after basic imputation.")
        else:
            print("Warning: NaNs present after basic imputation")
            print(missing_per_column[missing_per_column > 0])

        print("ID column status:", 'ID' in data.columns)

        return data


    def split_multicategorical(self, data, column):
        unique_categories = set()
        data[column].dropna().str.split(' ').apply(unique_categories.update)

        for category in unique_categories:
            data[f'{column}_{category}'] = data[column].apply(lambda x: 1 if category in str(x).split(' ') else 0)

        return data

    def preprocess_multicategorical_columns(self, data, multi_categorical_columns):
        for column in multi_categorical_columns:
            data = self.split_multicategorical(data, column)
        return data

    def encode_categorical_columns(self, data):
        """
        One-hot encode every object/category column of ``data`` except 'ID'.

        A new ``OneHotEncoder`` is fitted on ``data`` on each call. The encoded
        columns replace the originals, and the returned frame has a fresh
        0..n-1 index.
        """
        # Columns to encode: all object/category columns, minus the identifier.
        to_encode = data.select_dtypes(include=['object', 'category']).columns
        if 'ID' in to_encode:
            to_encode = to_encode.drop('ID')

        # One-Hot Encoding
        one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        indicator_frame = pd.DataFrame(
            one_hot.fit_transform(data[to_encode]),
            columns=one_hot.get_feature_names_out(to_encode),
        ).reset_index(drop=True)

        # Swap the raw categorical columns for their indicator columns.
        remaining = data.drop(columns=to_encode).reset_index(drop=True)
        return pd.concat([remaining, indicator_frame], axis=1)

    def create_entity_set(self, data, date_columns, multi_categorical_columns):
        """
        Build a Featuretools EntitySet that holds ``data`` as 'TableData'.

        Steps, in order: expand the multi-categorical columns, coerce the date
        columns to datetime, impute numeric columns with the mean and
        object/category columns with the most frequent value, initialise
        Woodwork typing with 'ID' as the index, and add the frame to a new
        EntitySet with id 'CropData'.

        Returns:
            The EntitySet containing the single 'TableData' dataframe.
        """
        # Preprocessing multi-categorical columns
        data = self.preprocess_multicategorical_columns(data, multi_categorical_columns)

        # Creating an EntitySet
        entity_set = ft.EntitySet(id="CropData")

        # Convert date columns to datetime if they are not already
        for name in date_columns:
            if name not in data.columns:
                continue
            if pd.api.types.is_datetime64_any_dtype(data[name]):
                continue
            data[name] = pd.to_datetime(data[name], errors='coerce')

        # Impute missing values
        mean_filler = SimpleImputer(strategy='mean')
        mode_filler = SimpleImputer(strategy='most_frequent')

        number_like = data.select_dtypes(include=[np.number]).columns
        label_like = data.select_dtypes(include=['object', 'category']).columns

        data[number_like] = mean_filler.fit_transform(data[number_like])
        data[label_like] = mode_filler.fit_transform(data[label_like])

        # Woodwork logical type per column, built group by group.
        types = ww.logical_types
        column_types = {}

        for name in ("CropTillageDate", "RcNursEstDate", "SeedingSowingTransplanting",
                     "Harv_date", "Threshing_date"):
            column_types[name] = types.Datetime

        for name in ("District", "Block", "CropEstMethod", "PCropSolidOrgFertAppMethod",
                     "TransplantingIrrigationSource", "TransplantingIrrigationPowerSource",
                     "MineralFertAppMethod", "MineralFertAppMethod.1",
                     "Harv_method", "Threshing_method", "Stubble_use"):
            column_types[name] = types.Categorical

        # Numeric columns are Double unless listed here as Integer.
        integer_columns = {"CultLand", "CropCultLand", "CropTillageDepth",
                           "NoFertilizerAppln", "Residue_length", "Residue_perc"}
        for name in ("CultLand", "CropCultLand", "CropTillageDepth", "SeedlingsPerPit",
                     "TransplantingIrrigationHours", "TransIrriCost", "StandingWater",
                     "Ganaura", "CropOrgFYM", "NoFertilizerAppln", "BasalDAP", "BasalUrea",
                     "1tdUrea", "1appDaysUrea", "2tdUrea", "2appDaysUrea", "Harv_hand_rent",
                     "Residue_length", "Residue_perc", "Acre"):
            column_types[name] = types.Integer if name in integer_columns else types.Double

        # Initialize Woodwork logical types
        data.ww.init(name="TableData", index="ID", logical_types=column_types)

        # Adding the DataFrame to the EntitySet
        entity_set = entity_set.add_dataframe(dataframe_name="TableData", dataframe=data)

        return entity_set



    def generate_interaction_features(self, entity_set):
        """
        Run Deep Feature Synthesis on 'TableData' and join the result to the table.

        DFS is limited to depth 1 with the 'add_numeric' and 'multiply_numeric'
        transform primitives, and the 'Yield' column is excluded from feature
        generation. The generated feature matrix is merged onto the original
        'TableData' frame by index.
        """
        dfs_options = dict(
            entityset=entity_set,
            target_dataframe_name="TableData",
            trans_primitives=['add_numeric', 'multiply_numeric'],
            max_depth=1,
            # Keep the target variable out of the generated features.
            ignore_columns={'TableData': ['Yield']},
        )
        feature_matrix, _feature_defs = ft.dfs(**dfs_options)

        # Original table as stored in the entity set.
        base_table = entity_set['TableData']

        return base_table.merge(feature_matrix, left_index=True, right_index=True)

    def generate_target_independent_features(self, data):
        """
        Produce the Featuretools interaction features for ``data``.

        Builds the entity set from ``data`` using this instance's date and
        multi-categorical column lists, then returns the output of
        ``generate_interaction_features`` for it.
        """
        return self.generate_interaction_features(
            self.create_entity_set(data, self.date_columns, self.multi_categorical_columns)
        )

    def generate_target_dependent_features(self, data):
        # Implement feature generation that uses the target variable
        # Example: data['new_feature'] = data['feature1'] * data['target']
        # Add your target-dependent feature generation logic here
        return data


class Models:

    def __init__(self):
        """Set the data path, CV splitter, base regressors, ensemble and hyperparameter grids."""
        self.data_path = '/content/drive/MyDrive/digigreen/'
        self.rfe_columns = []

        self.kf = KFold(n_splits=5, shuffle=True, random_state=42)
        self.rmse_scores = {name: [] for name in ('model1', 'model2', 'model3', 'ensemble_model')}

        # Base regressors.
        self.model1 = CatBoostRegressor(random_state=123, verbose=0, od_type='Iter', od_wait=20)
        self.model2 = XGBRegressor(random_state=123)
        self.model3 = RandomForestRegressor(random_state=123)

        base_estimators = [('catboost', self.model1), ('xgb', self.model2), ('rf', self.model3)]
        self.ensemble_model = VotingRegressor(base_estimators)

        random_forest_grid = {
            'n_estimators': [100, 200, 300],      # Number of trees in the forest
            'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
            'max_features': ['sqrt', 'log2'],     # Features considered when looking for the best split
            'min_samples_split': [2, 5, 10],      # Minimum samples required to split an internal node
            'min_samples_leaf': [1, 2, 4],        # Minimum samples required at a leaf node
            'bootstrap': [True, False],           # Whether bootstrap samples are used when building trees
        }

        xgboost_grid = {
            'n_estimators': [100, 200, 300],         # Number of boosting rounds
            'max_depth': [10, 20, 30],               # Maximum tree depth for base learners
            'learning_rate': [0.01, 0.05, 0.1],      # Boosting learning rate
            'subsample': [0.5, 0.6, 0.7, 0.8],       # Subsample ratio of the training instances
            'colsample_bytree': [0.6, 0.7, 0.8],     # Subsample ratio of columns per tree
            'gamma': [0, 0.1, 0.2],                  # Minimum loss reduction required for a further partition
        }

        catboost_grid = {
            'depth': [6, 8, 10],                     # Depth of the tree
            'learning_rate': [0.01, 0.05, 0.1],      # Learning rate
            'iterations': [300, 500, 700, 1000],     # Maximum number of trees that can be built
            'l2_leaf_reg': [1, 3, 5, 7, 9],          # Coefficient of the L2 regularization term
        }

        # Search spaces keyed by the short model name used in perform_grid_search.
        self.param_grids = {
            'rf': random_forest_grid,
            'xgb': xgboost_grid,
            'catboost': catboost_grid,
        }

    @staticmethod
    def inv_boxcox(y_transformed, lambda_best_fit):
        if lambda_best_fit == 0:
            return np.exp(y_transformed)
        else:
            positive_part = lambda_best_fit * y_transformed + 1
            if np.any(positive_part <= 0):
                raise ValueError("Invalid value encountered in inv_boxcox")
            return np.exp(np.log(positive_part) / lambda_best_fit)



    def load_params(self, filename):
        with open(filename, 'r') as f:
            return json.load(f)

    def save_params(self, params, filename):
        with open(filename, 'w') as f:
            json.dump(params, f)

    def perform_grid_search(self, model, param_grid, X_train, y_train, model_name):
        file_name = self.data_path + f'best_params_{model_name}.json'

        if os.path.exists(file_name):
            best_params = self.load_params(file_name)
        else:
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            self.save_params(grid_search.best_params_, file_name)
            best_params = grid_search.best_params_

        return best_params

    def tune_hyperparameters(self, X_train, y_train):
        try:
            print("Tuning hyperparameters for CatBoost")
            best_params_cat = self.perform_grid_search(self.model1, self.param_grids['catboost'], X_train, y_train, 'catboost')
            self.model1.set_params(**best_params_cat)
            print(f"Best parameters for CatBoost: {best_params_cat}")

            print("Tuning hyperparameters for XGBoost")
            best_params_xgb = self.perform_grid_search(self.model2, self.param_grids['xgb'], X_train, y_train, 'xgb')
            self.model2.set_params(**best_params_xgb)
            print(f"Best parameters for XGBoost: {best_params_xgb}")

            print("Tuning hyperparameters for RandomForest")
            best_params_rf = self.perform_grid_search(self.model3, self.param_grids['rf'], X_train, y_train, 'rf')
            self.model3.set_params(**best_params_rf)
            print(f"Best parameters for RandomForest: {best_params_rf}")

        except Exception as e:
            print(f"An error occurred while tuning hyperparameters: {e}")


    def train_model(self, data: pd.DataFrame, target: pd.Series):
        """
        Select features, cross-validate three regressors and fit a weighted ensemble.

        Steps:
          1. Load the RFE selector cached as 'lastone.joblib' under
             ``self.data_path``, or fit a new one (RandomForest based, 75
             features) and cache it. Keep only the selected columns.
          2. Estimate the Box-Cox lambda on ``target + 1``; every model is
             trained on the transformed target.
          3. For each split of ``self.kf`` fit CatBoost, XGBoost, RandomForest
             and an unweighted ``VotingRegressor``, store the out-of-fold
             predictions and the per-fold RMSE. Both are expressed on the
             original target scale.
          4. Weight the three models by their inverse mean RMSE and fit the
             final ``VotingRegressor`` on all rows.

        Results are stored on the instance: ``selected_features``,
        ``rfe_columns``, ``lambda_best_fit``, ``rmse_scores``, the
        ``oof_predictions_*`` arrays, ``oof_targets``, ``ensemble_model`` and
        ``model1``..``model3``.

        Args:
            data: Feature matrix; its column count must equal the length of the
                RFE selector's support mask.
            target: Target values; 1 is added before the Box-Cox transform.

        Any exception is caught and reported on stdout together with the
        traceback; it is not re-raised.
        """
        # Bound before the try block so the diagnostics in the handler can always
        # be evaluated, even when the failure happens before the first fold.
        X_train_fold = None
        y_train_transformed = None

        try:
            # Initialize RMSE scores dictionary
            self.rmse_scores = {'ensemble_model1': [], 'ensemble_model2': [], 'ensemble_model3': [], 'ensemble_model': []}

            print(f"Number of features in data: {data.shape[1]}")
            print(f"Number of features in target: {target.shape}")

            # Feature selection using RFE
            rfe_model_path = self.data_path + 'lastone.joblib'
            if os.path.exists(rfe_model_path):
                # Load the existing RFE model.
                # NOTE: joblib.load unpickles arbitrary objects; only load files
                # that were produced by this notebook.
                print("Loading RFE model")
                selector = load(rfe_model_path)
                self.selected_features = selector.support_
            else:
                # Create and fit a new RFE model
                print("Fitting RFE model")
                selector = RFE(RandomForestRegressor(n_estimators=150, random_state=42), n_features_to_select=75, step=0.03)
                selector = selector.fit(data, target)
                self.selected_features = selector.support_
                # Save the fitted RFE model
                dump(selector, rfe_model_path)

            print(f"Number of features selected by RFE: {sum(selector.support_)}")

            if data.shape[1] != len(self.selected_features):
                raise ValueError("Mismatch in the number of features between the data and the RFE model.")

            # Filter the data based on the selected features
            data = data.loc[:, self.selected_features]
            # NOTE(review): this stores the filtered frame itself, not just its
            # column names - confirm that is what readers of rfe_columns expect.
            self.rfe_columns = data

            # Initialize arrays for out-of-fold predictions
            oof_predictions_ensemble1 = np.zeros(len(target))
            oof_predictions_ensemble2 = np.zeros(len(target))
            oof_predictions_ensemble3 = np.zeros(len(target))
            oof_predictions_ensemble = np.zeros(len(target))

            # Initialize array for actual targets
            oof_targets = np.zeros(len(target))

            # Estimate lambda once on the whole target; add 1 to avoid zero values.
            _, self.lambda_best_fit = boxcox(target + 1)

            # Identify and print all columns with 'object' data type
            object_columns = data.select_dtypes(include=['object']).columns
            print("Columns with 'object' data type:", object_columns.tolist())

            # Cross-validation loop
            for fold_number, (train_index, val_index) in enumerate(self.kf.split(data)):

                # Instantiate models for this fold
                ensemble_model1 = CatBoostRegressor(random_state=123, verbose=0, od_type='Iter', od_wait=20)
                ensemble_model2 = XGBRegressor(random_state=123, objective='reg:squarederror', gamma=2)
                ensemble_model3 = RandomForestRegressor(random_state=123)

                X_train_fold, X_val_fold = data.iloc[train_index], data.iloc[val_index]
                y_train_fold, y_val_fold = target.iloc[train_index], target.iloc[val_index]

                # Apply the pre-computed Box-Cox transformation
                y_train_transformed = boxcox(y_train_fold + 1, lmbda=self.lambda_best_fit)

                # Train each ensemble model on the training folds and get predictions
                for model_name, model, oof_array in zip(['ensemble_model1', 'ensemble_model2', 'ensemble_model3'],
                                                        [ensemble_model1, ensemble_model2, ensemble_model3],
                                                        [oof_predictions_ensemble1, oof_predictions_ensemble2, oof_predictions_ensemble3]):
                    print(f"Training {model_name} of type {type(model)} on fold {fold_number}, ID: {id(model)}")
                    model.fit(X_train_fold, y_train_transformed)
                    pred = model.predict(X_val_fold)

                    # Back to the original scale: clip, invert Box-Cox, undo the +1.
                    pred_transformed = np.clip(pred, 1e-10, None)
                    final_pred = self.inv_boxcox(pred_transformed, self.lambda_best_fit) - 1

                    oof_array[val_index] = final_pred

                    # Score on the original scale. y_val_fold is untransformed, so
                    # comparing it with the Box-Cox-scale `pred` would mix scales.
                    rmse = np.sqrt(mean_squared_error(y_val_fold, final_pred))
                    self.rmse_scores[model_name].append(rmse)

                # Train temporary ensemble model
                print(f"Training temporary ensemble model on fold {fold_number}")
                temp_ensemble_model1 = deepcopy(ensemble_model1)
                temp_ensemble_model2 = deepcopy(ensemble_model2)
                temp_ensemble_model3 = deepcopy(ensemble_model3)

                temp_ensemble_model = VotingRegressor(estimators=[('ensemble_model1', temp_ensemble_model1),
                                                                  ('ensemble_model2', temp_ensemble_model2),
                                                                  ('ensemble_model3', temp_ensemble_model3)])
                temp_ensemble_model.fit(X_train_fold, y_train_transformed)
                pred = temp_ensemble_model.predict(X_val_fold)
                pred_transformed = np.clip(pred, 1e-10, None)
                final_pred = self.inv_boxcox(pred_transformed, self.lambda_best_fit) - 1
                oof_predictions_ensemble[val_index] = final_pred
                # Same original-scale scoring as for the individual models.
                rmse = np.sqrt(mean_squared_error(y_val_fold, final_pred))
                self.rmse_scores['ensemble_model'].append(rmse)

                oof_targets[val_index] = y_val_fold.to_numpy()

            # Store these as class attributes
            self.oof_predictions_model1 = oof_predictions_ensemble1
            self.oof_predictions_model2 = oof_predictions_ensemble2
            self.oof_predictions_model3 = oof_predictions_ensemble3
            self.oof_predictions_ensemble = oof_predictions_ensemble

            self.oof_targets = oof_targets

            print("True target Min/Max:", np.min(self.oof_targets), np.max(self.oof_targets))
            print("OOF predictions Min/Max:", np.min(self.oof_predictions_ensemble), np.max(self.oof_predictions_ensemble))

            # Average RMSE over all folds
            avg_rmse_scores = {model: np.mean(scores) for model, scores in self.rmse_scores.items()}
            print("Average RMSE scores:", avg_rmse_scores)

            # Inverse-RMSE weights for the three base models; 'ensemble_model'
            # is excluded from the weight calculation.
            total = sum(1 / score for model, score in avg_rmse_scores.items() if model != 'ensemble_model')
            weights = [(1 / score) / total for model, score in avg_rmse_scores.items() if model != 'ensemble_model']
            print("Calculated weights:", weights)

            # Create the ensemble model with calculated weights
            self.ensemble_model = VotingRegressor(estimators=[('model1', ensemble_model1),
                                                              ('model2', ensemble_model2),
                                                              ('model3', ensemble_model3)],
                                                  weights=weights)

            # Retrain ensemble on the full training data
            y_transformed = boxcox(target + 1, lmbda=self.lambda_best_fit)  # Transform the full target data
            self.ensemble_model.fit(data, y_transformed)

            self.model1 = ensemble_model1
            self.model2 = ensemble_model2
            self.model3 = ensemble_model3

        except Exception as e:
            print(f"An error occurred while training the models: {e}")
            print(traceback.format_exc())
            # Debug: print data types at the point of failure, when a fold had
            # already been prepared.
            if X_train_fold is not None:
                print("Data types in X_train_fold at failure:", X_train_fold.dtypes)
            if y_train_transformed is not None:
                print("Data type of y_train_transformed at failure:", y_train_transformed.dtype)

    def predict(self, data: pd.DataFrame, is_validation: bool, rmse: float = None, iteration: int = None) -> pd.DataFrame:
        try:
            # Check if 'ID' and 'Acre' columns are present
            has_id = 'ID' in data.columns
            has_acre = 'Acre' in data.columns

            # Handle 'ID' column if present
            if has_id:
                id_series = data['ID'].copy()
                data = data.drop(columns=['ID'])
            else:
                print(f"No 'ID' column present in the data.")

            # Select relevant features for prediction
            data = data.loc[:, self.selected_features]

            # Predict using the ensemble model
            ensemble_predictions = self.ensemble_model.predict(data)
            final_predictions_positive = np.clip(ensemble_predictions, 1e-10, None)

            # Apply inverse Box-Cox transformation
            unboxcoxed_predictions = self.inv_boxcox(final_predictions_positive, self.lambda_best_fit) - 1

            # Calculate Z-scores for the predictions
            z_scores = zscore(unboxcoxed_predictions)

            # Define a threshold for considering a prediction as an outlier
            outlier_threshold = 2.8
            average_yield_to_acre_ratio = 1967.61  # Calculated from the training data

            # Adjust outliers using 'Acre' if available, else use mean
            for i, z in enumerate(z_scores):
                if abs(z) > outlier_threshold:
                    if has_acre:
                        # Adjust the outlier prediction based on Acre
                        unboxcoxed_predictions[i] = data['Acre'].iloc[i] * average_yield_to_acre_ratio
                    else:
                        # Adjust the outlier prediction to the mean if 'Acre' not available
                        unboxcoxed_predictions[i] = np.mean(unboxcoxed_predictions)

            # Construct the submission DataFrame
            if has_id:
                submission_df = pd.DataFrame({'ID': id_series, 'Yield': unboxcoxed_predictions})
            else:
                submission_df = pd.DataFrame({'Yield': unboxcoxed_predictions})

            # Generate filename and save file if not in validation mode
            if not is_validation:
                filename_suffix = f"_RMSE{rmse:.2f}" if rmse is not None else "_RMSEDefault"
                filename_suffix += f"_TotalYield{np.sum(unboxcoxed_predictions):.2f}"
                if iteration is not None:
                    filename_suffix += f"_Iter{iteration}"
                filename = f'EnsembleSubmission{filename_suffix}.csv'
                submission_df.to_csv(filename, index=False)
                print(f"Submission file created: {filename}")

            return submission_df

        except Exception as e:
            print(f"An error occurred while making predictions: {e}")
            raise


class Main:

    def __init__(self, test_size=.2):
        # CV loop params
        self.max_iterations = 50
        self.iterations_between_saves = 1
        self.performance_threshold = 0.00
        self.previous_rmse = float('inf')
        self.iteration = 0
        self.test_size = test_size



        self.train_data = pd.DataFrame()
        self.val_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

        self.preprocessing = Preprocessing()
        self.models = Models()
        self.visualization = Visualization()


        self.data_path = f'/content/drive/MyDrive/digigreen/'
        self.train_csv = self.data_path + 'Train.csv'
        self.test_csv = self.data_path + 'Test.csv'


    def stratified_split(self, data, test_size=0.2, random_state=42, stratify_col='Acre'):
        # Ensure the stratify column is valid and in the data
        if stratify_col and stratify_col in data.columns:
            train, val = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[stratify_col])
        else:
            # Fallback to random split if no valid stratify column is provided
            train, val = train_test_split(data, test_size=test_size, random_state=random_state)

        return train, val


    def find_best_split(self, train_data, test_data, numeric_columns, max_iterations=100, similarity_threshold=0.05, test_size=0.2, random_state=42):
        best_p_value_avg = float('inf')
        best_split = None
        imputer = SimpleImputer(strategy="median")  # Define the imputer outside the loop
        skipped_features_count = 0

        for iteration in range(max_iterations):
            # current_train, current_val = train_test_split(train_data, test_size=test_size, random_state=iteration + random_state)
            current_train, current_val = self.stratified_split(train_data, test_size=test_size, random_state=iteration + random_state, stratify_col='Block')

            p_values = []
            for feature in numeric_columns:
                if feature in current_val.columns and feature in test_data.columns:
                    train_feature = imputer.fit_transform(current_train[[feature]].values)
                    test_feature = imputer.transform(test_data[[feature]].values)

                    if np.isnan(train_feature).any() or np.isnan(test_feature).any():
                        skipped_features_count += 1
                    else:
                        _, p_value = ks_2samp(train_feature.ravel(), test_feature.ravel())
                        p_values.append(p_value)

            p_value_avg = np.nan if not p_values else np.mean(p_values)
            print(f"Iteration {iteration + 1}/{max_iterations}, Average p-value: {p_value_avg}")

            if p_value_avg < best_p_value_avg:
                best_p_value_avg = p_value_avg
                best_split = (current_train, current_val)

            if p_value_avg >= similarity_threshold:
                break

        if not best_split:
            print(f"No suitable split found based on KS test. Using random split. Skipped features due to NaNs: {skipped_features_count}")
            return train_test_split(train_data, test_size=test_size, random_state=random_state)
        else:
            print(f"Using split based on KS test. Skipped features due to NaNs: {skipped_features_count}")
            return best_split

    def load_data_and_preprocess(self, seed=42):
        try:
            print(f"Random seed is : {seed}")

            # Load train and test data
            self.train_data = pd.read_csv(self.train_csv)
            self.test_data = pd.read_csv(self.test_csv)
            print(f"Training data size: {self.train_data.shape}")
            print(f"Test data size: {self.test_data.shape}")

            # Preprocess train and test data separately
            self.preprocess_train_data(self.train_data)
            self.preprocess_test_data(self.test_data)

            # Handle infinities and NaNs
            self.train_data = self.replace_infinities(self.train_data)
            numeric_columns = self.train_data.select_dtypes(include=['number']).columns

            print("Filling NaN values for numeric columns in training data...")
            for col in numeric_columns:
                if self.train_data[col].isna().sum() > 0:
                    print(f"Filling NaN values in column: {col}")
                    self.train_data[col] = self.train_data[col].astype('float32').fillna(0) #self.train_data[col].mean(numeric_only=True))

            #print("Current train data columns:", self.train_data.columns)
            numeric_columns = [col for col in numeric_columns if col in self.train_data.columns and col in self.test_data.columns]

            # Find a better split for training and validation
            self.train_data, self.val_data = self.find_best_split(
                train_data=self.train_data,
                test_data=self.test_data,
                numeric_columns=numeric_columns,
                max_iterations=100,
                similarity_threshold=0.648,
                test_size=self.test_size,
                random_state=seed
            )


            self.val_data = self.align_with_train_data(self.val_data, self.train_data)

            print(f"Training data size post-split: {self.train_data.shape}")
            print(f"Validation data size post-split: {self.val_data.shape}")


        except Exception as e:
            print(f"Error during data loading and preprocessing: {e}")

            print(traceback.format_exc())

    def align_with_train_data(self, dataset, train_data):
        # Add missing columns and fill with zeros
        missing_cols = set(train_data.columns) - set(dataset.columns) - {'ID'}
        for col in missing_cols:
            dataset[col] = 0

        # Reorder columns to match train_data, keeping 'Yield' if present in dataset
        common_columns = [col for col in train_data.columns if col in dataset.columns or col == 'Yield']
        dataset = dataset[common_columns]

        return dataset


    def remove_specified_number_of_outliers(self, data, acre_col, yield_col, num_outliers):
        """
        Remove a specified number of outliers based on polar coordinates.

        :param data: DataFrame with the data.
        :param acre_col: Column name for 'Acre'.
        :param yield_col: Column name for 'Yield'.
        :param num_outliers: Number of outliers to remove.
        :return: DataFrame with specified number of outliers removed.
        """
        # Calculate the polar angle for each data point
        data['Polar_Angle'] = np.arctan2(data[yield_col], data[acre_col])

        # Sort the data by polar angle in descending order
        sorted_data = data.sort_values('Polar_Angle', ascending=False)

        # Remove the top 'num_outliers' data points and reset the index
        filtered_data = sorted_data.iloc[num_outliers:].drop(columns=['Polar_Angle']).reset_index(drop=True)

        return filtered_data


    def replace_infinities(self, data):
        data = data.replace([np.inf, -np.inf], np.nan)
        return data

    def print_data_summary(self, data, step_description):
        pass
        #print(f"\nData Summary after {step_description}:")
        #print("Data types:\n", data.dtypes)
        #print("Sample data:\n", data.head())

    def print_duplicate_columns(self, data, step_description):
        duplicate_columns = data.columns[data.columns.duplicated()]
        if len(duplicate_columns) > 0:
            print(f"Duplicate columns after {step_description}: {duplicate_columns.tolist()}")
        else:
            print(f"No duplicate columns after {step_description}.")


    def check_for_nans(self, data, step_description):
        nan_counts = data.isna().sum()
        if nan_counts.sum() > 0:
            print(f"Warning: NaNs present after {step_description}")
            print(nan_counts[nan_counts > 0])
        else:
            print(f"No NaN values present after {step_description}.")


    def preprocess_train_data(self, train_data):
        print("Preprocessing training data...")

        # Basic imputation
        train_data = self.preprocessing.basic_impute(train_data)

        # Remove 15 outliers
        train_data = self.remove_specified_number_of_outliers(train_data, 'Acre', 'Yield', 15)

        # Generate target-independent features
        train_data = self.preprocessing.generate_target_independent_features(train_data)

        # Fit and transform date columns
        self.preprocessing.fit_date_columns(train_data, self.preprocessing.date_columns)
        train_data = self.preprocessing.transform_date_columns(train_data, self.preprocessing.date_columns)

        # Generate target-dependent features (only for training set)
        train_data = self.preprocessing.generate_target_dependent_features(train_data)

        # Encode categorical columns
        train_data = self.preprocessing.encode_categorical_columns(train_data)

        # Remove duplicate columns
        train_data = train_data.loc[:,~train_data.columns.duplicated()]

        self.train_data = train_data
        print("Training data preprocessed")
        return train_data

    def preprocess_test_data(self, test_data):
        print("Preprocessing test data...")

        # Basic imputation
        test_data = self.preprocessing.basic_impute(test_data)

        # Generate target-independent features
        test_data = self.preprocessing.generate_target_independent_features(test_data)

        # Handle dates
        test_data = self.preprocessing.transform_date_columns(test_data, self.preprocessing.date_columns)

        # Encode categorical columns
        test_data = self.preprocessing.encode_categorical_columns(test_data)


        # Remove duplicate columns
        test_data = test_data.loc[:,~test_data.columns.duplicated()]

        self.test_data = test_data

        self.print_data_summary(test_data, "after One-Hot Encoding (TEST DATA)")
        print("Test data preprocessed")
        return test_data

    def train_models(self):

        # Prepare X_train and target
        self.X_train = self.train_data.drop(['ID', 'Yield'], axis=1)
        target = self.train_data['Yield']

        # Replace infinities and check for NaNs
        self.X_train = self.replace_infinities(self.X_train)
        self.check_for_nans(self.X_train, "After replacing infinities")

        # Final check and fill NaNs
        self.X_train.fillna(self.X_train.mean(numeric_only=True), inplace=True)
        self.check_for_nans(self.X_train, "After final NaN fill")

        # Ensure X_train and X_val have the same columns
        self.X_val = self.align_with_train_data(self.val_data, self.train_data)

        self.models.train_model(self.X_train, target)


    def evaluate_models(self, rmse=None, iteration=None):
        val_predictions = self.validation_predict(rmse=rmse, iteration=iteration)
        new_rmse = np.sqrt(mean_squared_error(self.val_data['Yield'], val_predictions))
        return new_rmse, val_predictions


    #This should already be preprocessed, as it's split after preprocessing train_data
    def validation_predict(self, rmse=None, iteration=None):

        # Separate features from possible target (if exists)
        self.X_val = self.val_data.drop('Yield', axis=1, errors='ignore')

        # Align the test data with the training data
        missing_cols = set(self.X_train.columns) - set(self.X_val.columns)
        for c in missing_cols:
            self.X_val[c] = 0

        self.X_val = self.X_val[self.X_train.columns]

        print(f"Length of X_train columns: {len(self.X_train.columns)}")
        print(f"Length of X_val columns: {len(self.X_val.columns)}")


        predictions = self.models.predict(self.X_val, is_validation=True, rmse=rmse, iteration=iteration)


        # If predictions include the 'ID', extract only the 'Yield' column
        if isinstance(predictions, pd.DataFrame) and 'Yield' in predictions.columns:
            predictions = predictions['Yield']

        return predictions

    def final_predict(self, rmse=None, iteration=None):
        # Retain the ID column from the original test data
        id_series = self.test_data['ID'].copy()

        # Separate features from possible target (if exists)
        self.X_test = self.test_data.drop(['ID', 'Yield'], axis=1, errors='ignore')

        # Identify missing columns and prepare a DataFrame with default values
        missing_cols = set(self.X_train.columns) - set(self.X_test.columns)
        missing_data = pd.DataFrame(0, index=self.X_test.index, columns=list(missing_cols))

        # Concatenate the missing columns DataFrame with the original X_test
        self.X_test = pd.concat([self.X_test, missing_data], axis=1)

        # Ensure the order of columns matches X_train
        self.X_test = self.X_test[self.X_train.columns]

        # Add the ID column back for the final prediction
        self.X_test['ID'] = id_series

        # Generate predictions
        predictions = self.models.predict(self.X_test, is_validation=False, rmse=rmse, iteration=iteration)

        return predictions

    def visualize_validation_results(self, val_predictions):

        # Assuming 'Yield' is the target column in your validation set
        val_targets = self.val_data['Yield']
        try:
            self.visualization.plot_model_performance(self.models.ensemble_model, val_targets, val_predictions)
        except Exception as e:
            print(f"Error visualizing Ensemble model on validation set: {e}")


    def visualize_results(self):

        # Create DataFrames or Series for the true OOF target values and predicted OOF values
        oof_targets = pd.Series(self.models.oof_targets, name='Target')

        # Create Series for the OOF predictions from each model
        oof_predictions_model1 = pd.Series(self.models.oof_predictions_model1, name='CatBoostRegressor')
        oof_predictions_model2 = pd.Series(self.models.oof_predictions_model2, name='XGBRegressor')
        oof_predictions_model3 = pd.Series(self.models.oof_predictions_model3, name='RandomForestRegressor')
        oof_predictions_ensemble = pd.Series(self.models.oof_predictions_ensemble, name='Ensemble')
        #oof_predictions_simple_ensemble = pd.Series(self.models.oof_predictions_simple_ensemble, name='Predicted')

        try:
            self.visualization.plot_model_performance(self.models.model1, oof_targets, oof_predictions_model1)
            self.visualization.plot_feature_importance(self.models.model1, self.models.rfe_columns, 'CatBoost')
        except Exception as e:
            print(f"An error occurred while visualizing CatBoost model: {e}")

        try:
            self.visualization.plot_model_performance(self.models.model2, oof_targets, oof_predictions_model2)
            self.visualization.plot_feature_importance(self.models.model2, self.models.rfe_columns, 'XGBoost')
        except Exception as e:
            print(f"An error occurred while visualizing XGBoost model: {e}")

        try:
            self.visualization.plot_model_performance(self.models.model3, oof_targets, oof_predictions_model3)
            self.visualization.plot_feature_importance(self.models.model3, self.models.rfe_columns, 'Random Forest')
        except Exception as e:
            print(f"An error occurred while visualizing Random Forest model: {e}")

        try:
            self.visualization.plot_model_performance(self.models.ensemble_model, oof_targets, oof_predictions_ensemble)
        except Exception as e:
            print(f"An error occurred while visualizing Ensemble model: {e}")

    def run(self):
        seed = 42
        self.load_data_and_preprocess(seed=seed)

        rmse_history = []
        current_rmse = float('inf')  # Set initial RMSE to infinity
        no_improvement_count = 0

        while self.iteration < self.max_iterations:
            self.train_models()

            # Evaluate models and calculate RMSE
            new_rmse, val_predictions = self.evaluate_models(current_rmse, self.iteration)
            self.visualize_validation_results(val_predictions)

            # Calculate improvement
            improvement = current_rmse - new_rmse
            print(f"Iteration {self.iteration}, RMSE: {new_rmse}, Improvement: {improvement}")

            rmse_history.append(new_rmse)

            # Early stopping check
            if improvement > 0:
                current_rmse = new_rmse
                no_improvement_count = 0
            else:
                no_improvement_count += 1
                if no_improvement_count >= 3:
                    print("Early stopping triggered")
                    break

            self.iteration += 1

            seed += 1
            self.final_predict(rmse=current_rmse, iteration=self.iteration)
            self.visualize_results()
            self.load_data_and_preprocess(seed=seed)

        self.final_predict(rmse=current_rmse, iteration=self.iteration)
        self.visualize_results()
        return rmse_history




# Usage: build the pipeline with a 15% validation hold-out and run the whole
# train / evaluate / predict loop. run() reads the CSV files configured on Main
# and returns the validation RMSE of every pass.
main = Main(test_size=0.15)  # test_size is the validation fraction; adjust as needed
rmse_history = main.run()
print("RMSE history:", rmse_history)



Random seed is : 42
Training data size: (3870, 44)
Test data size: (1290, 43)
Preprocessing training data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Training data preprocessed
Preprocessing test data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Test data preprocessed
Filling NaN values for numeric columns in training data...
Iteration 1/100, Average p-value: 0.6377397626411424
Iteration 2/100, Average p-value: 0.6481230445166587
Using split based on KS test. Skipped features due to NaNs: 0
Training data size post-split: (3276, 3990)
Validation data size post-split: (579, 3990)
No NaN values present after After replacing infinities.
No NaN values present after After final NaN fill.
Number of features in data: 3988
Number of features in target: (3276,)
Loading RFE model
Number of features selected by RFE: 75
Columns with 'object' data type: []
Training ensemble_model1 of type

Iteration 0, RMSE: 348.82319427595775, Improvement: inf
Submission file created: EnsembleSubmission_RMSE348.82_TotalYield643394.29_Iter1.csv
Model: CatBoostRegressor, RMSE: 175.6826857672704


Top 10 Features:
                                  Feature  Importance
70  days_from_earliest_to_croptillagedate   12.290298
46                    2appdaysurea * acre    5.706791
0                                  acre_x    5.459636
10           acre + firsttopdressfert_npk    3.675616
8               acre + cropbasalferts_ssp    3.635490
19        acre + orgfertilizers_jeevamrit    3.572603
13           acre + firsttopdressfert_ssp    3.534375
21        acre + orgfertilizers_pranamrit    3.453682
1                                  acre_y    3.263491
12         acre + firsttopdressfert_other    3.102963


Model: XGBRegressor, RMSE: 182.33535717111135


Top 10 Features:
                                              Feature  Importance
7                          acre + cropbasalferts_npks    0.418634
0                                              acre_x    0.227134
46                                2appdaysurea * acre    0.101967
8                           acre + cropbasalferts_ssp    0.036897
67  landpreparationmethod_bullockplough * transirr...    0.025748
33  harv_hand_rent + transdetfactor_laboravailability    0.019170
70              days_from_earliest_to_croptillagedate    0.012682
16         acre + landpreparationmethod_bullockplough    0.012466
60                         croptillagedepth * ganaura    0.010607
10                       acre + firsttopdressfert_npk    0.010601


Model: RandomForestRegressor, RMSE: 182.67543800392517


Top 10 Features:
                                  Feature  Importance
46                    2appdaysurea * acre    0.200663
21        acre + orgfertilizers_pranamrit    0.131431
13           acre + firsttopdressfert_ssp    0.109963
0                                  acre_x    0.086463
19        acre + orgfertilizers_jeevamrit    0.051127
1                                  acre_y    0.049767
70  days_from_earliest_to_croptillagedate    0.048306
10           acre + firsttopdressfert_npk    0.037531
12         acre + firsttopdressfert_other    0.031084
20    acre + orgfertilizers_poultrymanure    0.025477


Model: VotingRegressor, RMSE: 174.73292695854784


Random seed is : 43
Training data size: (3870, 44)
Test data size: (1290, 43)
Preprocessing training data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Training data preprocessed
Preprocessing test data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Test data preprocessed
Filling NaN values for numeric columns in training data...
Iteration 1/100, Average p-value: 0.6481230445166587
Using split based on KS test. Skipped features due to NaNs: 0
Training data size post-split: (3276, 3990)
Validation data size post-split: (579, 3990)
No NaN values present after After replacing infinities.
No NaN values present after After final NaN fill.
Number of features in data: 3988
Number of features in target: (3276,)
Loading RFE model
Number of features selected by RFE: 75
Columns with 'object' data type: []
Training ensemble_model1 of type <class 'catboost.core.CatBoostRegressor'> on fold 0,

Iteration 1, RMSE: 321.7645330646883, Improvement: 27.058661211269452
Submission file created: EnsembleSubmission_RMSE321.76_TotalYield650981.69_Iter2.csv
Model: CatBoostRegressor, RMSE: 191.82649807013092


Top 10 Features:
                                  Feature  Importance
70  days_from_earliest_to_croptillagedate   10.413981
46                    2appdaysurea * acre    7.147962
8               acre + cropbasalferts_ssp    5.553278
12         acre + firsttopdressfert_other    4.646586
19        acre + orgfertilizers_jeevamrit    3.480110
1                                  acre_y    3.205318
0                                  acre_x    3.027622
13           acre + firsttopdressfert_ssp    2.992676
10           acre + firsttopdressfert_npk    2.875381
48                        acre * basaldap    2.594758


Model: XGBRegressor, RMSE: 213.49909683947075


Top 10 Features:
                                              Feature  Importance
0                                              acre_x    0.355684
8                           acre + cropbasalferts_ssp    0.137736
46                                2appdaysurea * acre    0.126517
7                          acre + cropbasalferts_npks    0.047988
70              days_from_earliest_to_croptillagedate    0.029690
43                           1tdurea * harv_hand_rent    0.026474
30    cultland + nursdetfactor_irrigwateravailability    0.025773
65             firsttopdressfert_dap * harv_hand_rent    0.020540
33  harv_hand_rent + transdetfactor_laboravailability    0.018137
16         acre + landpreparationmethod_bullockplough    0.012637


Model: RandomForestRegressor, RMSE: 204.213464745918


Top 10 Features:
                                  Feature  Importance
46                    2appdaysurea * acre    0.225120
1                                  acre_y    0.117611
11          acre + firsttopdressfert_npks    0.090068
12         acre + firsttopdressfert_other    0.082445
6               acre + cropbasalferts_mop    0.077943
0                                  acre_x    0.069556
70  days_from_earliest_to_croptillagedate    0.049381
13           acre + firsttopdressfert_ssp    0.043450
7              acre + cropbasalferts_npks    0.021604
8               acre + cropbasalferts_ssp    0.019707


Model: VotingRegressor, RMSE: 198.46142784234974


Random seed is : 44
Training data size: (3870, 44)
Test data size: (1290, 43)
Preprocessing training data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Training data preprocessed
Preprocessing test data...
Starting basic imputation...
No NaN values present after basic imputation.
ID column status: True
Test data preprocessed
Filling NaN values for numeric columns in training data...
Iteration 1/100, Average p-value: 0.646177173272191


KeyboardInterrupt: ignored