In [1]:
import warnings
warnings.filterwarnings('ignore')

import ast
from catboost import CatBoostRegressor
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMRegressor
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import optuna
import os

import pandas as pd
from pprint import pprint

import random

from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression, RFECV
from sklearn.inspection import permutation_importance
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from scipy import stats

import time
from tqdm.notebook import tqdm

from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)

experiment_name = 'baseline'

In [2]:
# Load the train and test datasets
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train.shape, df_test.shape

((15289, 18), (10194, 17))

In [3]:
binary_cols = []

cat_cols = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays']

num_cols = ['fruitset', 'fruitmass', 'seeds']

ordinal_cols = []

In [4]:
TARGET = 'yield'

In [None]:
df_train_new = df_train.copy()
df_test_new = df_test.copy()

# Pairwise numerical feature operations
for i in range(len(num_cols)):
    for j in range(i, len(num_cols)):
        if i != j:
            feat1 = num_cols[i]
            feat2 = num_cols[j]

            # Multiplication
            df_train_new[f"{feat1}_x_{feat2}"] = df_train[feat1] * df_train[feat2]
            df_test_new[f"{feat1}_x_{feat2}"] = df_test[feat1] * df_test[feat2]

            # Addition
            df_train_new[f"{feat1}_plus_{feat2}"] = df_train[feat1] + df_train[feat2]
            df_test_new[f"{feat1}_plus_{feat2}"] = df_test[feat1] + df_test[feat2]

In [None]:
# Separate features and target for train and validation data

X_train = df_train_new.drop(['id', TARGET], axis=1)
y_train = df_train_new[TARGET]

X_test = df_test_new.drop(['id'], axis=1)

k10 = KFold(n_splits=10, random_state=5, shuffle=True)

In [5]:
# Separate features and target for train and validation data

X_train = df_train.drop(['id', TARGET], axis=1)
y_train = df_train[TARGET]

X_test = df_test.drop(['id'], axis=1)

k10 = KFold(n_splits=3, random_state=5, shuffle=True)

In [6]:
X_train.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887
1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781
3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561
4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512


In [7]:
X_test.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.5,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.38886,29.558019


In [8]:
# Define pipelines
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=50))
])

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nystroem', Nystroem(n_components=500, random_state=5)),
    ('ridge', Ridge())
])

linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', LinearRegression()),
])

# Manually set pipeline names
knn_pipeline.name = 'KNN'
ridge_pipeline.name = 'Nystroem Ridge'
linear_pipeline.name = 'LR Pipeline'

In [9]:
models = [
    CatBoostRegressor(random_state=5, verbose=False, early_stopping_rounds=100),
    ExtraTreesRegressor(random_state=5),
    # GaussianNB(),
    HistGradientBoostingRegressor(random_state=5),
    knn_pipeline,
    linear_pipeline,
    LGBMRegressor(n_jobs=-1, random_state=5),
    LinearRegression(),
    RandomForestRegressor(random_state=5),
    XGBRegressor(random_state=5),
]

In [10]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train Accuracy', 
                                        'Model Test Accuracy', 
                                        'Model Test Accuracy Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train Accuracy': 0,
                'Model Test Accuracy': 0,
                'Model Test Accuracy Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='neg_mean_absolute_error', 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train Accuracy': -cv_results['train_score'].mean(),
            'Model Test Accuracy': -cv_results['test_score'].mean(),
            'Model Test Accuracy Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test Accuracy'], ascending=True, inplace=True)
    model_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return model_compare

In [None]:
baseline_features = {}

for model in models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features[model_name] = list(X_train.columns)

In [None]:
%%time

baseline_models = evaluate_models(models, X_train, y_train, baseline_features, k10, f'{experiment_name}')
baseline_models

In [None]:
with open('baseline_features.txt', mode='w') as f:
    pprint(baseline_features, stream=f)

# FEATURE SELECTION

### Mutual Information

In [None]:
# To ensure the same randomness everytime
np.random.seed(5)

X_mi = X_train.copy()

# Add random feature
X_mi['random_feature_continous'] = np.round(np.random.uniform(-2, 2, X_train.shape[0]), 6)
X_mi['random_feature_categorical'] = np.random.randint(1, 8, X_train.shape[0])
X_mi.head()

In [None]:
# Combine X and y for mutual information

X_and_y = pd.concat([X_mi, y_train], axis=1)

In [None]:
# Initialize parameters
random_states = [5, 42, 100, 500]
n_neighbors_list = [3, 5, 7, 10, 20]
results = defaultdict(list)

In [None]:
# Calculate MI for each combination of random_state and n_neighbors
for random_state in random_states:
    for n_neighbors in n_neighbors_list:        
        # Calculate MI
        mi = mutual_info_regression(X_and_y, y_train, n_neighbors=n_neighbors, random_state=random_state)
        
        # Store results if the target has the highest MI score
        mi_dict = dict(zip(X_and_y.columns, mi))
        if mi_dict[TARGET] == max(mi_dict.values()):
            for feature, score in mi_dict.items():
                results[feature].append(score)

        print(f'Done with Random State - {random_state} and N Neighbors - {n_neighbors}')

In [None]:
# Average MI scores across valid combinations
average_mi = {feature: np.mean(scores) for feature, scores in results.items() if scores}
average_mi

In [None]:
# Display results
sorted_mi = sorted(average_mi.items(), key=lambda x: x[1], reverse=True)
print("Average MI scores:")
for item in sorted_mi:
    print(item)

In [None]:
# Determine higher MI between 0 and random_feature
higher_threshold = max(0, average_mi.get('random_feature_categorical', 0), average_mi.get('random_feature_continous', 0))
higher_threshold

In [None]:
# List features with MI higher than the threshold, excluding the target
mi_features_list = [feature for feature, score in sorted_mi if feature != TARGET and score > higher_threshold]
mi_features_list

*mutual information features are the same as the original features*

*since the feature set is small we can jump to SFS directly*

### SFS

In [None]:
%%time

# Initialize empty dictionary for SFS features
sfs_features = {}

for model in models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    try:
        # features = feats_so_far[model_name]
        features = baseline_features[model_name]

        # incase there is no feature that had importance, go to the next model
        if len(features) == 0:
            continue
        
        X_sfs = X_train[features]

        print(f'Running backward feature selection with {model_name}')

        sfs = SFS(model,
            k_features='best',
            forward=False,
            floating=False,
            scoring='neg_mean_absolute_error',
            verbose=2,
            n_jobs=-1,
            cv=k10)
        
        sfs = sfs.fit(X_sfs, y_train)

        # Get the selected features index
        selected_sfs_idx = list(sfs.k_feature_idx_)

        # Get the feature names
        selected_sfs_feats = X_sfs.columns[selected_sfs_idx]

        selected_features = list(selected_sfs_feats)

        # # Reorder selected_features based on the predefined features_list
        # selected_features_ordered = [feat for feat in features_list if feat in selected_features]

        sfs_features[model_name] = selected_features

        print(f'Done with {model_name}', end='\n\n')

    except KeyError:
        print(f'{model_name} not in the dictionary.')

# Took 138 minutes

In [None]:
%%time

sfs_models = evaluate_models(models, X_train, y_train, sfs_features, k10, f'{experiment_name}_sfs')
sfs_models

In [None]:
with open('sfs_features.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [11]:
sfs_features = {'CatBoostRegressor': ['honeybee',
                       'MaxOfUpperTRange',
                       'MinOfUpperTRange',
                       'AverageOfUpperTRange',
                       'MinOfLowerTRange',
                       'AverageOfLowerTRange',
                       'AverageRainingDays',
                       'fruitset',
                       'fruitmass',
                       'seeds'],
 'ExtraTreesRegressor': ['clonesize',
                         'andrena',
                         'osmia',
                         'AverageOfLowerTRange',
                         'RainingDays',
                         'fruitset',
                         'fruitmass',
                         'seeds'],
 'HistGradientBoostingRegressor': ['clonesize',
                                   'honeybee',
                                   'MinOfUpperTRange',
                                   'RainingDays',
                                   'fruitset',
                                   'fruitmass',
                                   'seeds'],
 'KNN': ['MaxOfLowerTRange', 'fruitset', 'seeds'],
 'LGBMRegressor': ['clonesize',
                   'MaxOfLowerTRange',
                   'RainingDays',
                   'AverageRainingDays',
                   'fruitset',
                   'fruitmass',
                   'seeds'],
 'LR Pipeline': ['bumbles',
                 'andrena',
                 'osmia',
                 'AverageOfUpperTRange',
                 'MinOfLowerTRange',
                 'AverageRainingDays',
                 'fruitset',
                 'fruitmass',
                 'seeds'],
 'LinearRegression': ['bumbles',
                      'andrena',
                      'osmia',
                      'AverageOfUpperTRange',
                      'MinOfLowerTRange',
                      'AverageRainingDays',
                      'fruitset',
                      'fruitmass',
                      'seeds'],
 'RandomForestRegressor': ['clonesize',
                           'bumbles',
                           'andrena',
                           'osmia',
                           'MaxOfUpperTRange',
                           'MinOfUpperTRange',
                           'AverageOfUpperTRange',
                           'MinOfLowerTRange',
                           'AverageOfLowerTRange',
                           'RainingDays',
                           'AverageRainingDays',
                           'fruitset',
                           'fruitmass',
                           'seeds'],
 'XGBRegressor': ['AverageOfLowerTRange',
                  'AverageRainingDays',
                  'fruitset',
                  'seeds']}


## Best Single Model

In [None]:
model = HistGradientBoostingRegressor(random_state=5)

hist_feats = ['clonesize', 'honeybee', 'MinOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds']

model.fit(X_train, y_train)

In [None]:
pred = model.predict(X_test)

pred_df = pd.DataFrame(pred, columns=['yield'])
pred_df.head()

In [None]:
test_pred = pd.concat([X_test, pred_df], axis=1)
test_pred

### Manual Correction

In [None]:
# Get list of all feature columns (excluding TARGET)
feature_columns = df_train_new.columns[df_train_new.columns != TARGET]

# Initialize a dictionary to store corrections
corrections = {}

# Iterate over each feature column
for column in feature_columns:
    # Count occurrences of each value in the column
    value_counts = df_train_new[column].value_counts()

    # Filter values that appear at least twice
    values_to_check = value_counts[value_counts >= 2].index
     
    # Iterate over each value in the column that appears at least twice
    for value in values_to_check:
        # Get rows where column equals value
        rows_with_value = df_train_new[df_train_new[column] == value]

        # Check if all rows with this value in column have the same target
        if rows_with_value[TARGET].nunique() == 1:
            target_value = rows_with_value[TARGET].iloc[0]
        
            # Store correction if it matches training pattern
            if (column, value, target_value) not in corrections:
                corrections[(column, value)] = target_value

In [None]:
corrections

In [None]:
# {('fruitset', 0.249334678): 2605.69676, ('fruitset', 0.552101998): 7198.42285}

In [None]:
# Apply corrections to test_pred
for (column, value), target_value in corrections.items():
    test_pred.loc[test_pred[column] == value, TARGET] = target_value

# Print corrected predictions
print("Corrected predictions in test_pred:")
test_pred

In [None]:
test_pred[test_pred['fruitset_x_fruitmass'] == 0.1346910955928016]

In [None]:
submission = pd.read_csv('sample_submission.csv')['id']

submission_df = pd.concat([submission, test_pred[TARGET]], axis=1)
submission_df


In [None]:
submission_df.to_csv('hist_10cv_fe_manualcorrect_353.528914.csv', index=False)

## Stacking

In [12]:
meta_model = Ridge()

In [13]:
%%time

meta_scores = []

for i, (train_idx, meta_idx) in enumerate(k10.split(X_train)):
    print(f'Fold {i + 1}')
    X_train_meta, X_test_meta = X_train.iloc[train_idx], X_train.iloc[meta_idx]
    y_train_meta, y_test_meta = y_train.iloc[train_idx], y_train.iloc[meta_idx]

    print(X_train_meta.shape, X_test_meta.shape, y_train_meta.shape, y_test_meta.shape)
    
    meta_features_fold = np.zeros((X_test_meta.shape[0], len(models)))
    # meta_test_features = np.zeros((y.shape[0], len(models)))
    # meta_targets = np.zeros(y.shape[0])

    for i, model in enumerate(models):
        model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
        print(f'Starting {model_name}')
        model_features = sfs_features[model_name]
        # model_features = baseline_features[model_name]

        # Fit model on the selected features
        model.fit(X_train_meta[model_features], y_train_meta)
        preds = model.predict(X_test_meta[model_features])
        meta_features_fold[:, i] = preds

    # Train the meta-model on the predictions from the base models
    meta_model.fit(meta_features_fold, y_test_meta)
    
    # Predict using the meta-model
    final_preds = meta_model.predict(meta_features_fold)
    
    # Calculate r2_score for the current fold
    current_fold_mae = mean_absolute_error(y_test_meta, final_preds)
    meta_scores.append(current_fold_mae)

Fold 1
(10192, 16) (5097, 16) (10192,) (5097,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting KNN
Starting LR Pipeline
Starting LGBMRegressor
Starting LinearRegression
Starting RandomForestRegressor
Starting XGBRegressor
Fold 2
(10193, 16) (5096, 16) (10193,) (5096,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting KNN
Starting LR Pipeline
Starting LGBMRegressor
Starting LinearRegression
Starting RandomForestRegressor
Starting XGBRegressor
Fold 3
(10193, 16) (5096, 16) (10193,) (5096,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting KNN
Starting LR Pipeline
Starting LGBMRegressor
Starting LinearRegression
Starting RandomForestRegressor
Starting XGBRegressor
CPU times: total: 46.8 s
Wall time: 43.8 s


In [14]:
# Calculate the average MAE across all folds
average_mae = np.mean(meta_scores)
average_mae

# 350.36268333368736 Baseline Features
# 348.51521661303786 SFS Features

348.51521661303786

In [15]:
%%time

# Retrain base models on all data
all_base_model_predictions = []

for model in models:
    model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
    print(f'Starting {model_name}')
    model_features = sfs_features[model_name]
    # model_features = baseline_features[model_name]

    model.fit(X_train[model_features], y_train)
    preds = model.predict(X_test[model_features])

    all_base_model_predictions.append(preds.reshape(-1, 1))

# Stack predictions for the meta model
X_new_meta = np.hstack(all_base_model_predictions)

# Use the meta model to make final predictions
final_predictions = meta_model.predict(X_new_meta)

Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting KNN
Starting LR Pipeline
Starting LGBMRegressor
Starting LinearRegression
Starting RandomForestRegressor
Starting XGBRegressor
CPU times: total: 20 s
Wall time: 19.2 s


In [16]:
final_predictions = pd.DataFrame(final_predictions, columns=[TARGET])
final_predictions = pd.concat([X_test, final_predictions], axis=1)
final_predictions

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569,4269.351324
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.10,0.488048,0.442866,36.846956,6137.848806
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644,7163.532022
3,25.0,0.50,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091,4716.687639
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.388860,29.558019,3903.269167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10189,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.474162,0.437923,34.525258,5488.039186
10190,25.0,0.50,0.25,0.50,0.75,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.482854,0.440676,35.648221,5666.932055
10191,25.0,0.50,0.38,0.50,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.568854,0.463065,37.724724,6513.000072
10192,12.5,0.25,0.25,0.38,0.50,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.407374,0.409261,31.881847,4395.023571


In [17]:
model_names = []
for model in models:
    model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
    model_names.append(model_name)
model_names

print('Ensemble weights')
weights = pd.Series(meta_model.coef_, index=model_names)
print(weights)
print(f'Weights total: {weights.sum()}')
print(f'Intercept: {meta_model.intercept_}', end='\n\n')
print(f"Average Stacking MAE across all folds: {average_mae:.5f}")

Ensemble weights
CatBoostRegressor                0.246222
ExtraTreesRegressor             -0.028942
HistGradientBoostingRegressor    0.018459
KNN                              0.218660
LR Pipeline                      0.086889
LGBMRegressor                    0.222833
LinearRegression                 0.086888
RandomForestRegressor            0.141419
XGBRegressor                     0.025001
dtype: float64
Weights total: 1.0174288926900776
Intercept: -114.7378893499399

Average Stacking MAE across all folds: 348.51522


In [18]:
# Get list of all feature columns (excluding TARGET)
feature_columns = df_train.columns[df_train.columns != TARGET]

# Initialize a dictionary to store corrections
corrections = {}

# Iterate over each feature column
for column in feature_columns:
    # Count occurrences of each value in the column
    value_counts = df_train[column].value_counts()

    # Filter values that appear at least twice
    values_to_check = value_counts[value_counts >= 2].index
     
    # Iterate over each value in the column that appears at least twice
    for value in values_to_check:
        # Get rows where column equals value
        rows_with_value = df_train[df_train[column] == value]

        # Check if all rows with this value in column have the same target
        if rows_with_value[TARGET].nunique() == 1:
            target_value = rows_with_value[TARGET].iloc[0]
        
            # Store correction if it matches training pattern
            if (column, value, target_value) not in corrections:
                corrections[(column, value)] = target_value

In [19]:
corrections

{('fruitset', 0.249334678): 2605.69676, ('fruitset', 0.552101998): 7198.42285}

In [20]:
# Apply corrections to final_predictions
for (column, value), target_value in corrections.items():
    final_predictions.loc[final_predictions[column] == value, TARGET] = target_value

# Print corrected predictions
print("Corrected predictions in final_predictions:")
final_predictions

Corrected predictions in final_predictions:


Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569,4269.351324
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.10,0.488048,0.442866,36.846956,6137.848806
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644,7163.532022
3,25.0,0.50,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091,4716.687639
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.388860,29.558019,3903.269167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10189,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.474162,0.437923,34.525258,5488.039186
10190,25.0,0.50,0.25,0.50,0.75,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.482854,0.440676,35.648221,5666.932055
10191,25.0,0.50,0.38,0.50,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.568854,0.463065,37.724724,6513.000072
10192,12.5,0.25,0.25,0.38,0.50,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.407374,0.409261,31.881847,4395.023571


In [21]:
final_predictions_df = pd.DataFrame(final_predictions[TARGET], columns=[TARGET])
final_predictions_df.head()

Unnamed: 0,yield
0,4269.351324
1,6137.848806
2,7163.532022
3,4716.687639
4,3903.269167


In [22]:
submission = pd.read_csv('sample_submission.csv')['id']

submission_stack_df = pd.concat([submission, final_predictions_df[TARGET]], axis=1)
submission_stack_df

Unnamed: 0,id,yield
0,15289,4269.351324
1,15290,6137.848806
2,15291,7163.532022
3,15292,4716.687639
4,15293,3903.269167
...,...,...
10189,25478,5488.039186
10190,25479,5666.932055
10191,25480,6513.000072
10192,25481,4395.023571


In [23]:
submission_stack_df.to_csv(f'stack_3cv_sfs_manualcorrect_{average_mae}.csv', index=False)