# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Gather Data

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

# Feature Engineering

In [3]:
# Interaction feature: element-wise product of the `fruitset` and `seeds` columns,
# added identically to both frames so train and test share the same schema.
train["fruit_seed"] = train["fruitset"].mul(train["seeds"])
test["fruit_seed"] = test["fruitset"].mul(test["seeds"])

# Feature Selection

In [4]:
# Absolute pairwise correlations between every pair of columns in `train`.
corr_matrix = train.corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair appears
# once; everything else becomes NaN and therefore never passes the threshold test.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
threshold = 0.5
# For each column: does it correlate above `threshold` with at least one column that
# precedes it in the frame?
# NOTE(review): this keeps columns correlated with *any* earlier column, not
# specifically with the target `yield` — confirm that is the intended selection rule.
exceeds_threshold = (upper > threshold).any()
# Filter the non-feature columns out instead of calling list.remove('yield'), which
# raises ValueError when 'yield' did not pass the threshold. 'id' is excluded as well
# because later cells treat it as a row identifier (it is re-appended to `test` and
# dropped before predicting), so selecting it here would duplicate the column.
high_corr_features = [
    column
    for column in upper.columns
    if exceeds_threshold[column] and column not in ('yield', 'id')
]

In [5]:
# Narrow both frames to the selected features: `train` keeps the target column,
# `test` keeps the row identifier needed for the submission.
# Note: this rebinds `train`/`test`, so re-running the feature-selection cell after
# this one will operate on the already-reduced frame.
train = train.loc[:, [*high_corr_features, 'yield']]
test = test.loc[:, [*high_corr_features, 'id']]

# Performance-based Weights

In [6]:
folds = 10
# One seed for the splitter and both estimators so that Restart & Run All reproduces
# the same scores, weights and predictions.
seed = 42

# Per-fold held-out MAE and per-fold predictions on the competition test set,
# collected separately for gradient boosting, random forest and their blend.
gb_cv_scores, gb_preds = [], []
rf_cv_scores, rf_preds = [], []
ens_cv_scores, ens_preds = [], []

kfold = RepeatedKFold(n_splits = folds, n_repeats = 1, random_state = seed)

# Loop-invariant inputs, built once instead of on every fold.
features = train.drop(columns=['yield'])
target = train['yield']
test_features = test.drop(columns = 'id')

for train_subset, test_subset in kfold.split(features, target):

    X_train, X_test = features.iloc[train_subset], features.iloc[test_subset]
    y_train, y_test = target.iloc[train_subset], target.iloc[test_subset]

    # fit gb model
    gb_model = GradientBoostingRegressor(loss = 'absolute_error',
        n_estimators = 200,
        max_depth = 8,
        learning_rate = 0.04,
        min_samples_split = 10,
        min_samples_leaf = 20,
        random_state = seed).fit(X_train, y_train)

    gb_test_pred = gb_model.predict(X_test)
    gb_predictions = gb_model.predict(test_features)

    gb_score = mean_absolute_error(y_test, gb_test_pred)
    gb_cv_scores.append(gb_score)
    gb_preds.append(gb_predictions)

    # fit rf model
    rf_model = RandomForestRegressor(criterion = 'absolute_error',
        n_estimators = 150,
        max_depth = 10,
        min_samples_split = 10,
        min_samples_leaf = 20,
        n_jobs = -1,
        random_state = seed).fit(X_train, y_train)

    rf_test_pred = rf_model.predict(X_test)
    rf_predictions = rf_model.predict(test_features)

    rf_score = mean_absolute_error(y_test, rf_test_pred)
    rf_cv_scores.append(rf_score)
    rf_preds.append(rf_predictions)

    # weights: inverse of each model's held-out MAE, normalized to sum to 1,
    # so the model with the lower error on this fold gets the larger share.
    # NOTE(review): the weights come from the same held-out fold that is then used to
    # score the blend, so `ens_score` is an optimistic estimate — confirm this is acceptable.
    weights = 1 / np.array([gb_score, rf_score])
    total_weights = np.sum(weights)
    weights = weights / total_weights

    # ensemble model: weighted blend of the two models' predictions
    ens_test_pred = weights[0] * gb_test_pred + weights[1] * rf_test_pred
    ens_predictions = weights[0] * gb_predictions + weights[1] * rf_predictions

    ens_score = mean_absolute_error(y_test, ens_test_pred)
    ens_cv_scores.append(ens_score)
    ens_preds.append(ens_predictions)


In [7]:
# Inverse-MAE weighting across folds: a fold whose blended model scored a lower
# held-out error receives a larger weight; the weights are normalized to sum to 1.
inverse_fold_scores = 1 / np.asarray(ens_cv_scores)
total_weights_ens = inverse_fold_scores.sum()
weights_ens = inverse_fold_scores / total_weights_ens

In [8]:
# Display the normalized per-fold ensemble weights (one entry per CV fold).
weights_ens

array([0.10214634, 0.09943672, 0.09422462, 0.0974444 , 0.10564696,
       0.09936114, 0.09947442, 0.1033716 , 0.09801457, 0.10087923])

# Evaluate Model

In [9]:
# Average the blended model's held-out MAE over all folds and report it to 3 decimals.
ens_cv_score = np.asarray(ens_cv_scores).mean()
print('MAE: %.3f' % ens_cv_score)

MAE: 344.251


# Create Submission

In [10]:
# Final prediction: weighted sum of the per-fold blended test predictions, with fold k
# weighted by weights_ens[k]. Terms are accumulated in fold order starting from 0.
predictions = sum(
    weights_ens[k] * np.array(ens_preds[k])
    for k in range(folds)
)

# Pair each test row id with its predicted yield; the trailing expression renders the frame.
submission = pd.DataFrame({'id': test['id'], 'yield': predictions})
submission

Unnamed: 0,id,yield
0,15289,4279.637471
1,15290,5876.072655
2,15291,7248.603191
3,15292,4745.923754
4,15293,3813.539108
...,...,...
10189,25478,5427.351321
10190,25479,5632.629527
10191,25480,6490.909502
10192,25481,4441.719074
