# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Gather Data

In [2]:
# Load the training and test tables from CSV files located in the current
# working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Feature Engineering

In [3]:
# Add an interaction column -- the product of "fruitset" and "seeds" -- to
# both tables so they carry the same engineered feature.
for frame in (train, test):
    frame["fruit_seed"] = frame["fruitset"] * frame["seeds"]

# Feature Selection

In [4]:
# Pairwise absolute correlations between the numeric (and boolean) columns of
# the training table. Filtering by dtype first keeps `corr()` from raising on
# text columns, which pandas >= 2.0 no longer drops silently.
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so each pair of
# columns is inspected once; every other cell becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

threshold = 0.5

# A column is selected when its absolute correlation with at least one column
# positioned before it exceeds the threshold. 'yield' (the target) and 'id'
# are appended to the train/test tables separately and must never appear in
# the feature list, so they are filtered out here. Filtering is used instead
# of list.remove(), which raises ValueError when the name is not present.
non_feature_columns = {'yield', 'id'}
high_corr_features = [
    column
    for column in upper.columns
    if column not in non_feature_columns and (upper[column] > threshold).any()
]

In [5]:
# Narrow both tables to the selected features. The training table also keeps
# the 'yield' column; the test table also keeps 'id'.
train_columns = [*high_corr_features, 'yield']
test_columns = [*high_corr_features, 'id']
train = train.loc[:, train_columns]
test = test.loc[:, test_columns]

# Split Data

In [6]:
# Separate the predictors from the target. The double brackets keep `y` a
# single-column DataFrame rather than a Series.
target_column = 'yield'
X = train.drop(columns=[target_column])
y = train[[target_column]]

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Equal Weights

In [8]:
# Blend coefficients for the ensemble: index 0 scales the gradient-boosting
# predictions and index 1 scales the random-forest predictions.
weights = [0.5, 0.5]

# Train Model

In [9]:
# Seed shared by both estimators. Without it the random forest's bootstrap
# sampling (and tie-breaking in both models) changes on every run, so the
# reported MAE and the submission could not be reproduced. The value matches
# the random_state used for the train/test split.
model_seed = 0

# Hyperparameters for the gradient-boosting member of the ensemble.
gb_params = {
    'n_estimators': 200,
    'max_depth': 8,
    'learning_rate': 0.04,
    'min_samples_split': 10,
    'min_samples_leaf': 20,
    'random_state': model_seed,
}

# Hyperparameters for the random-forest member of the ensemble.
rf_params = {
    'n_estimators': 150,
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 20,
    'random_state': model_seed,
}

# Create the GradientBoostingRegressor and RandomForestRegressor models
gb_model = GradientBoostingRegressor(**gb_params)
rf_model = RandomForestRegressor(**rf_params)

# Fit both models on the training split. `y_train` is a single-column
# DataFrame, so it is flattened once to the 1-D array the estimators expect.
train_target = y_train.values.ravel()
gb_model.fit(X_train, train_target)
rf_model.fit(X_train, train_target)

# Make Predictions

In [10]:
# Score the held-out rows with each fitted model (gradient boosting first,
# random forest second).
gb_test_pred, rf_test_pred = (
    fitted_model.predict(X_test) for fitted_model in (gb_model, rf_model)
)

In [11]:
# Weighted blend of the two hold-out prediction vectors.
gb_part = weights[0] * np.asarray(gb_test_pred)
rf_part = weights[1] * np.asarray(rf_test_pred)
pred_ens = gb_part + rf_part

# Evaluate Model

In [12]:
# Mean absolute error of the blended predictions on the held-out rows,
# printed to three decimal places.
mae_ens = mean_absolute_error(y_true=y_test, y_pred=pred_ens)
print('MAE: %.3f' % (mae_ens,))

MAE: 356.694


# Create Submission

In [13]:
# Strip the 'id' column once, then run both fitted models on the same
# feature frame.
submission_features = test.drop(columns='id')
gb_predictions = gb_model.predict(submission_features)
rf_predictions = rf_model.predict(submission_features)

In [14]:
# Blend the two models' test-set predictions with the ensemble weights.
gb_share = weights[0] * np.asarray(gb_predictions)
rf_share = weights[1] * np.asarray(rf_predictions)
predictions = gb_share + rf_share

# Pair each test 'id' with its blended prediction and display the result.
submission = pd.DataFrame({'id': test['id'], 'yield': predictions})
submission

Unnamed: 0,id,yield
0,15289,4321.998141
1,15290,6165.328101
2,15291,7243.111029
3,15292,4720.972312
4,15293,4183.296327
...,...,...
10189,25478,5486.144387
10190,25479,5621.999114
10191,25480,6449.185457
10192,25481,4464.636785
