# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Gather Data

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Feature Engineering

In [3]:
# Add the same interaction feature to both splits: the element-wise product
# of the `fruitset` and `seeds` columns.
for frame in (train, test):
    frame["fruit_seed"] = frame["fruitset"].mul(frame["seeds"])

# Feature Selection

In [4]:
# Absolute pairwise correlations between the numeric columns of `train`.
# Restricting to numeric/bool dtypes first keeps `.corr()` from failing on a
# text column (pandas >= 2.0 no longer drops such columns silently).
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal), so each
# column is compared solely with the columns that precede it; every other
# cell becomes NaN and never passes the threshold test below.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
threshold = 0.5
# A column is kept when it correlates above `threshold` with at least one
# earlier column.
# NOTE(review): this measures correlation with *any* preceding column, not
# specifically with 'yield' -- confirm this is the intended selection rule.
# The target and the row identifier are filtered out here instead of calling
# `list.remove('yield')`, which raises ValueError when the name is absent and
# would leave 'id' in the feature list if it ever passed the threshold.
non_features = {'yield', 'id'}
high_corr_features = [
    column
    for column in upper.columns
    if column not in non_features and (upper[column] > threshold).any()
]

In [5]:
# Narrow both frames to the selected features; the training frame keeps the
# target column and the test frame keeps the row identifier.
train = train.loc[:, [*high_corr_features, 'yield']]
test = test.loc[:, [*high_corr_features, 'id']]

# Split Data

In [6]:
# Target as a single-column DataFrame; everything else is the feature matrix.
y = train.loc[:, ['yield']]
X = train.drop('yield', axis=1)

In [7]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train Model

In [8]:
# Regressor built with the library's default hyperparameters and fitted on
# the training split only.
xgb_model = XGBRegressor()
xgb_model.fit(X=X_train, y=y_train)

# Evaluate Model

In [9]:
# 10-fold cross-validated MAE on the training split. The scorer returns the
# negated error, so the leading minus turns it back into a positive MAE.
mae = -cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='neg_mean_absolute_error')
print('Mean MAE: %.3f' % mae.mean())

# Also score the fitted model on the hold-out split produced by
# train_test_split; those rows were not part of the data the model was
# fitted on, nor of the cross-validation above.
holdout_mae = mean_absolute_error(y_test, xgb_model.predict(X_test))
print('Hold-out MAE: %.3f' % holdout_mae)

Mean MAE: 373.724


# Create Submission

In [10]:
# Predict on the test features (everything except the identifier), then pair
# each prediction with its original id.
test_features = test.drop('id', axis=1)
predictions = xgb_model.predict(test_features)
submission = pd.DataFrame({'id': test.loc[:, 'id'], 'yield': predictions})
submission

Unnamed: 0,id,yield
0,15289,4283.971680
1,15290,6262.047363
2,15291,7119.529785
3,15292,4632.464355
4,15293,4019.929688
...,...,...
10189,25478,5450.117676
10190,25479,5597.532715
10191,25480,6472.550293
10192,25481,4471.413086
