In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [3]:
train = pd.read_csv(r'C:\Users\USER\OneDrive\Desktop\machine_learning\Week2\train.csv')
test = pd.read_csv(r'C:\Users\USER\OneDrive\Desktop\machine_learning\Week2\test.csv')

In [4]:
train.head()

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,25.0,0.5,0.25,0.75,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146
1,1,25.0,0.5,0.25,0.5,0.5,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201
2,2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.7776
3,3,12.5,0.25,0.25,0.63,0.5,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.7759
4,4,25.0,0.5,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417


In [6]:
test.head()

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,15289,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,15290,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.488048,0.442866,36.846956
2,15291,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,15292,25.0,0.5,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,15293,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.38886,29.558019


In [7]:
train.isnull().sum()

id                      0
clonesize               0
honeybee                0
bumbles                 0
andrena                 0
osmia                   0
MaxOfUpperTRange        0
MinOfUpperTRange        0
AverageOfUpperTRange    0
MaxOfLowerTRange        0
MinOfLowerTRange        0
AverageOfLowerTRange    0
RainingDays             0
AverageRainingDays      0
fruitset                0
fruitmass               0
seeds                   0
yield                   0
dtype: int64

In [8]:
non_numeric = train.select_dtypes(include=['object']).columns
non_numeric

Index([], dtype='object')

In [9]:
X = train.drop(columns=['id', 'yield'])
y = train['yield']

In [10]:
X_test = test.drop(columns=['id'])

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

In [13]:
lr = LinearRegression()
lr_scores = -cross_val_score(lr, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
lr_scores.mean()

np.float64(372.70994440595047)

In [14]:
ridge = Ridge(alpha=1.0)
ridge_scores = -cross_val_score(ridge, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
ridge_scores.mean()

np.float64(372.0909033440332)

In [15]:
lasso = Lasso(alpha=1.0, max_iter=10000)
lasso_scores = -cross_val_score(lasso, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
lasso_scores.mean()

np.float64(371.94906881047893)

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

poly_model = make_pipeline(PolynomialFeatures(2), LinearRegression())
poly_scores = -cross_val_score(poly_model, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')

poly_scores.mean()

np.float64(383.4521327606315)

I selected Ridge Regression as the final model for predicting blueberry yield because it provided a strong balance between bias and variance. While Lasso Regression gave slightly better performance on cross-validation, I preferred Ridge for its ability to regularize without discarding potentially important predictors. Cross-validation confirmed that Ridge offered competitive predictive performance with stable results across folds.

In [17]:
final_model = Ridge(alpha=1.0)
final_model.fit(X_scaled, y)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [18]:
test_preds = final_model.predict(X_test_scaled)

In [19]:
submission = pd.DataFrame({
    'id': test['id'],
    'yield': test_preds
})
submission.to_csv("submission.csv", index=False)