In [13]:
# Data manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Replace path
df = pd.read_csv('C:/Users/hafee/Downloads/crop_yield.csv')

# Overview
# print(df.head())
# print(df.info())
# print(df.describe())

# Check missing values
# print(df.isnull().sum())



In [15]:
# Target
y = df['Yield_tons_per_hectare']

# Features
X = df.drop(columns=['Yield_tons_per_hectare'])

In [16]:
# Categorical & Numerical Columns
categorical_features = ['Region', 'Soil_Type', 'Crop', 'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition']
numerical_features = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']

In [18]:
# Preprocessing
# Numerical features: scale
numeric_transformer = StandardScaler()

# Categorical features: one-hot encode
categorical_transformer = OneHotEncoder(drop='first', sparse=False)

# Combine both using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Build model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    ))
])

#Train model
model.fit(X_train, y_train)


In [None]:
# Evaluate and PLot

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.3)
plt.xlabel('Actual Yield (tons/ha)')
plt.ylabel('Predicted Yield (tons/ha)')
plt.title('Actual vs Predicted Crop Yield')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.show()