Dataset Loading, Preprocessing and Splitting

In [None]:
# preprocessing.py 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

# Read the raw crop-yield table from disk.
df = pd.read_csv('crop_yield.csv')

# Separate the regression target from the predictor columns.
y = df['Yield_tons_per_hectare']
X = df.drop(columns='Yield_tons_per_hectare')

# Columns that are one-hot encoded; every other column is passed through
# to the model unchanged (see remainder='passthrough' below).
categorical_features = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']

# Build the column transformer. It is NOT fitted here: it is only
# constructed and stored, so no statistics from the data are learned yet.
# handle_unknown='ignore' makes categories unseen at fit time encode as
# all-zero rows instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist the raw (untransformed) training split together with the
# unfitted preprocessor in a single pickle.
joblib.dump((X_train, y_train, preprocessor), 'train_data.pkl')

# Persist the held-out split separately for the evaluation step.
joblib.dump((X_test, y_test), 'test_data.pkl')

print("✅ Data preprocessed and saved successfully!")


✅ Data preprocessed and saved successfully!


Model Training and Saving

In [None]:
# train_model.py 
from xgboost import XGBRegressor
import joblib
from sklearn.pipeline import Pipeline

# Restore the training features, training target and the preprocessor
# object that were pickled together in 'train_data.pkl'.
X_train, y_train, preprocessor = joblib.load('train_data.pkl')

# Gradient-boosted regressor; random_state is fixed so repeated runs
# train the same model.
regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Chain preprocessing and regression so that fitting and predicting both
# go through the same column transformation.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ]
)

# Fit the whole pipeline (preprocessor first, then the regressor).
model.fit(X_train, y_train)

# Persist the fitted pipeline for later evaluation.
joblib.dump(model, 'crop_yield_model.pkl')
print("✅ Model trained and saved successfully!")


✅ Model trained and saved successfully!


Model Evaluation

In [None]:
# evaluate.py 
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the held-out features and target written by the preprocessing step.
X_test, y_test = joblib.load('test_data.pkl')

# Load the fitted pipeline (preprocessor + regressor).
model = joblib.load('crop_yield_model.pkl')

# Predict on the raw test features; the pipeline applies its own
# preprocessing before the regressor.
y_pred = model.predict(X_test)

# Compute MSE once and derive RMSE from it. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly keeps this working on every version.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Report the regression metrics, rounded to two decimals.
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")


MAE: 0.40
MSE: 0.25
RMSE: 0.50
R²: 0.91


