# Crop Yield Prediction - ML Pipeline Notebook

This notebook integrates data preprocessing, model training, and evaluation for crop yield prediction.

**Objective**: Predict crop yield and recommend the most profitable crops based on environmental factors.

## 1. Environment Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

## 2. Data Loading & Preprocessing

In [None]:
def load_and_merge_data(base_path='/home/lithium/P12/Crop Yield Prediction Dataset'):
    """
    Build the modelling table from the four raw CSV files under ``base_path``.

    Reads ``yield.csv``, ``rainfall.csv``, ``temp.csv`` and ``pesticides.csv``,
    harmonises their column names, inner-joins them on ``Area`` and ``Year``
    and drops every row that still contains a missing value.

    Returns a DataFrame with the columns ``Area``, ``Item``, ``Year``,
    ``Yield_hg_ha``, ``avg_rainfall_mm``, ``Pesticides_tonnes`` and
    ``avg_temp_c`` (plus any extra columns present in the rainfall or
    temperature files). Progress messages and a preview are printed.
    """
    join_keys = ['Area', 'Year']

    # Read every source file up front (same order: yield, rainfall, temp, pesticides).
    print("Loading datasets...")
    raw = {
        name: pd.read_csv(os.path.join(base_path, f"{name}.csv"))
        for name in ("yield", "rainfall", "temp", "pesticides")
    }

    # Yield: keep the identifying columns and give the value a descriptive name.
    crop_yield = raw["yield"][['Area', 'Item', 'Year', 'Value']].rename(
        columns={'Value': 'Yield_hg_ha'}
    )

    # Rainfall: headers may carry stray whitespace; non-numeric readings become NaN
    # so that they are removed by the final dropna().
    rainfall = raw["rainfall"].rename(columns=str.strip).rename(
        columns={'average_rain_fall_mm_per_year': 'avg_rainfall_mm'}
    )
    rainfall['avg_rainfall_mm'] = pd.to_numeric(rainfall['avg_rainfall_mm'], errors='coerce')

    # Temperature: align the key column names with the other tables.
    temperature = raw["temp"].rename(
        columns={'year': 'Year', 'country': 'Area', 'avg_temp': 'avg_temp_c'}
    )

    # Pesticides: keep keys plus the amount.
    pesticides = raw["pesticides"][['Area', 'Year', 'Value']].rename(
        columns={'Value': 'Pesticides_tonnes'}
    )

    # Inner joins: only (Area, Year) pairs present in every table survive.
    print("Merging datasets...")
    merged = crop_yield
    for other in (rainfall, pesticides, temperature):
        merged = merged.merge(other, on=join_keys, how='inner')

    # Remove incomplete rows and report how many were lost.
    rows_before = len(merged)
    merged = merged.dropna()
    print(f"Dropped {rows_before - len(merged)} rows with missing values.")

    print(f"Final shape: {merged.shape}")
    print("\nFirst few rows:")
    print(merged.head())

    return merged

# Load the merged dataset into the notebook-wide `df`.
# The call relies on the function's default base_path, a hard-coded absolute
# directory under /home/lithium; pass your own dataset directory if it differs.
df = load_and_merge_data()

In [None]:
# Display dataset info: schema, summary statistics and category counts.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print("\nDataset Statistics:")
print(df.describe())
print(f"\nUnique crops: {df['Item'].nunique()}")
print(f"Unique areas: {df['Area'].nunique()}")

## 3. Data Preparation for Modeling

In [None]:
# Prepare features and target.
# The target is the yield column; 'Year' is dropped as well, so the models only
# see Area, Item and the remaining columns of df.
X = df.drop(columns=['Yield_hg_ha', 'Year'])
y = df['Yield_hg_ha']

# Train-test split: 80% train / 20% test, shuffled with a fixed seed so the
# split is reproducible across runs.
# NOTE(review): this is a purely random row split; rows for the same Area/Item
# from different years can land on both sides — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Quick sanity check of the split sizes and the feature columns in use.
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"\nFeatures: {X_train.columns.tolist()}")

In [None]:
# Define categorical and numerical features.
# Columns of X that appear in neither list are dropped by the ColumnTransformer
# below (its default remainder policy is 'drop').
categorical_features = ['Area', 'Item']
numerical_features = ['avg_rainfall_mm', 'avg_temp_c', 'Pesticides_tonnes']

# Create preprocessor: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes categories that were not seen
# during fitting encode as all zeros at prediction time instead of raising.
# This unfitted instance is reused as the first step of both model pipelines.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

print("Preprocessor created successfully!")

## 4. Custom Metrics

In [None]:
def calculate_profitability(y_true, y_pred, pesticides):
    """
    Custom metric: mean absolute error of a simple profit proxy.

    Profit is modelled as ``(Yield * 200) - (Pesticides * 10)`` using an
    assumed price of 200 per unit yield and an assumed cost of 10 per unit
    pesticide. The same ``pesticides`` values are used for the actual and the
    predicted profit, so the cost term cancels and the result is (up to
    floating-point rounding) ``200 * mean(|y_true - y_pred|)``.

    Parameters
    ----------
    y_true, y_pred, pesticides : array-like
        Observed yield, predicted yield and pesticide usage. They are combined
        element by element in positional order; lists, NumPy arrays and pandas
        Series are all accepted.

    Returns
    -------
    float
        Mean absolute difference between actual and predicted profit.
    """
    price = 200
    cost = 10

    # Convert to plain float arrays first: pandas Series would otherwise be
    # aligned on their index labels (yielding NaN when the indexes differ, e.g.
    # after a shuffled train/test split) and plain lists do not support
    # element-wise arithmetic at all.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    pesticides = np.asarray(pesticides, dtype=float)

    actual_profit = (y_true * price) - (pesticides * cost)
    pred_profit = (y_pred * price) - (pesticides * cost)
    return np.mean(np.abs(actual_profit - pred_profit))

print("Profitability metric defined!")

## 5. Model Training - Ridge Regression (Baseline)

In [None]:
# Baseline model: Ridge regression on top of the shared preprocessing step.
print("Training Ridge Regression (Baseline)...")

pipeline_ridge = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

# Tune only the regularisation strength. The 'regressor__' prefix routes the
# parameter to the pipeline step of that name. Candidates are ranked by
# 3-fold cross-validated negative MSE (negated so that higher is better).
param_grid_ridge = {'regressor__alpha': [0.1, 1.0, 10.0]}
grid_ridge = GridSearchCV(pipeline_ridge, param_grid_ridge, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_ridge.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refitted on the whole training set
# (GridSearchCV's default refit behaviour).
best_ridge = grid_ridge.best_estimator_
print(f"Best Ridge parameters: {grid_ridge.best_params_}")

# Evaluate on the held-out test set.
y_pred_ridge = best_ridge.predict(X_test)

# Calculate metrics
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
# Pesticides are passed as a bare NumPy array (.values), i.e. without the
# shuffled index that X_test carries.
profit_error_ridge = calculate_profitability(y_test, y_pred_ridge, X_test['Pesticides_tonnes'].values)

print(f"\nRidge Regression Metrics:")
print(f"  RMSE: {rmse_ridge:.4f}")
print(f"  MAE: {mae_ridge:.4f}")
print(f"  R² Score: {r2_ridge:.4f}")
print(f"  Profitability Error: {profit_error_ridge:.2f}")

## 6. Model Training - Random Forest (Challenger)

In [None]:
# Challenger model: Random Forest on top of the same preprocessing step.
print("Training Random Forest (Challenger)...")

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed so repeated runs build the same forest.
    ('regressor', RandomForestRegressor(random_state=42))
])

# Small 2 x 2 grid over forest size and tree depth.
param_grid_rf = {
    'regressor__n_estimators': [10, 20],
    'regressor__max_depth': [5, 10]
}

# Candidates are ranked by 2-fold cross-validated negative MSE; note the fold
# count differs from the 3 folds used for the Ridge baseline.
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=2, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refitted on the whole training set
# (GridSearchCV's default refit behaviour).
best_rf = grid_rf.best_estimator_
print(f"Best Random Forest parameters: {grid_rf.best_params_}")

# Evaluate on the held-out test set.
y_pred_rf = best_rf.predict(X_test)

# Calculate metrics
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
# Pesticides are passed as a bare NumPy array (.values), i.e. without the
# shuffled index that X_test carries.
profit_error_rf = calculate_profitability(y_test, y_pred_rf, X_test['Pesticides_tonnes'].values)

print(f"\nRandom Forest Metrics:")
print(f"  RMSE: {rmse_rf:.4f}")
print(f"  MAE: {mae_rf:.4f}")
print(f"  R² Score: {r2_rf:.4f}")
print(f"  Profitability Error: {profit_error_rf:.2f}")

## 7. Model Comparison

In [None]:
# Side-by-side table of the test-set metrics, one row per model
comparison = pd.DataFrame(
    [
        ['Ridge Regression', rmse_ridge, mae_ridge, r2_ridge, profit_error_ridge],
        ['Random Forest', rmse_rf, mae_rf, r2_rf, profit_error_rf],
    ],
    columns=['Model', 'RMSE', 'MAE', 'R² Score', 'Profitability Error'],
)

print("\n=== Model Comparison ===", comparison.to_string(index=False), sep="\n")

# Keep whichever model has the higher test-set R²; Ridge wins ties
if r2_rf > r2_ridge:
    best_model, best_model_name = best_rf, 'Random Forest'
else:
    best_model, best_model_name = best_ridge, 'Ridge Regression'
print(f"\n✓ Best Model Selected: {best_model_name}")

## 8. Save Best Model

In [None]:
# Destination of the serialized pipeline
model_path = '/home/lithium/P12/models/best_model.pkl'

# Make sure the containing directory exists before writing into it
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the selected pipeline (preprocessing + regressor) with joblib
joblib.dump(best_model, model_path)
print(f"Best model saved to {model_path}")

## 9. Prediction Example

In [None]:
# Example: score one hand-written scenario with the selected model
example_input = pd.DataFrame([{
    'Area': 'India',
    'Item': 'Maize',
    'avg_rainfall_mm': 1200.0,
    'avg_temp_c': 25.0,
    'Pesticides_tonnes': 100.0,
}])

# predict() returns one value per input row; take the single result
predicted_yield = best_model.predict(example_input)[0]
print("\n".join([
    "Example Prediction:",
    "  Area: India",
    "  Crop: Maize",
    "  Rainfall: 1200 mm",
    "  Temperature: 25°C",
    "  Pesticides: 100 tonnes",
    f"  Predicted Yield: {predicted_yield:.2f} hg/ha",
]))

## 10. Crop Recommendations Function

In [None]:
def recommend_crops(area, rainfall, temp, pesticides, model, data, top_n=3):
    """
    Recommend top N most profitable crops for given environmental conditions.

    Every distinct crop in ``data['Item']`` is scored by asking ``model`` for
    its predicted yield under the given conditions and converting that into a
    profit proxy: ``predicted_yield * 200 - pesticides * 10`` (assumed price
    per unit yield and assumed cost per unit pesticide).

    Parameters
    ----------
    area : str
        Value for the ``Area`` feature.
    rainfall, temp, pesticides : float
        Values for ``avg_rainfall_mm``, ``avg_temp_c`` and
        ``Pesticides_tonnes``.
    model : object
        Anything with a ``predict(DataFrame)`` method that returns one value
        per row for a frame with the five feature columns built below.
    data : pandas.DataFrame
        Source of the candidate crops (its ``Item`` column).
    top_n : int, default 3
        Number of recommendations to return.

    Returns
    -------
    list of dict
        Up to ``top_n`` dicts with keys ``crop``, ``predicted_yield`` and
        ``profitability``, sorted by profitability in descending order.
        Crops whose prediction fails are skipped with a ``RuntimeWarning``.
    """
    price = 200  # Assumed price per unit yield
    cost = 10    # Assumed cost per unit pesticide

    recommendations = []
    for crop in data['Item'].unique().tolist():
        input_data = pd.DataFrame({
            'Area': [area],
            'Item': [crop],
            'avg_rainfall_mm': [rainfall],
            'avg_temp_c': [temp],
            'Pesticides_tonnes': [pesticides]
        })

        # Best effort per crop: one failing prediction must not abort the whole
        # ranking. Only ordinary errors are caught (never KeyboardInterrupt or
        # SystemExit), and the skip is reported instead of silently swallowed.
        try:
            predicted_yield = model.predict(input_data)[0]
        except Exception as exc:
            warnings.warn(
                f"Skipping crop {crop!r}: prediction failed ({exc})",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        recommendations.append({
            'crop': crop,
            'predicted_yield': predicted_yield,
            'profitability': (predicted_yield * price) - (pesticides * cost)
        })

    # Sort by profitability, most profitable first
    recommendations.sort(key=lambda x: x['profitability'], reverse=True)
    return recommendations[:top_n]

# Demo: rank the crops for one hypothetical set of conditions in India
recs = recommend_crops('India', 1200, 25, 100, best_model, df)
print("\nTop 3 Recommended Crops for India (Rainfall: 1200mm, Temp: 25°C, Pesticides: 100 tonnes):")
for rank, rec in enumerate(recs, start=1):
    # One print per recommendation; the embedded newlines keep the three-line layout
    print(
        f"  {rank}. {rec['crop']}\n"
        f"     Predicted Yield: {rec['predicted_yield']:.2f} hg/ha\n"
        f"     Estimated Profitability: ${rec['profitability']:.2f}"
    )

## 11. Feature Importance (Random Forest)

In [None]:
# Extract feature importance from the tuned Random Forest pipeline and report
# the five most influential inputs by name.
rf_regressor = best_rf.named_steps['regressor']
if hasattr(rf_regressor, 'feature_importances_'):
    importances = rf_regressor.feature_importances_

    # The regressor only sees the preprocessor's output (scaled numeric columns
    # followed by one-hot columns), so importances are indexed by transformed
    # column position. Ask the fitted preprocessor for the matching names;
    # fall back to positional labels on scikit-learn versions without
    # get_feature_names_out().
    fitted_preprocessor = best_rf.named_steps['preprocessor']
    if hasattr(fitted_preprocessor, 'get_feature_names_out'):
        feature_names = fitted_preprocessor.get_feature_names_out()
    else:
        feature_names = [f"Feature {pos}" for pos in range(len(importances))]

    print("\nTop Features by Importance (Random Forest):")
    # argsort is ascending: take the last five positions and reverse them.
    top_indices = np.argsort(importances)[-5:][::-1]
    for idx in top_indices:
        print(f"  {feature_names[idx]}: {importances[idx]:.4f}")
else:
    print("Feature importance not available for this model.")

## 12. Next Steps

- Deploy the model using the FastAPI backend (`app.py`)
- Launch the Streamlit frontend for interactive predictions
- Integrate MLflow for comprehensive experiment tracking
- Consider broader hyperparameter tuning (the grids searched above are deliberately small) for improved performance