In [21]:
# rice_yield_train_predict.py

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib

# Paths
# All locations are relative to the notebook's working directory (two levels up).
# They are assembled with os.path.join so the separator matches the host OS:
# a hard-coded backslash is only a separator on Windows; elsewhere it is an
# ordinary filename character and the paths would not resolve.
DATA_PATH = os.path.join("..", "..", "data", "processed", "rice_data.csv")
OUTPUT_DIR = os.path.join("..", "..", "outputs")
MODEL_DIR = os.path.join("..", "..", "models")

# Create the output folders up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)


In [22]:

# ---------------------------
# 1. Load Data
# ---------------------------
# Read the CSV at DATA_PATH and cast the Year column to int in one chain.
df = pd.read_csv(DATA_PATH).astype({'Year': int})
print("✅ Dataset Loaded:", df.shape)


✅ Dataset Loaded: (33, 7)


In [23]:

# ---------------------------
# 2. Check for Missing
# ---------------------------
# Count the null entries in each column, then report them.
missing_counts = df.isna().sum()
print("🔍 Missing Values:\n", missing_counts)


🔍 Missing Values:
 Area                             0
Item                             0
Year                             0
hg/ha_yield                      0
avg_temp                         0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
dtype: int64


In [8]:

# ---------------------------
# 2. Check Missing Values
# ---------------------------
# NOTE(review): this cell repeats the missing-value check of the preceding cell
# and its execution count is out of sequence; consider keeping only one of them.

# Per-column tally of null entries in the loaded frame.
per_column_nulls = df.isna().sum()
print("🔍 Missing Values:\n", per_column_nulls)


🔍 Missing Values:
 Area                             0
Item                             0
Year                             0
hg/ha_yield                      0
avg_temp                         0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
dtype: int64


In [None]:

# ---------------------------
# 3. EDA Plots (Optional)
# ---------------------------
# Yield trend: yield plotted against year, written to OUTPUT_DIR and then closed
# (the figure is saved to disk rather than shown inline).
trend_fig, trend_ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df, x='Year', y='hg/ha_yield', ax=trend_ax)
trend_ax.set_title("Rice Yield Over Years in Nepal")
trend_ax.set_ylabel("Yield (hg/ha)")
trend_ax.grid()
trend_fig.tight_layout()
trend_fig.savefig(os.path.join(OUTPUT_DIR, 'rice_yield_trend.png'))
plt.close(trend_fig)


# Correlation Matrix
# Pairwise correlations between the yield and the three numeric drivers,
# rendered as an annotated heatmap.
corr_columns = ['hg/ha_yield', 'avg_temp', 'average_rain_fall_mm_per_year', 'pesticides_tonnes']
corr = df[corr_columns].corr()

heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title("Correlation Matrix")
heat_fig.tight_layout()
heat_fig.savefig(os.path.join(OUTPUT_DIR, 'correlation_matrix.png'))
plt.close(heat_fig)


In [27]:
# ---------------------------
# 4. Model Training
# ---------------------------
# Response column and the predictor columns shared by every candidate model.
target = 'hg/ha_yield'
features = ['avg_temp', 'average_rain_fall_mm_per_year', 'pesticides_tonnes']

# Drop rows that lack any of the modelling columns.
df = df.dropna(subset=[*features, target])

X, y = df[features], df[target]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors keyed by the name used in the printed report.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
}

# Fit each candidate on the training split and score it by RMSE on the held-out split.
# NOTE(review): the same held-out split is used both to pick the best model and
# to report its RMSE, so the reported figure is not an independent estimate.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    results[name] = {'model': model, 'rmse': rmse}
    print(f"{name} RMSE: {rmse:.2f}")

# Save best model
# The lowest RMSE wins; on a tie the candidate listed first in `models` is kept.
best_model_name, best_entry = min(results.items(), key=lambda item: item[1]['rmse'])
best_model = best_entry['model']
joblib.dump(best_model, os.path.join(MODEL_DIR, 'best_rice_yield_model.joblib'))

print(f"\n✅ Best Model: {best_model_name} (RMSE: {best_entry['rmse']:.2f})")



Linear Regression RMSE: 2422.12
Random Forest RMSE: 1860.08
XGBoost RMSE: 2707.26

✅ Best Model: Random Forest (RMSE: 1860.08)


In [30]:

# ---------------------------
# 5. Predict for 2025–2035 (Only If Data Exists)
# ---------------------------

# Select the rows of the loaded dataset whose Year lies in 2025..2035 (inclusive).
# Nothing is extrapolated: predictions are made only for rows that already
# exist in df for those years.
in_window = (df['Year'] >= 2025) & (df['Year'] <= 2035)
future_df = df.loc[in_window].copy()

if not future_df.empty:
    # Score the selected rows with the model chosen in the training step.
    X_future = future_df[features].astype(float)
    future_df['Predicted_Yield'] = best_model.predict(X_future)

    # Plot the predicted yield per year and write it to OUTPUT_DIR.
    proj_fig, proj_ax = plt.subplots(figsize=(10, 6))
    proj_ax.plot(
        future_df['Year'],
        future_df['Predicted_Yield'],
        marker='o',
        linestyle='-',
        color='green',
        label='Projected Yield',
    )
    proj_ax.set_title('Projected Rice Yield (2025–2035) – From Actual Dataset')
    proj_ax.set_xlabel('Year')
    proj_ax.set_ylabel('Yield (hg/ha)')
    proj_ax.grid(True)
    proj_ax.legend()
    proj_fig.tight_layout()
    proj_fig.savefig(os.path.join(OUTPUT_DIR, 'future_yield_from_actual_data.png'))
    plt.close(proj_fig)

    # Persist the selected rows together with their predictions.
    future_df.to_csv(os.path.join(OUTPUT_DIR, 'predicted_yield_2025_2035.csv'), index=False)

    print("✅ Future prediction complete. Output saved.")
else:
    print("⚠️ No actual data found for 2025–2035. Skipping prediction.")

⚠️ No actual data found for 2025–2035. Skipping prediction.
