In [45]:
import pandas as pd

df=pd.read_csv("data/yield_df.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [46]:
df=df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [47]:
df.shape

(28242, 7)

In [48]:
len(df.Area.value_counts())

101

In [49]:
len(df.Item.value_counts())

10

In [50]:
import numpy as np

X = df.drop('hg/ha_yield', axis=1)
y = df['hg/ha_yield'].values.astype(np.float32)

In [51]:
categorical_cols = ['Area', 'Item']
numerical_cols = ['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

#### Important Note: The code snippets below are used to convert sparse matrix into dense array

In [53]:
X_processed = preprocessor.fit_transform(X)

# Convert sparse matrix to dense array (OneHotEncoder outputs sparse arrays by default)
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

In [54]:
X_processed.shape

(28242, 115)

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [56]:
from torch.utils.data import Dataset,DataLoader
import torch

class CropDataset(Dataset):
    def __init__(self, X, y):
        # Convert numpy arrays to PyTorch tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        # Reshape y to be a column vector: (n_samples, 1)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [57]:
train_dataset = CropDataset(X_train, y_train)
test_dataset = CropDataset(X_test, y_test)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [58]:
from torch import nn

class YieldPredictorMLP(nn.Module):
    def __init__(self, input_size):
        super(YieldPredictorMLP, self).__init__()
        
        # Define the layers
        self.hidden1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        
        self.hidden2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        
        # Output layer produces a single continuous value
        self.output = nn.Linear(64, 1)

    def forward(self, x):
        # Pass data through the layers
        x = self.hidden1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        
        x = self.hidden2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.output(x)
        return x

In [59]:
input_dim = X_train.shape[1]
model = YieldPredictorMLP(input_size=input_dim)

In [60]:
from torch import optim
# Define the Loss function (Mean Squared Error) and Optimizer (Adam)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [61]:
from tqdm import tqdm

num_epochs = 500

for epoch in tqdm(range(num_epochs)):
    model.train() 
    train_loss = 0.0
    
    for batch_X, batch_y in train_loader:
            
        optimizer.zero_grad()
        
        
        predictions = model(batch_X)
        
        loss = criterion(predictions, batch_y)
        
        loss.backward()
        
        optimizer.step()
        
        train_loss += loss.item() * batch_X.size(0)
        
    train_loss = train_loss / len(train_loader.dataset)
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss (MSE): {train_loss:.4f}")

  2%|▏         | 10/500 [00:10<08:16,  1.01s/it]

Epoch [10/500], Loss (MSE): 1507436967.2107


  4%|▍         | 20/500 [00:21<10:44,  1.34s/it]

Epoch [20/500], Loss (MSE): 1065402559.2210


  6%|▌         | 30/500 [00:38<13:38,  1.74s/it]

Epoch [30/500], Loss (MSE): 913026996.1478


  8%|▊         | 40/500 [01:05<23:19,  3.04s/it]

Epoch [40/500], Loss (MSE): 829546117.2287


 10%|█         | 50/500 [01:36<21:01,  2.80s/it]

Epoch [50/500], Loss (MSE): 745155269.9206


 12%|█▏        | 60/500 [02:08<23:29,  3.20s/it]

Epoch [60/500], Loss (MSE): 649210188.0479


 14%|█▍        | 70/500 [02:37<22:22,  3.12s/it]

Epoch [70/500], Loss (MSE): 538747120.5177


 16%|█▌        | 80/500 [03:08<20:34,  2.94s/it]

Epoch [80/500], Loss (MSE): 474187525.7439


 18%|█▊        | 90/500 [03:42<22:29,  3.29s/it]

Epoch [90/500], Loss (MSE): 436029941.2646


 20%|██        | 100/500 [04:15<20:33,  3.08s/it]

Epoch [100/500], Loss (MSE): 407458017.3378


 22%|██▏       | 110/500 [04:46<19:30,  3.00s/it]

Epoch [110/500], Loss (MSE): 385292150.5944


 24%|██▍       | 120/500 [05:19<19:08,  3.02s/it]

Epoch [120/500], Loss (MSE): 372890769.3307


 26%|██▌       | 130/500 [05:49<18:28,  3.00s/it]

Epoch [130/500], Loss (MSE): 358794047.0620


 28%|██▊       | 140/500 [06:25<21:33,  3.59s/it]

Epoch [140/500], Loss (MSE): 357856638.0862


 30%|███       | 150/500 [06:57<19:11,  3.29s/it]

Epoch [150/500], Loss (MSE): 336626418.7938


 32%|███▏      | 160/500 [07:29<18:01,  3.18s/it]

Epoch [160/500], Loss (MSE): 337200946.7343


 34%|███▍      | 170/500 [08:02<17:31,  3.19s/it]

Epoch [170/500], Loss (MSE): 328526107.6758


 36%|███▌      | 180/500 [08:35<19:00,  3.56s/it]

Epoch [180/500], Loss (MSE): 326452333.1365


 38%|███▊      | 190/500 [09:08<16:36,  3.22s/it]

Epoch [190/500], Loss (MSE): 321331206.1268


 40%|████      | 200/500 [09:42<14:50,  2.97s/it]

Epoch [200/500], Loss (MSE): 317492128.6742


 42%|████▏     | 210/500 [10:14<15:18,  3.17s/it]

Epoch [210/500], Loss (MSE): 317159990.1660


 44%|████▍     | 220/500 [10:51<16:57,  3.63s/it]

Epoch [220/500], Loss (MSE): 305643683.7974


 46%|████▌     | 230/500 [11:27<17:25,  3.87s/it]

Epoch [230/500], Loss (MSE): 319643934.7097


 48%|████▊     | 240/500 [12:01<15:30,  3.58s/it]

Epoch [240/500], Loss (MSE): 311613798.6017


 50%|█████     | 250/500 [12:29<11:39,  2.80s/it]

Epoch [250/500], Loss (MSE): 305792697.9803


 52%|█████▏    | 260/500 [13:02<13:37,  3.40s/it]

Epoch [260/500], Loss (MSE): 302906271.5900


 54%|█████▍    | 270/500 [13:35<11:41,  3.05s/it]

Epoch [270/500], Loss (MSE): 301297793.9651


 56%|█████▌    | 280/500 [14:07<11:35,  3.16s/it]

Epoch [280/500], Loss (MSE): 296679394.2733


 58%|█████▊    | 290/500 [14:40<11:22,  3.25s/it]

Epoch [290/500], Loss (MSE): 303513336.5863


 60%|██████    | 300/500 [15:17<10:35,  3.18s/it]

Epoch [300/500], Loss (MSE): 298627851.4103


 62%|██████▏   | 310/500 [15:55<13:03,  4.12s/it]

Epoch [310/500], Loss (MSE): 297035586.3755


 64%|██████▍   | 320/500 [16:33<10:22,  3.46s/it]

Epoch [320/500], Loss (MSE): 288788984.6712


 66%|██████▌   | 330/500 [17:07<10:46,  3.81s/it]

Epoch [330/500], Loss (MSE): 288886747.4492


 68%|██████▊   | 340/500 [17:44<10:29,  3.93s/it]

Epoch [340/500], Loss (MSE): 289205212.8925


 70%|███████   | 350/500 [18:14<07:12,  2.88s/it]

Epoch [350/500], Loss (MSE): 289697732.1358


 72%|███████▏  | 360/500 [18:46<08:19,  3.57s/it]

Epoch [360/500], Loss (MSE): 279696023.1452


 74%|███████▍  | 370/500 [19:22<07:56,  3.66s/it]

Epoch [370/500], Loss (MSE): 284833050.7181


 76%|███████▌  | 380/500 [19:56<06:32,  3.27s/it]

Epoch [380/500], Loss (MSE): 288167625.4613


 78%|███████▊  | 390/500 [20:27<05:09,  2.81s/it]

Epoch [390/500], Loss (MSE): 274063179.2148


 80%|████████  | 400/500 [20:59<05:41,  3.41s/it]

Epoch [400/500], Loss (MSE): 281360517.6209


 82%|████████▏ | 410/500 [21:25<03:43,  2.49s/it]

Epoch [410/500], Loss (MSE): 267774391.8284


 84%|████████▍ | 420/500 [21:50<03:16,  2.46s/it]

Epoch [420/500], Loss (MSE): 275323839.8901


 86%|████████▌ | 430/500 [22:15<02:51,  2.45s/it]

Epoch [430/500], Loss (MSE): 276026425.3771


 88%|████████▊ | 440/500 [22:38<02:24,  2.41s/it]

Epoch [440/500], Loss (MSE): 267943918.9444


 90%|█████████ | 450/500 [23:00<01:50,  2.22s/it]

Epoch [450/500], Loss (MSE): 278750958.0675


 92%|█████████▏| 460/500 [23:22<01:32,  2.31s/it]

Epoch [460/500], Loss (MSE): 268814922.7653


 94%|█████████▍| 470/500 [23:44<01:04,  2.15s/it]

Epoch [470/500], Loss (MSE): 270182771.7591


 96%|█████████▌| 480/500 [24:07<00:43,  2.17s/it]

Epoch [480/500], Loss (MSE): 272234216.1281


 98%|█████████▊| 490/500 [24:32<00:24,  2.50s/it]

Epoch [490/500], Loss (MSE): 270850063.9335


100%|██████████| 500/500 [24:55<00:00,  2.99s/it]

Epoch [500/500], Loss (MSE): 263647505.3466





In [62]:
model.eval()
test_loss = 0.0
all_preds = []
all_targets = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        test_loss += loss.item() * batch_X.size(0)

        all_preds.append(predictions.squeeze(1).cpu().numpy())
        all_targets.append(batch_y.squeeze(1).cpu().numpy())

test_loss = test_loss / len(test_loader.dataset)

y_true = np.concatenate(all_targets)
y_pred = np.concatenate(all_preds)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
mae = float(mean_absolute_error(y_true, y_pred))
r2 = float(r2_score(y_true, y_pred))

print(f"\nFinal Test Loss (MSE): {test_loss:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")


Final Test Loss (MSE): 159496958.9110
RMSE: 12629.2106
MAE: 6356.4722
R² Score: 0.9780


In [63]:
def predict_yield(model, preprocessor, area, item, year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp):
    user_df = pd.DataFrame([
        {
            "Area": area,
            "Item": item,
            "Year": int(year),
            "average_rain_fall_mm_per_year": float(average_rain_fall_mm_per_year),
            "pesticides_tonnes": float(pesticides_tonnes),
            "avg_temp": float(avg_temp),
        }
    ])

    user_processed = preprocessor.transform(user_df)
    if hasattr(user_processed, "toarray"):
        user_processed = user_processed.toarray()

    user_tensor = torch.tensor(user_processed, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        prediction = model(user_tensor).item()

    return prediction


def predict_from_user_input(model, preprocessor):
    print("Enter input values for prediction:")

    area = input("Area (country/region): ").strip()
    item = input("Item (crop): ").strip()

    try:
        year = int(input("Year: ").strip())
        average_rain_fall_mm_per_year = float(input("Average rainfall (mm/year): ").strip())
        pesticides_tonnes = float(input("Pesticides (tonnes): ").strip())
        avg_temp = float(input("Average temperature: ").strip())
    except ValueError:
        print("Invalid numeric input. Please enter valid numbers.")
        return None

    prediction = predict_yield(
        model=model,
        preprocessor=preprocessor,
        area=area,
        item=item,
        year=year,
        average_rain_fall_mm_per_year=average_rain_fall_mm_per_year,
        pesticides_tonnes=pesticides_tonnes,
        avg_temp=avg_temp,
    )

    print(f"Predicted yield (hg/ha): {prediction:.2f}")
    return prediction

In [64]:
sample_row = df.iloc[0]

sample_prediction = predict_yield(
    model=model,
    preprocessor=preprocessor,
    area=sample_row["Area"],
    item=sample_row["Item"],
    year=sample_row["Year"],
    average_rain_fall_mm_per_year=sample_row["average_rain_fall_mm_per_year"],
    pesticides_tonnes=sample_row["pesticides_tonnes"],
    avg_temp=sample_row["avg_temp"],
)

print(f"Sample input actual yield: {sample_row['hg/ha_yield']}")
print(f"Sample input predicted yield: {sample_prediction:.2f}")

Sample input actual yield: 36613
Sample input predicted yield: 23518.23
