In [48]:
# Yield Prediction Project (RandomForestRegressor Best Model)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ================= Load and Clean Data =================
# Read the raw CSV, discard its 'Unnamed: 0' column and remove rows that
# are exact duplicates of an earlier row.
df = (
    pd.read_csv('yield_df.csv')
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)

# Remove invalid rainfall values
def isStr(obj):
    """Return True when ``obj`` cannot be converted to a float.

    Acts as a row filter: a value for which ``float(obj)`` succeeds is
    treated as a valid number (returns False); anything else is flagged
    (returns True).

    Only the errors ``float()`` raises for unconvertible input are caught,
    so interrupts such as ``KeyboardInterrupt`` or ``SystemExit`` are no
    longer silently swallowed and misreported as "not a number".
    """
    try:
        float(obj)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparseable
        # text; OverflowError: an int too large to fit in a float.
        return True
    return False
# Flag every row whose rainfall entry cannot be parsed as a float, then
# remove those rows by index label.
invalid_rain = df['average_rain_fall_mm_per_year'].apply(isStr)
to_drop = df[invalid_rain].index
df = df.drop(index=to_drop)

# Fix dtype: the remaining rainfall values all parse as floats, so the
# column can be cast to float64.
df = df.astype({'average_rain_fall_mm_per_year': np.float64})

# Rearrange columns so that 'hg/ha_yield' ends up as the last column.
col = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
    'Area',
    'Item',
    'hg/ha_yield',
]
df = df[col]

# ================= Train-Test Split =================
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Shuffled split with 80% of the rows used for training; the fixed seed
# makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

# ================= Preprocessing =================
# Standardise the numeric columns and one-hot encode the categorical ones
# (the first level of each categorical column is dropped).
scale = StandardScaler()
ohe = OneHotEncoder(drop='first')

preprocesser = ColumnTransformer(
    [
        ('StandardScale', scale, [0, 1, 2, 3]),  # numerical columns, by position
        ('OHE', ohe, [4, 5]),                    # categorical columns, by position
    ],
    remainder='passthrough',
)

# Fit on the training split only, then reuse the fitted transformer on the
# test split.
X_train_dummy = preprocesser.fit_transform(X_train)
X_test_dummy = preprocesser.transform(X_test)

# ================= Model Training =================
# 200 trees, no depth limit, fixed seed, all available cores.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=0,
    n_jobs=-1,
).fit(X_train_dummy, y_train)

# Evaluate on the held-out test split
y_pred = rf.predict(X_test_dummy)
print("Random Forest Results:")
for label, metric in (("MAE:", mean_absolute_error), ("R² Score:", r2_score)):
    print(label, metric(y_test, y_pred))

# ================= Prediction Function =================
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item):
    """Predict the yield for a single observation.

    The six inputs are packed into a one-row DataFrame whose columns carry
    the feature names listed below, run through the module-level
    ``preprocesser`` and then scored by the module-level ``rf`` model.

    Returns the first (and only) element of the model's prediction array.
    """
    feature_names = [
        'Year',
        'average_rain_fall_mm_per_year',
        'pesticides_tonnes',
        'avg_temp',
        'Area',
        'Item',
    ]
    row = [Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item]

    # A named-column frame is built so the input has the same layout the
    # transformer is given elsewhere in this script.
    features = pd.DataFrame([row], columns=feature_names)
    return rf.predict(preprocesser.transform(features))[0]

# Example test: the same sample row is scored twice, once under each label.
sample_inputs = (1990, 1485.0, 121.0, 16.37, "Albania", "Maize")
print("Sample Prediction (clean):", prediction(*sample_inputs))
print("Sample Prediction:", prediction(*sample_inputs))

# ================= Save Model and Preprocessor =================
# Each file is opened in a context manager so it is flushed and closed as
# soon as the dump finishes (or fails); the previous bare open() calls left
# the handles open until garbage collection.
with open('rf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocesser, preprocessor_file)

# Print the installed scikit-learn version.
# NOTE(review): presumably recorded so the pickled model and preprocessor
# can later be loaded with a matching scikit-learn version — confirm.
import sklearn
print("sklearn version:", sklearn.__version__)


Random Forest Results:
MAE: 3690.3807239251973
R² Score: 0.9879056001600746
Sample Prediction (clean): 33084.53
Sample Prediction: 33084.53
sklearn version: 1.6.1
