<a href="https://colab.research.google.com/github/kashishnarwal/Week-1/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the dataset CSV into a DataFrame. The path is absolute, so the file
# must exist at exactly this location for the cell to run.
DATA_PATH = "/content/cleaned_ag_data.csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# Input columns, grouped by the preprocessing branch each one goes through.
numeric_features = [
    "Area_ha",
    "Rainfall_mm",
    "Fertilizer_kg_ha",
    "Tmin_C",
    "Tmax_C",
    "SOC_pct",
    "Year",
]
categorical_features = [
    "State",
    "District",
    "Season",
    "Crop",
]

# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; the columns are selected by name.
branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=branches)

# Derive the regression target as production divided by area and store it on
# `data` so later cells can show it next to the predictions.
data["Yield_t_ha"] = data["Production_t"] / data["Area_ha"]
X = data[numeric_features + categorical_features]
y = data["Yield_t_ha"]

# Keep only rows whose target is usable. A zero area produces +/-inf (or NaN
# for 0/0); notna() alone does not treat inf as missing, so infinite targets
# are excluded explicitly to stop them reaching the estimator.
mask = y.notna() & y.abs().ne(float("inf"))

# Hold out 20% of the usable rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X[mask], y[mask], test_size=0.2, random_state=42
)

# End-to-end estimator: the preprocessing step followed by a random forest of
# 200 trees. random_state is fixed so repeated fits give the same forest.
forest = RandomForestRegressor(n_estimators=200, random_state=42)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", forest),
    ]
)

In [5]:
# 5-fold cross-validated R² computed on the training split only.
cv_scores = cross_val_score(model, X_train, y_train, scoring="r2", cv=5)

# Fit on the full training split, then score on the held-out split.
# fit() returns the fitted pipeline, so the two calls can be chained.
r2_test = model.fit(X_train, y_train).score(X_test, y_test)

mean_cv_r2 = cv_scores.mean()
print("Cross-validation R²:", mean_cv_r2)
print("Test R²:", r2_test)

# Persist the fitted pipeline (preprocessing + regressor) for later reuse.
with open("trained_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

Cross-validation R²: 0.9661537336515416
Test R²: 0.983754552223419


In [9]:
import pandas as pd
import pickle

# Every column except the target and the production column it is derived from.
excluded_cols = {"Yield_t_ha", "Production_t"}
feature_cols = [name for name in data.columns if name not in excluded_cols]
X = data[feature_cols]

# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# trust. This file name is the one written by the training cell of this notebook.
with open("trained_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# First 10 rows as an example.
# NOTE(review): these rows come from the full dataset, not the held-out split,
# so some of them may have been used for training — confirm before treating
# the comparison as an evaluation.
sample_rows = X[:10]
predictions = model.predict(sample_rows)
print("Predicted yields (t/ha):", predictions)

Predicted yields (t/ha): [ 2.46890634  0.99425195  2.70778312  1.24955266  1.71501042  1.21878248
 12.34869417  1.80020441  2.42700096  2.30181053]


In [10]:
# Actual vs. predicted yield for the first ten rows. assign() returns a new
# frame, so `data` itself is left untouched.
display_cols = ["State", "District", "Crop", "Season", "Yield_t_ha", "Predicted_Yield"]
results = data.head(10).assign(Predicted_Yield=predictions)
print(results[display_cols])


            State     District       Crop  Season  Yield_t_ha  Predicted_Yield
0   Uttar Pradesh   District_1  Groundnut    Rabi    2.794939         2.468906
1  Madhya Pradesh   District_2     Pulses    Rabi    0.924709         0.994252
2     West Bengal   District_3      Maize    Rabi    2.626560         2.707783
3      Tamil Nadu   District_4    Mustard    Rabi    1.180864         1.249553
4          Punjab   District_5  Groundnut  Kharif    1.726402         1.715010
5   Uttar Pradesh   District_6    Mustard  Kharif    1.236820         1.218782
6     Maharashtra   District_7  Sugarcane    Rabi   12.280034        12.348694
7       Rajasthan   District_8     Barley  Kharif    1.787746         1.800204
8         Haryana   District_9     Cotton  Kharif    2.535093         2.427001
9     West Bengal  District_10     Barley  Kharif    2.433772         2.301811
