<a href="https://colab.research.google.com/github/kashishnarwal/Week-1/blob/main/Week_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

raw_path = "/content/sustainable_ag_starter.csv"
data = pd.read_csv(raw_path)

In [5]:
numeric_features = ["Area_ha","Rainfall_mm","Fertilizer_kg_ha",
                    "Tmin_C","Tmax_C","SOC_pct","Year"]
categorical_features = ["State","District","Season","Crop"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Outlier handling
def iqr_clip(series, factor=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor*iqr
    upper = q3 + factor*iqr
    return series.clip(lower, upper)

clean = data.copy()
for col in ["Area_ha","Production_t","Yield_t_ha"]:
    clean[col] = iqr_clip(clean[col])

# Feature engineering

clean["GDD"] = ((clean["Tmax_C"] + clean["Tmin_C"]) / 2 - 10).clip(lower=0) * 120/10
clean["Rainfall_Anom"] = clean["Rainfall_mm"] - clean["Rainfall_mm"].median()

target = "Yield_t_ha"
feature_cols = numeric_features + ["GDD","Rainfall_Anom"] + categorical_features

X = clean[feature_cols]
y = clean[target]

# Modeling
model = RandomForestRegressor(n_estimators=300, random_state=42)
pipe = Pipeline(steps=[("preprocessor", preprocessor),
                      ("model", model)])

mask = y.notna()
X_train, X_test, y_train, y_test = train_test_split(
    X[mask], y[mask], test_size=0.2, random_state=42
)

cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="r2")
pipe.fit(X_train, y_train)
r2_test = pipe.score(X_test, y_test)

print("Cross-val R² mean:", cv_scores.mean())
print("Test R²:", r2_test)

clean_path = "cleaned_ag_data.csv"
pipe_path = "preprocessing_pipeline.pkl"

clean.to_csv(clean_path, index=False)
with open(pipe_path, "wb") as f:
    pickle.dump(pipe, f)

print("Cleaned data saved at:", clean_path)
print("Pipeline saved at:", pipe_path)


Cross-val R² mean: 0.9587479123023247
Test R²: 0.9461489686358847
Cleaned data saved at: cleaned_ag_data.csv
Pipeline saved at: preprocessing_pipeline.pkl
