<a href="https://colab.research.google.com/github/harshitha-gokulraj/Climate-Smart-Crop-Selection-/blob/main/Sustainable_agriculture_jpynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SET UP ENVIRONMENT**

In [None]:
!pip install numpy pandas geopandas rasterio shapely pyproj scikit-learn lightgbm xgboost catboost pyarrow matplotlib plotly streamlit joblib

Collecting rasterio
  Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting streamlit
  Downloading streamlit-1.49.0-py3-none-any.whl.metadata (9.5 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.3/22.3 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.8-cp312-cp31

****

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, label_ranking_average_precision_score
from lightgbm import LGBMClassifier
import joblib


**Dummy Dataset (for testing pipeline)**

In [None]:
# Create a dummy dataset
np.random.seed(42)
N = 500

data = pd.DataFrame({
    "tmean": np.random.uniform(15, 35, N),       # Mean temperature (°C)
    "rain": np.random.uniform(50, 400, N),       # Rainfall (mm/month)
    "pet": np.random.uniform(100, 250, N),       # PET
    "ph": np.random.uniform(4.5, 8.5, N),        # Soil pH
    "oc": np.random.uniform(0.2, 2.0, N),        # Organic carbon %
    "season": np.random.choice(["kharif","rabi","summer"], N)
})

# Target crops (multi-class for demo)
crops = ["rice","wheat","maize","cotton","soybean"]
data["crop"] = np.random.choice(crops, N)

# Binary suitability (for demo, 70% suitable)
data["suitability"] = np.random.choice([0,1], size=N, p=[0.3,0.7])

data.head()


Unnamed: 0,tmean,rain,pet,ph,oc,season,crop,suitability
0,22.490802,294.3566,127.769939,6.576327,0.67107,summer,soybean,1
1,34.014286,237.633728,181.285142,6.416728,0.644562,summer,rice,0
2,29.639879,158.334666,230.941875,4.602568,1.831258,summer,maize,1
3,26.97317,334.828257,209.833733,5.864991,0.649183,summer,cotton,1
4,18.120373,289.65591,220.984172,6.020782,0.68951,summer,cotton,0


**Feature Engineering**

In [None]:
def make_features(df):
    out = df.copy()
    out["rain_cv"] = np.random.uniform(0.1, 0.5, len(df))  # Dummy variability
    out["aridity"] = (out["pet"] / (out["rain"]+1e-6)).clip(0,10)
    out = pd.get_dummies(out, columns=["season"])
    return out

features = make_features(data)
X = features.drop(columns=["crop","suitability"])
y = data["suitability"]


**Model Training**

In [10]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
from sklearn.datasets import load_breast_cancer  # example dataset

# Example dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))

# Model
clf = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Cross-validation
for tr, va in skf.split(X, y):
    clf.fit(
        X.iloc[tr], y.iloc[tr],
        eval_set=[(X.iloc[va], y.iloc[va])],
        eval_metric="average_precision",
        callbacks=[lgb.log_evaluation(0)]  # suppress logs
    )
    oof[va] = clf.predict_proba(X.iloc[va])[:, 1]

# Final score
ap = average_precision_score(y, oof)
print("Average Precision Score:", ap)


[LightGBM] [Info] Number of positive: 286, number of negative: 169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4536
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628571 -> initscore=0.526093
[LightGBM] [Info] Start training from score 0.526093
[LightGBM] [Info] Number of positive: 286, number of negative: 169
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628571 -> initscore=0.526093
[LightGBM] [Info]

**Inference – Recommend Crops**

In [13]:
def recommend(sample_features: pd.DataFrame, model):
    score = model.predict_proba(
        sample_features,
        predict_disable_shape_check=True
    )[:, 1]
    return score

# Example input (replace values with real soil & climate data)
sample = pd.DataFrame([{
    "tmean": 27, "rain": 250, "pet": 180, "ph": 6.5, "oc": 1.0,
    "season_kharif": 1, "season_rabi": 0, "season_summer": 0,
    "rain_cv": 0.3, "aridity": 0.72
}])

print("Suitability Score:", recommend(sample, clf))


Suitability Score: [0.99999947]
