## 1. Cleaning Data

In [495]:
import pandas as pd
import numpy as np

In [496]:
# Read the raw crop requirements table from disk.
file_path = "./datasets/original_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [497]:
# Columns that should be numeric
numeric_cols = [
    "N", "P", "K",
    "min_temp", "max_temp",
    "min_humidity", "max_humidity",
    "min_ph", "max_ph",
    "min_rainfall", "max_rainfall",
]

# Replace the "Not in source" placeholder with NaN so it counts as missing
df[numeric_cols] = df[numeric_cols].replace("Not in source", np.nan)

# Drop the "Source" column. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column is
# already gone from df.
df = df.drop(columns=["Source"], errors="ignore")

In [498]:
df.head()

Unnamed: 0,N,P,K,min_temp,max_temp,min_humidity,max_humidity,min_ph,max_ph,min_rainfall,max_rainfall,plant
0,80,60,40,24,32,70,90,6.0,7.0,800,2000,Kangkung
1,90,50,50,25,30,60,80,5.5,6.5,700,1500,Kangkung
2,100,40,30,22,28,75,95,6.5,7.5,1000,2500,Kangkung
3,80,60,60,15,20,70,90,6.0,7.0,1200,2500,Kangkung
4,80,30,45,24,32,85,95,6.0,7.0,800,1200,Kangkung


In [499]:
# Convert columns to numeric; values that cannot be parsed become NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Fill NaN with the average value per plant (group-wise mean).
# The means are computed once with the built-in "mean" transform and only
# used to fill gaps, so existing values are never overwritten. In particular,
# rows whose "plant" is missing get NaN from the grouped transform and
# therefore simply keep whatever values they already had.
plant_means = df.groupby("plant")[numeric_cols].transform("mean")
df[numeric_cols] = df[numeric_cols].fillna(plant_means)

# If some values are STILL NaN (the plant has no data at all for a column),
# fall back to the global column mean
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [500]:
# Check data types
print(df.dtypes)

N               float64
P               float64
K               float64
min_temp        float64
max_temp        float64
min_humidity    float64
max_humidity    float64
min_ph          float64
max_ph          float64
min_rainfall    float64
max_rainfall    float64
plant            object
dtype: object


In [501]:
df.head()

Unnamed: 0,N,P,K,min_temp,max_temp,min_humidity,max_humidity,min_ph,max_ph,min_rainfall,max_rainfall,plant
0,80.0,60.0,40.0,24.0,32.0,70.0,90.0,6.0,7.0,800.0,2000.0,Kangkung
1,90.0,50.0,50.0,25.0,30.0,60.0,80.0,5.5,6.5,700.0,1500.0,Kangkung
2,100.0,40.0,30.0,22.0,28.0,75.0,95.0,6.5,7.5,1000.0,2500.0,Kangkung
3,80.0,60.0,60.0,15.0,20.0,70.0,90.0,6.0,7.0,1200.0,2500.0,Kangkung
4,80.0,30.0,45.0,24.0,32.0,85.0,95.0,6.0,7.0,800.0,1200.0,Kangkung


In [502]:
# Drop every row whose "plant" value contains the text "plant"
# (case-insensitive); rows with a missing "plant" value are kept.
# NOTE(review): this is a substring match, so a crop name that merely
# contains "plant" (e.g. "Eggplant") would be dropped too - confirm intended.
contains_plant_text = df["plant"].str.contains("plant", case=False, na=False)
df = df[~contains_plant_text]

In [503]:
# Distinct plant labels remaining in df, in order of first appearance
unique_plants = df["plant"].unique()
print(f"Unique plants: {unique_plants}")

Unique plants: ['Kangkung' 'Bayam' 'Selada' 'Cabai' 'Daun Bawang' 'Garlic' 'Aloe vera'
 'Basil' 'Bell pepper' "Cat's whiskers" 'Cherry tomato' 'Ginger'
 'Lemongrass' 'Pandan' 'Potato' 'Turmeric' 'Zucchini' 'Celery' 'Pak choy'
 'Shallot']


In [504]:
# Persist the cleaned table; index=False keeps the row index out of the CSV.
df.to_csv("datasets/cleaned_crop_data.csv", index=False)

## 2. Make More Data

In [505]:
# Reload the cleaned table from the CSV file and preview the first rows.
df = pd.read_csv("datasets/cleaned_crop_data.csv")
df.head()

Unnamed: 0,N,P,K,min_temp,max_temp,min_humidity,max_humidity,min_ph,max_ph,min_rainfall,max_rainfall,plant
0,80.0,60.0,40.0,24.0,32.0,70.0,90.0,6.0,7.0,800.0,2000.0,Kangkung
1,90.0,50.0,50.0,25.0,30.0,60.0,80.0,5.5,6.5,700.0,1500.0,Kangkung
2,100.0,40.0,30.0,22.0,28.0,75.0,95.0,6.5,7.5,1000.0,2500.0,Kangkung
3,80.0,60.0,60.0,15.0,20.0,70.0,90.0,6.0,7.0,1200.0,2500.0,Kangkung
4,80.0,30.0,45.0,24.0,32.0,85.0,95.0,6.0,7.0,800.0,1200.0,Kangkung


In [506]:
# Lower/upper bound columns that receive noise during augmentation.
# Each "min_<name>" column is paired with its "max_<name>" counterpart.
minmax_cols = [
    "min_temp", "max_temp",
    "min_humidity", "max_humidity",
    "min_ph", "max_ph",
    "min_rainfall", "max_rainfall"
]

def augment_minmax(df, n_aug=5, noise_level=0.05, random_state=None):
    """Create noisy copies of every row of ``df``.

    Each row is repeated ``n_aug`` times. For every column in ``minmax_cols``
    zero-mean Gaussian noise with standard deviation
    ``noise_level * df[col].std()`` is added to the copies; columns whose
    standard deviation is not positive (constant or NaN) are left unchanged.
    After the noise is applied, every ``min_*``/``max_*`` pair is re-ordered
    so that the lower bound never exceeds the upper bound.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Must contain every column listed in ``minmax_cols``.
        It is not modified.
    n_aug : int, default 5
        Number of noisy copies per source row. Values <= 0 give an empty frame.
    noise_level : float, default 0.05
        Noise scale expressed as a fraction of each column's standard deviation.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the noise. ``None`` draws fresh entropy, so
        results differ between calls; pass an integer for reproducible output.

    Returns
    -------
    pandas.DataFrame
        ``len(df) * n_aug`` rows with the same columns as ``df``. Copies of the
        same source row are adjacent and carry that row's original index label.
    """
    rng = np.random.default_rng(random_state)
    n_aug = max(int(n_aug), 0)

    # Repeat by position (not by label) so a non-unique index cannot fan out.
    positions = np.repeat(np.arange(len(df)), n_aug)
    augmented = df.iloc[positions].copy()

    for col in minmax_cols:
        # The spread is taken from the source data once per column.
        std = df[col].std()
        if std > 0:
            noise = rng.normal(0.0, noise_level * std, size=len(augmented))
            augmented[col] = augmented[col].to_numpy() + noise

    # Ensure min <= max for every bound pair, not only temperature.
    for low in minmax_cols:
        if not low.startswith("min_"):
            continue
        high = "max_" + low[len("min_"):]
        if high not in augmented.columns:
            continue
        low_vals = augmented[low].to_numpy()
        high_vals = augmented[high].to_numpy()
        # NaN comparisons are False, so rows with missing bounds are untouched.
        swap = low_vals > high_vals
        augmented[low] = np.where(swap, high_vals, low_vals)
        augmented[high] = np.where(swap, low_vals, high_vals)

    return augmented


In [507]:
# Stack the original rows on top of 10 noisy copies of each row and
# renumber the combined index from zero.
augmented_rows = augment_minmax(df, n_aug=10)
df_aug = pd.concat([df, augmented_rows], ignore_index=True)

In [508]:
# No bound column may go below zero after noise was added.
df_aug[minmax_cols] = df_aug[minmax_cols].clip(lower=0)

# Both pH bounds are additionally confined to the 3-8 interval.
for ph_col in ("min_ph", "max_ph"):
    df_aug[ph_col] = df_aug[ph_col].clip(3, 8)

In [509]:
# Collapse each min/max pair into a single midpoint feature
# (temp, humidity, ph, rainfall - added in that order).
for feature in ("temp", "humidity", "ph", "rainfall"):
    lower = df_aug[f"min_{feature}"]
    upper = df_aug[f"max_{feature}"]
    df_aug[feature] = (lower + upper) / 2

In [510]:
def normalize_rainfall(r, annual_threshold=600, months_per_year=12):
    """Convert a rainfall figure to a per-month value using a size heuristic.

    Values strictly greater than ``annual_threshold`` are treated as yearly
    totals and divided by ``months_per_year``; everything else is assumed to
    be monthly already and returned unchanged.

    Parameters
    ----------
    r : float
        Rainfall value (unit of the source data).
    annual_threshold : float, default 600
        Cut-off above which ``r`` is considered an annual total.
    months_per_year : float, default 12
        Divisor applied to annual totals.

    Returns
    -------
    float
        ``r / months_per_year`` when ``r > annual_threshold``, otherwise ``r``.
        NaN compares False and is therefore returned as-is.

    Notes
    -----
    The rule is discontinuous at the threshold: a value just above it shrinks
    to roughly ``annual_threshold / months_per_year`` while a value at or just
    below it is kept whole.
    """
    if r > annual_threshold:      # considered an annual total
        return r / months_per_year
    return r                      # already monthly

# Apply the annual-vs-monthly heuristic to every midpoint rainfall value.
df_aug["rainfall"] = df_aug["rainfall"].apply(normalize_rainfall)

In [511]:
# Realistic monthly rainfall: cap values to the 0-500 range
df_aug["rainfall"] = df_aug["rainfall"].clip(0, 500)

In [512]:
df_aug["rainfall"].describe()

count    1386.000000
mean      107.425787
std        91.712717
min         5.449201
25%        60.931819
50%        78.781993
75%       100.685704
max       500.000000
Name: rainfall, dtype: float64

In [513]:
# Drop the raw min/max bound columns; the midpoint columns derived from them stay.
df_final = df_aug.drop(columns=minmax_cols)

In [514]:
# Write the augmented feature table to disk without the row index.
df_final.to_csv(
    "datasets/augmented_crop_research_grade.csv",
    index=False,
)

## 3. Create Model Classifier

In [515]:
# Load the augmented feature table; note this rebinds the name `df`.
df = pd.read_csv("./datasets/augmented_crop_research_grade.csv")

In [516]:
# Model inputs and the label column. Only these three columns are used as
# features; any other column present in df is ignored by the model.
features = ["temp", "humidity", "rainfall"]
target = "plant"

X, y = df[features], df[target]

In [517]:
from sklearn.preprocessing import LabelEncoder

# Learn the plant-name -> integer mapping, then encode the target with it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [518]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify keeps the class proportions
# the same in both parts and random_state fixes the shuffle.
# NOTE(review): the CSV loaded above contains every source row together with
# its noisy augmented copies, and this split is row-wise. Near-duplicates of
# one source row can therefore land in both train and test, which may make
# the reported accuracy optimistic. Consider splitting before augmenting or
# using a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

In [519]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees. random_state makes training repeatable and
# class_weight="balanced" weights classes inversely to their frequency.
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

# Fit on the training portion only.
model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [520]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out rows and print overall accuracy plus per-class metrics,
# labelling each class with its original plant name.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)

print("Accuracy:", test_accuracy)
print(per_class_report)

Accuracy: 0.9388489208633094
                precision    recall  f1-score   support

     Aloe vera       1.00      1.00      1.00        16
         Basil       0.75      0.43      0.55         7
         Bayam       0.97      0.97      0.97        33
   Bell pepper       1.00      0.73      0.85        15
         Cabai       0.92      1.00      0.96        33
Cat's whiskers       1.00      1.00      1.00         2
        Celery       1.00      0.86      0.92         7
 Cherry tomato       1.00      1.00      1.00         7
   Daun Bawang       1.00      0.85      0.92        13
        Garlic       1.00      1.00      1.00        29
        Ginger       1.00      1.00      1.00         4
      Kangkung       1.00      1.00      1.00        20
    Lemongrass       1.00      1.00      1.00         7
      Pak choy       1.00      1.00      1.00        15
        Pandan       1.00      0.75      0.86         4
        Potato       1.00      1.00      1.00         9
        Selada    

In [521]:
import joblib
from pathlib import Path

# joblib.dump writes to the given path but does not create missing parent
# directories, so make sure ./model exists (needed on a fresh checkout).
Path("./model").mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and the label encoder needed to decode
# its predictions back to plant names.
joblib.dump(model, "./model/crop_recommendation_model.pkl")
joblib.dump(label_encoder, "./model/crop_recommendation_label_encoder.pkl")

['./model/crop_recommendation_label_encoder.pkl']

## 4. Test Model

In [522]:
# Load the persisted classifier and its label encoder back from disk.
# These files are pickle-based; only load artefacts produced by a trusted source.
crop_model = joblib.load("./model/crop_recommendation_model.pkl")
crop_label_encoder = joblib.load("./model/crop_recommendation_label_encoder.pkl")

In [523]:
def predict_plant(temp, humidity, rainfall):
    """Return the single most likely plant name for the given conditions.

    The three values are placed in a one-row DataFrame whose columns are
    "temp", "humidity" and "rainfall", classified with ``crop_model`` and
    decoded to a plant name with ``crop_label_encoder``.
    """
    sample = pd.DataFrame(
        {"temp": [temp], "humidity": [humidity], "rainfall": [rainfall]}
    )

    encoded_label = crop_model.predict(sample)[0]
    return crop_label_encoder.inverse_transform([encoded_label])[0]

In [524]:
# Example query: one set of conditions, single best recommendation.
result = predict_plant(
    temp=27.5,
    humidity=75,
    rainfall=180
)

print("Recommended plant:", result)


Recommended plant: Ginger


In [525]:
# TOP 3 RECOMMENDATIONS
import numpy as np

def predict_top_k(temp, humidity, rainfall, k=3):
    """Return the ``k`` most probable plants for the given conditions.

    Uses the reloaded ``crop_model`` / ``crop_label_encoder`` pair, the same
    objects ``predict_plant`` relies on, so this section works from the saved
    artefacts without the training cells having been run.

    Parameters
    ----------
    temp, humidity, rainfall : float
        Feature values, passed to the model under the column names
        "temp", "humidity" and "rainfall".
    k : int, default 3
        Number of recommendations. Values <= 0 yield an empty list; values
        larger than the number of classes return every class.

    Returns
    -------
    list of (plant_name, probability)
        Sorted from most to least probable.
    """
    if k <= 0:
        # Without this guard the slice below would keep *all* classes.
        return []

    input_df = pd.DataFrame([{
        "temp": temp,
        "humidity": humidity,
        "rainfall": rainfall
    }])

    probs = crop_model.predict_proba(input_df)[0]
    top_idx = np.argsort(probs)[::-1][:k]

    # Probability columns follow crop_model.classes_, so map the column
    # positions to encoded labels before decoding them to plant names.
    top_labels = crop_model.classes_[top_idx]

    return list(zip(
        crop_label_encoder.inverse_transform(top_labels),
        probs[top_idx]
    ))


In [526]:
# Example query: positional arguments are temp=24, humidity=80, rainfall=84;
# k is left at its default of 3.
result_top_3 = predict_top_k(24,80,84)

result_top_3

[('Daun Bawang', np.float64(0.575)),
 ('Cabai', np.float64(0.3)),
 ('Selada', np.float64(0.045))]