In [1]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import json

In [26]:

# === 1. Load Dataset ===
df = pd.read_csv('Crop_recommendation_no_rainfall.csv')

print("Dataset head:\n", df.head())
print("\nInformasi Dataset:")
print(df.info())

Dataset head:
     N   P   K  temperature   humidity        ph label
0  90  42  43    20.879744  82.002744  6.502985  rice
1  85  58  41    21.770462  80.319644  7.038096  rice
2  60  55  44    23.004459  82.320763  7.840207  rice
3  74  35  40    26.491096  80.158363  6.980401  rice
4  78  42  42    20.130175  81.604873  7.628473  rice

Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   label        2200 non-null   object 
dtypes: float64(3), int64(3), object(1)
memory usage: 120.4+ KB
None


In [28]:
# === 2. Label Encoding ===
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

In [30]:
# === 3. Feature Scaling ===
X = df.drop(columns=['label'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [32]:
# === 4. Split Data ===
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

In [34]:
# === 5. Train GBM ===
gbm = GradientBoostingClassifier(random_state=42)
params_gbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}
gbm_search = RandomizedSearchCV(gbm, param_distributions=params_gbm, n_iter=5, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)
gbm_search.fit(X_train, y_train)
best_gbm = gbm_search.best_estimator_

In [36]:
# === 6. Train LGBM ===
lgbm = LGBMClassifier(random_state=42)
params_lgbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, -1],
    'subsample': [0.8, 1.0]
}
lgbm_search = RandomizedSearchCV(lgbm, param_distributions=params_lgbm, n_iter=5, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)
lgbm_search.fit(X_train, y_train)
best_lgbm = lgbm_search.best_estimator_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1077
[LightGBM] [Info] Number of data points in the train set: 1760, number of used features: 6
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] 

In [38]:
# === . Evaluasi dan Pilih Model Terbaik ===
acc_gbm = accuracy_score(y_test, best_gbm.predict(X_test))
acc_lgbm = accuracy_score(y_test, best_lgbm.predict(X_test))

print(f"🎯 Akurasi GBM  : {acc_gbm:.4f}")
print(f"🎯 Akurasi LGBM : {acc_lgbm:.4f}")

if acc_lgbm >= acc_gbm:
    best_model = best_lgbm
    model_file = 'best_model_lgbm.pkl'
    print("✅ Menggunakan LightGBM sebagai model terbaik.")
else:
    best_model = best_gbm
    model_file = 'best_model_gbm.pkl'
    print("✅ Menggunakan GradientBoosting sebagai model terbaik.")

🎯 Akurasi GBM  : 0.9659
🎯 Akurasi LGBM : 0.9659
✅ Menggunakan LightGBM sebagai model terbaik.


In [40]:
# === 9. Simpan Model, Scaler, Label Encoder ===
joblib.dump(best_model, model_file)
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [18]:
print("Kolom di dataset:", df.columns.tolist())
print("Jumlah fitur (X):", X.shape[1])
print("Contoh input untuk prediksi:")
print(X[0])  # atau tampilkan X[0].shape

Kolom di dataset: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'label']
Jumlah fitur (X): 6
Contoh input untuk prediksi:


KeyError: 0