## **Crop Recommendation System API**

In [22]:
import pandas as pd
import plotly.express as px

In [23]:
# Read the crop-recommendation CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
DATA_PATH = "Crop_recommendation.csv"
df = pd.read_csv(DATA_PATH)

## Data Preprocessing

In [24]:
# Count the null entries in every column and print the per-column totals.
null_mask = df.isna()
missing = null_mask.sum()
print("Missing values per column:\n", missing)

Missing values per column:
 N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64


In [25]:
# Impute missing values in numeric columns with the column mean.
# The filled Series is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which
# pandas 2.x warns about and which, under Copy-on-Write, modifies a temporary
# and leaves `df` untouched.
num_cols = df.select_dtypes(include='number').columns
for col in num_cols:
    if df[col].isnull().any():
        mean_val = df[col].mean()
        df[col] = df[col].fillna(mean_val)
        print(f"Filled missing values in {col} with mean: {mean_val:.2f}")

In [26]:
# Box plot of nitrogen (N) for each crop label; points="all" overlays every
# individual observation next to its box.
fig = px.box(
    data_frame=df,
    x='label',
    y='N',
    title='Nitrogen Content Distribution by Crop',
    points="all",
)
fig.show()

In [27]:
# Scatter matrix (pair plot) of the nutrient and climate features, coloured by
# crop label. The diagonal panels are hidden.
matrix_features = ['N', 'P', 'K', 'temperature', 'humidity']
fig = px.scatter_matrix(
    df,
    dimensions=matrix_features,
    color='label',
    title='Scatter Matrix of Nutrients and Environment by Crop',
)
fig.update_traces(diagonal_visible=False)
fig.show()

## Model Training

In [28]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report



In [29]:
# Split the frame into the feature matrix (every column except "label")
# and the target column ("label").
y = df["label"]
X = df.drop(columns=["label"])


In [30]:
# Map the crop-name labels to integer class ids; `le` is kept so the ids can be
# mapped back to names later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [31]:
# Standardise the features: learn the scaling statistics from X, then apply them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [32]:
# Train/test split (80/20, fixed seed).
# The split is done on the *unscaled* features and the scaler is then refitted
# on the training rows only. Fitting the scaler on the full dataset before
# splitting lets test-set statistics leak into the training features (and into
# the scaler that is pickled for inference), which biases the evaluation.
# The same `random_state` and `test_size` are used, so the row assignment to
# train/test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [33]:
# Base estimators handed to the grid searches.
# Every estimator that uses randomness is seeded with 42 so reruns give the
# same results. LogisticRegression is seeded too because the grid searches the
# 'saga' and 'liblinear' solvers, which shuffle the data. KNN has no
# random_state parameter.
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()
lr_model = LogisticRegression(max_iter=1000, random_state=42)



In [34]:
# Hyper-parameter search spaces, one per estimator. Each key is a constructor
# argument of the matching model; each list holds the candidate values.

# Random forest: ensemble size and tree-shape limits.
rf_params = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Decision tree: depth, split threshold and impurity criterion.
dt_params = dict(
    max_depth=[5, 10, 15],
    min_samples_split=[2, 4, 6],
    criterion=['gini', 'entropy'],
)

# k-nearest neighbours: neighbourhood size, vote weighting and distance metric.
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Logistic regression: inverse regularisation strength and solver.
# NOTE(review): confirm 'liblinear' is still accepted for a multiclass target
# by the installed scikit-learn version.
lr_params = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)


In [35]:
# One GridSearchCV per estimator, all with the same settings:
# 3-fold cross-validation and n_jobs=-1.
search_settings = {"cv": 3, "n_jobs": -1}

rf_grid = GridSearchCV(rf_model, rf_params, **search_settings)
dt_grid = GridSearchCV(dt_model, dt_params, **search_settings)
knn_grid = GridSearchCV(knn_model, knn_params, **search_settings)
lr_grid = GridSearchCV(lr_model, lr_params, **search_settings)

In [36]:
# Run each grid search on the training split and print the winning
# hyper-parameter combination, one model at a time.
for display_name, search in (
    ("Random Forest", rf_grid),
    ("Decision Tree", dt_grid),
    ("KNN", knn_grid),
    ("Logistic Regression", lr_grid),
):
    search.fit(X_train, y_train)
    print(f"\n✅ Best {display_name} Params:", search.best_params_)



✅ Best Random Forest Params: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}

✅ Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 4}

✅ Best KNN Params: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

✅ Best Logistic Regression Params: {'C': 10, 'solver': 'saga'}


## Model Export

In [38]:
# Score each tuned estimator on the held-out test split and pickle it, then
# pickle the fitted scaler and label encoder alongside the models.
import pickle


def _pickle_to(path, obj):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


# Best estimator found by each grid search, keyed by its display name.
models = {
    "Random Forest": rf_grid.best_estimator_,
    "Decision Tree": dt_grid.best_estimator_,
    "KNN": knn_grid.best_estimator_,
    "Logistic Regression": lr_grid.best_estimator_,
}

print("\n📊 Model Evaluation:")
for name, model in models.items():
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"\n--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    # File name is the display name lower-cased with spaces replaced by
    # underscores, e.g. "random_forest_model.pkl".
    _pickle_to(f"{name.lower().replace(' ', '_')}_model.pkl", model)

# Save the scaler and the label encoder.
_pickle_to('scaler.pkl', scaler)
_pickle_to('label_encoder.pkl', le)

print("\n✅ All best models, label encoder, and scaler saved.")


📊 Model Evaluation:

--- Random Forest ---
Accuracy: 0.9932

--- Decision Tree ---
Accuracy: 0.9864

--- KNN ---
Accuracy: 0.9750

--- Logistic Regression ---
Accuracy: 0.9705

✅ All best models, label encoder, and scaler saved.


In [43]:
from sklearn.ensemble import VotingClassifier

# Combine the four tuned estimators into a soft-voting ensemble, fit it on the
# training split and report its accuracy on the test split.
voters = [
    ('rf', rf_grid.best_estimator_),
    ('dt', dt_grid.best_estimator_),
    ('knn', knn_grid.best_estimator_),
    ('lr', lr_grid.best_estimator_),
]
ensemble = VotingClassifier(estimators=voters, voting='soft')
ensemble.fit(X_train, y_train)

ensemble_preds = ensemble.predict(X_test)
ensemble_acc = accuracy_score(y_test, ensemble_preds)

print(f"\n🧪 Ensemble Accuracy: {ensemble_acc:.4f}")


🧪 Ensemble Accuracy: 0.9864


In [42]:

# Pickle the fitted ensemble to the working directory.
ensemble_path = "ensemble_model.pkl"
with open(ensemble_path, "wb") as file:
    pickle.dump(ensemble, file)

print("\n💾 Ensemble model saved as 'ensemble_model.pkl'")


💾 Ensemble model saved as 'ensemble_model.pkl'
