In [211]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib

In [212]:
# Load dataset (replace with your crop dataset)
data = pd.read_csv("D:/RBL PROJECT/Crop--Prediction-ML--main/CP/datasets/crop_production.csv")  # Example dataset

In [213]:
# Data Cleaning
data_clean = data.replace([np.inf, -np.inf], np.nan).dropna(subset=['Production'])
print(f"Remaining rows after cleaning: {len(data_clean)}")

Remaining rows after cleaning: 242361


In [214]:
# Feature Engineering
X = data_clean[['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area']]
y = data_clean['Production']

In [215]:
# Encode categorical columns
label_encoders = {}
for column in ['State_Name', 'District_Name', 'Season', 'Crop']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [216]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [217]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost Model
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [218]:
# Train model
model.fit(X_train_scaled, y_train)

# Evaluation
y_pred = model.predict(X_test_scaled)
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")


MAE: 289099.49
RMSE: 5816936.31
R²: 0.92


In [219]:
# Save artifacts
joblib.dump(model, "xgboost_production_model.joblib")
joblib.dump(label_encoders, "label_encoders.joblib")
joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [220]:
# Crop Recommendation Function
def recommend_best_crop(state, district, year, season, area):
    """Predicts production for all crops and recommends the best one"""
    # Get unique crops
    unique_crops = data_clean['Crop'].unique()
    
    # Prepare input for all crops
    test_data = pd.DataFrame({
        'State_Name': [state] * len(unique_crops),
        'District_Name': [district] * len(unique_crops),
        'Crop_Year': [year] * len(unique_crops),
        'Season': [season] * len(unique_crops),
        'Crop': unique_crops,
        'Area': [area] * len(unique_crops)
    })

In [221]:
 # Encode and scale
for column in ['State_Name', 'District_Name', 'Season', 'Crop']:
    le = label_encoders[column]
    test_data[column] = test_data[column].apply(
        lambda x: x if x in le.classes_ else 'unknown'
    )
    test_data[column] = le.transform(test_data[column])
    
test_scaled = scaler.transform(test_data)

NameError: name 'test_data' is not defined

In [None]:
 # Predict production for all crops
    predictions = model.predict(test_scaled)
    
    # Create results dataframe
    results = pd.DataFrame({
        'Crop': unique_crops,
        'Predicted_Production': predictions
    })

In [None]:
# Sort by predicted production (descending)
    results = results.sort_values('Predicted_Production', ascending=False)
    
    return results

Mean Absolute Error (MAE): 289099.49
Root Mean Squared Error (RMSE): 5816936.31
R² Score: 0.92


In [None]:
# Example Usage
if __name__ == "__main__":
    # Example prediction
    test_input = pd.DataFrame({
        'State_Name': ['Punjab'],
        'District_Name': ['Amritsar'],
        'Crop_Year': [2023],
        'Season': ['Kharif'],
        'Crop': ['Rice'],
        'Area': [5000]
    })
    

Model saved successfully!


In [None]:
 # Single prediction
    for column in ['State_Name', 'District_Name', 'Season', 'Crop']:
        le = label_encoders[column]
        test_input[column] = test_input[column].apply(
            lambda x: x if x in le.classes_ else 'unknown'
        )
        test_input[column] = le.transform(test_input[column])
    
    test_scaled = scaler.transform(test_input)
    prediction = model.predict(test_scaled)
    print(f"\nPredicted Production for Rice: {prediction[0]:.2f} units")

In [None]:
# Crop recommendation
    recommendations = recommend_best_crop(
        state='Punjab',
        district='Amritsar',
        year=2023,
        season='Kharif',
        area=5000
    )
    
    print("\nTop 5 Crop Recommendations:")
    print(recommendations.head(5).to_string(index=False))