In [22]:
import pandas as pd
import xgboost as xg 
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('Crop_recommendation.csv')

data


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
1692,117,86,48,28.695620,82.541958,6.225225,116.161684,banana
1693,114,94,53,26.335449,76.853201,6.190757,118.685826,banana
1694,110,78,50,25.937302,78.898644,5.915569,98.217475,banana
1695,94,70,48,25.136865,84.883944,6.195152,91.464425,banana


In [3]:
data['temperature'] = data['temperature'].astype(int)
data['humidity'] = data['humidity'].astype(int)
data['ph'] = data['ph'].astype(int)
data['rainfall'] = data['rainfall'].astype(int)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   N            1697 non-null   int64 
 1   P            1697 non-null   int64 
 2   K            1697 non-null   int64 
 3   temperature  1697 non-null   int64 
 4   humidity     1697 non-null   int64 
 5   ph           1697 non-null   int64 
 6   rainfall     1697 non-null   int64 
 7   label        1697 non-null   object
dtypes: int64(7), object(1)
memory usage: 106.2+ KB


In [4]:
data['label'].unique()

array(['rice', 'maize', 'Soyabeans', 'beans', 'peas', 'groundnuts',
       'cowpeas', 'banana', 'mango', 'grapes', 'watermelon', 'apple',
       'orange', 'cotton', 'coffee'], dtype=object)

In [5]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

data['label']


0       13
1       13
2       13
3       13
4       13
        ..
1692     2
1693     2
1694     2
1695     2
1696     2
Name: label, Length: 1697, dtype: int64

In [6]:
data.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
count,1697.0,1697.0,1697.0,1697.0,1697.0,1697.0,1697.0,1697.0
mean,52.647024,58.126105,52.031232,24.221567,65.338244,5.908662,98.875074,6.91043
std,38.536964,34.164342,57.068796,4.90888,24.437587,0.91915,50.432268,4.380918
min,0.0,5.0,5.0,8.0,14.0,3.0,5.0,0.0
25%,21.0,36.0,19.0,21.0,51.0,5.0,66.0,3.0
50%,37.0,54.0,27.0,24.0,77.0,6.0,93.0,7.0
75%,90.0,72.0,52.0,27.0,83.0,6.0,115.0,11.0
max,140.0,145.0,205.0,41.0,94.0,9.0,298.0,14.0


In [7]:
data[
    data.duplicated(subset=["temperature", "humidity", "ph", "rainfall"], keep=False)
].sort_values(by=["temperature", "humidity", "ph", "rainfall"])

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
911,27,145,205,9,82,5,66,7
999,35,134,204,9,82,5,66,7
1213,19,7,10,14,91,6,100,11
1651,19,7,10,14,91,6,100,11
334,1,62,23,15,18,5,139,3
...,...,...,...,...,...,...,...,...
1650,6,9,12,31,90,7,109,11
833,12,34,28,33,45,6,98,10
864,39,37,25,33,45,6,98,10
1216,31,8,7,34,93,7,103,11


In [8]:
data = data.drop_duplicates(
    subset=["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]
)
data.shape

(1594, 8)

In [27]:
x = data.drop(["label", "N", "K", "humidity"],axis=1) #"N", "K", "humidity",
y = data['label']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.10,shuffle=True)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1434, 4)
(160, 4)
(1434,)
(160,)


In [28]:
crop_xg_model = xg.XGBClassifier(
    objective='multi:softprob',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)

rf_model = RandomForestClassifier(n_estimators=100)

rf_model.fit(x_train, y_train)

In [29]:
preds = rf_model.predict(x_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         8
           3       0.78      0.78      0.78         9
           4       1.00      1.00      1.00        11
           5       0.71      0.83      0.77        12
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00        15
           8       0.92      1.00      0.96        11
           9       0.71      0.56      0.62         9
          10       1.00      1.00      1.00        11
          11       1.00      1.00      1.00        14
          12       0.80      0.80      0.80        10
          13       1.00      1.00      1.00        14
          14       1.00      1.00      1.00        10

    accuracy                           0.93       160
   macro avg       0.93      0.93      0.93       160
weighted avg       0.93   

In [30]:
model_accuracy = 100 * rf_model.score(x_test, y_test)

print(f'The model has an accuracy of {model_accuracy:.2f}%')

The model has an accuracy of 93.12%


In [31]:
import joblib

joblib.dump(rf_model, 'crop_model.pkl')

['crop_model.pkl']

In [21]:
print(le.classes_)

# from typing import List

# crops: List = ['Soyabeans', 'apple', 'banana', 'beans', 'coffee', 'cotton', 'cowpeas', 'grapes',
#  'groundnuts', 'maize', 'mango', 'orange', 'peas', 'rice', 'watermelon']

# crops_dict = dict(enumerate(crops))
# crops_dict

['Soyabeans' 'apple' 'banana' 'beans' 'coffee' 'cotton' 'cowpeas' 'grapes'
 'groundnuts' 'maize' 'mango' 'orange' 'peas' 'rice' 'watermelon']
