In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,mean_squared_error,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Load Dataset

In [None]:
# Load the crop recommendation CSV from the Kaggle input directory into a DataFrame.
# `data` is the single working frame that every later cell reads and modifies.
data=pd.read_csv('/kaggle/input/crop-recommendation-dataset/Crop_recommendation.csv')

In [None]:
# Show the first 5 rows of the dataset
data.head()

In [None]:
# Check the shape of the dataset as (number of rows, number of columns)
data.shape

In [None]:
# List the column names of the dataset
data.columns 

In [None]:
# Summary of the dataset: column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Count the fully duplicated rows in the dataset
data.duplicated().sum()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns
data.describe()

In [None]:
# List the distinct values of the target column `label`
data['label'].unique()

In [None]:
# Count the rows per `label` value to see how balanced the classes are
data['label'].value_counts()

In [None]:
# Histogram with a KDE overlay for each numerical feature, one panel per feature on a 3x3 grid
num_features = ['N','P','K','temperature','humidity','ph','rainfall']

fig = plt.figure(figsize=(15,10))
for i, col in enumerate(num_features, 1):
    ax = fig.add_subplot(3, 3, i)
    sns.histplot(data[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap
corr_matrix = data[num_features].corr()
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Features")
plt.show()

In [None]:
# One boxplot per numerical feature (3x3 grid) to spot potential outliers
fig = plt.figure(figsize=(15,10))
for i, col in enumerate(num_features, 1):
    ax = fig.add_subplot(3, 3, i)
    sns.boxplot(y=data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Average N, P and K values per crop, shown as a grouped bar chart

avg_values = data.groupby("label")[['N','P','K']].mean()
# Plot on an explicit Axes: DataFrame.plot opens its own figure unless it is given
# `ax`, so a separate plt.figure() call beforehand only produces an empty extra figure.
fig, ax = plt.subplots(figsize=(14,6))
avg_values.plot(kind="bar", ax=ax)
ax.set_title("Average NPK values per Crop")
ax.set_ylabel("Mean Value")
plt.show()

In [None]:
# Pairplot of N, P, K and temperature coloured by crop label.
# A fixed 500-row random sample is used to keep the figure light; the seed makes it reproducible.
sample_df = data.sample(n=500, random_state=42)
sns.pairplot(data=sample_df, vars=['N','P','K','temperature'], hue="label")
plt.show()

# Add some new columns (feature engineering)

In [None]:
def feature_engineer(data):
    """Add derived feature columns to ``data`` and return it.

    The frame is modified IN PLACE and the same object is returned; later
    cells rely on the new columns being present on the frame passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the numeric columns ``N``, ``P``, ``K``,
        ``temperature``, ``humidity``, ``ph`` and ``rainfall``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with these columns added:

        - ``NPK``: mean of ``N``, ``P`` and ``K``.
        - ``THI``: ``temperature * humidity / 100``
          (presumably a temperature-humidity index).
        - ``rainfall_level``: ordered categorical with bins
          [0, 50] 'Low', (50, 100] 'Medium', (100, 200] 'High',
          (200, inf) 'Very High'. Negative or missing rainfall gives NaN.
        - ``ph_category``: 'Acidic' (ph < 5.5), 'Neutral' (5.5 <= ph <= 7.5),
          'Alkaline' (ph > 7.5). Missing pH stays missing.
        - ``temp_rain_interaction``: ``temperature * rainfall``.
        - ``ph_rain_interaction``: ``ph * rainfall``.
    """
    data['NPK'] = (data['N'] + data['P'] + data['K']) / 3
    data['THI'] = data['temperature'] * data['humidity'] / 100

    # Open-ended top bin and include_lowest=True so that rainfall of exactly 0
    # or above 300 is still assigned a level instead of silently becoming NaN.
    data['rainfall_level'] = pd.cut(
        data['rainfall'],
        bins=[0, 50, 100, 200, np.inf],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True,
    )

    # Vectorized pH bucketing. Comparisons with NaN are False, so without the
    # final mask a missing pH would fall through to the 'Alkaline' default.
    ph = data['ph']
    ph_labels = np.select(
        [ph < 5.5, ph <= 7.5],
        ['Acidic', 'Neutral'],
        default='Alkaline',
    )
    data['ph_category'] = pd.Series(ph_labels, index=data.index, dtype=object).where(ph.notna())

    data['temp_rain_interaction'] = data['temperature'] * data['rainfall']
    data['ph_rain_interaction'] = data['ph'] * data['rainfall']

    return data

In [None]:
# Apply the feature engineering. feature_engineer adds the columns to `data` itself
# and returns it, so `data_fe` and `data` refer to the same DataFrame.
data_fe = feature_engineer(data)
data_fe.head()

In [None]:
# Encode the string/categorical columns as integer codes.
# Each column gets its own LabelEncoder: refitting one shared encoder overwrites
# its classes_, which would leave no way to map predicted label codes back to
# crop names. `ec` stays fitted on the target, so ec.inverse_transform(y_pred)
# recovers the crop names.
ec=LabelEncoder()
data['label']=ec.fit_transform(data['label'])
data['rainfall_level']=LabelEncoder().fit_transform(data['rainfall_level'])
data['ph_category']=LabelEncoder().fit_transform(data['ph_category'])

In [None]:
# Separate the predictors from the target: X holds every column except `label`
# (original and engineered features), y holds the encoded crop label.
X = data.drop(columns='label')
y = data['label']

In [None]:
# Divide the data into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions the same in both sets, so every crop is
# represented in the test set; the fixed random_state makes the split reproducible.
# NOTE: stratification requires at least 2 rows per class.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
# Standardize the features. The scaler is fitted on the training data only and
# then applied to both sets, so no test-set statistics leak into training.
# After this cell X_train / X_test are NumPy arrays, not DataFrames.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Train the RandomForestClassifier.
# random_state is fixed so the forest (and every score below) is reproducible
# across runs. NOTE(review): n_estimators=5 is a very small forest — confirm this is intentional.
rfc=RandomForestClassifier(n_estimators=5,random_state=42)
rfc.fit(X_train,y_train)

In [None]:
# Accuracy of the random forest on the data it was trained on; this is not a
# measure of generalization — compare it with the test-set accuracy.
rfc.score(X_train,y_train)

In [None]:
# Predict the encoded crop labels for the scaled test features with the random forest
y_pred_rfc=rfc.predict(X_test)

In [None]:
# Evaluate the random forest on the held-out test set.
# Mean squared error is not reported: the label codes are arbitrary integers, so
# the numeric distance between two classes carries no meaning. The confusion
# matrix shows which classes are mistaken for which instead.
print('accuracy score:',accuracy_score(y_test,y_pred_rfc))
print('confusion matrix:')
print(confusion_matrix(y_test,y_pred_rfc))
# Start the report on its own line so its column header stays aligned
print('classification report:')
print(classification_report(y_test,y_pred_rfc))

In [None]:
# Train the Support Vector Machine model with an RBF kernel.
# SVC is imported in the import cell at the top of the notebook together with
# the other scikit-learn imports.
svm=SVC(kernel='rbf')
svm.fit(X_train,y_train)

In [None]:
# Predict the encoded crop labels for the scaled test features with the SVM
y_pred_svm=svm.predict(X_test)

In [None]:
# Evaluate the SVM on the held-out test set.
# Mean squared error is not reported: the label codes are arbitrary integers, so
# the numeric distance between two classes carries no meaning. The confusion
# matrix shows which classes are mistaken for which instead.
print('accuracy score:',accuracy_score(y_test,y_pred_svm))
print('confusion matrix:')
print(confusion_matrix(y_test,y_pred_svm))
# Start the report on its own line so its column header stays aligned
print('classification report:')
print(classification_report(y_test,y_pred_svm))

In [None]:
# Train the XGBClassifier model on the scaled training data.
# random_state is fixed so the fitted model is reproducible across runs.
xgb=XGBClassifier(random_state=42)
xgb.fit(X_train,y_train)

In [None]:
# Predict the encoded crop labels for the scaled test features with XGBoost
y_pred_xgb=xgb.predict(X_test)

In [None]:
# Evaluate the XGBoost model on the held-out test set.
# Mean squared error is not reported: the label codes are arbitrary integers, so
# the numeric distance between two classes carries no meaning. The confusion
# matrix shows which classes are mistaken for which instead.
print('accuracy score:',accuracy_score(y_test,y_pred_xgb))
print('confusion matrix:')
print(confusion_matrix(y_test,y_pred_xgb))
# Start the report on its own line so its column header stays aligned
print('classification report:')
print(classification_report(y_test,y_pred_xgb))