In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Constants
# Fixed seed value. It is not referenced in the cells visible here —
# presumably consumed by later sampling/modelling cells (TODO confirm).
SEED = 42

In [10]:
# Read the train and test CSV files into DataFrames (train first, then test).
train_file_path, test_file_path = "dataset/train.csv", "dataset/test.csv"
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)
# `df` is a second name for the very same object as `train_df`; no copy is made.
df = train_df

# DATA EXPLORATION

In [12]:
# Basic Info about Dataset
print("\n🔹 Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


🔹 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB
None


In [13]:
# Report, for every column, how many entries are missing.
print("\n🔍 Missing Values Count:", df.isna().sum(), sep="\n")


🔍 Missing Values Count:
id                 0
Temparature        0
Humidity           0
Moisture           0
Soil Type          0
Crop Type          0
Nitrogen           0
Potassium          0
Phosphorous        0
Fertilizer Name    0
dtype: int64


In [14]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as produced
# by DataFrame.describe() with its default settings.
numeric_summary = df.describe()
print("\n📊 Summary Statistics (Numerical Features):")
print(numeric_summary)


📊 Summary Statistics (Numerical Features):
                  id    Temparature       Humidity       Moisture  \
count  750000.000000  750000.000000  750000.000000  750000.000000   
mean   374999.500000      31.503565      61.038912      45.184147   
std    216506.495284       4.025574       6.647695      11.794594   
min         0.000000      25.000000      50.000000      25.000000   
25%    187499.750000      28.000000      55.000000      35.000000   
50%    374999.500000      32.000000      61.000000      45.000000   
75%    562499.250000      35.000000      67.000000      55.000000   
max    749999.000000      38.000000      72.000000      65.000000   

            Nitrogen      Potassium    Phosphorous  
count  750000.000000  750000.000000  750000.000000  
mean       23.093808       9.478296      21.073227  
std        11.216125       5.765622      12.346831  
min         4.000000       0.000000       0.000000  
25%        13.000000       4.000000      10.000000  
50%        23.00

In [16]:
# Encode the fertilizer names as integer class ids in a new column.
# The fitted encoder is kept under a dedicated name so the ids can still be
# mapped back to fertilizer names (target_encoder.inverse_transform) even if
# the generic name `le` is rebound to a different encoder by a later cell.
target_encoder = LabelEncoder()
train_df['Fertilizer_name_encoded'] = target_encoder.fit_transform(
    train_df['Fertilizer Name'].astype(str)
)
# Backward-compatible alias: code that refers to `le` keeps working.
le = target_encoder

In [22]:
# Display the training frame, which now carries the 'Fertilizer_name_encoded'
# column added by the target-encoding cell.
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,Fertilizer_name_encoded
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28,4
1,1,27,69,65,Sandy,Millets,30,6,18,28-28,4
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17,2
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26,0
4,4,35,58,43,Red,Paddy,37,2,16,DAP,5
...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28,4
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17,2
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26,0
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20,3


In [28]:
def _is_label_column(series):
    """Return True when the series still holds raw labels (object or category dtype)."""
    return series.dtype == 'object' or str(series.dtype) == 'category'


# One encoder per categorical feature, fitted on the values of train AND test
# together, so a given label maps to the same integer in both frames.
# Fitting a separate encoder per frame only yields matching codes when both
# frames contain exactly the same set of labels.
# Dedicated names are used here so the `le` encoder fitted on the fertilizer
# target earlier in the notebook is not overwritten.
feature_encoders = {}
for col in ['Soil Type', 'Crop Type']:
    # Frames whose column is already numeric (e.g. the cell was run before)
    # are skipped, which keeps the cell safe to re-run.
    pending_frames = [frame for frame in (train_df, test_df) if _is_label_column(frame[col])]
    if not pending_frames:
        continue

    feature_encoder = LabelEncoder()
    feature_encoder.fit(
        pd.concat(
            [frame[col].astype(str) for frame in pending_frames],
            ignore_index=True,
        )
    )
    for frame in pending_frames:
        frame[col] = feature_encoder.transform(frame[col].astype(str))

    # Keep the fitted encoder so the integer codes can be decoded later.
    feature_encoders[col] = feature_encoder



In [29]:
# Display the training frame again; 'Soil Type' and 'Crop Type' hold integer
# codes once the feature-encoding cell above has run.
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name,Fertilizer_name_encoded
0,0,37,70,36,1,8,36,4,5,28-28,4
1,1,27,69,65,4,4,30,6,18,28-28,4
2,2,29,63,32,4,4,24,12,16,17-17-17,2
3,3,35,62,54,4,0,39,12,4,10-26-26,0
4,4,35,58,43,3,6,37,2,16,DAP,5
...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,1,3,8,16,6,28-28,4
749996,749996,37,64,58,2,8,38,8,20,17-17-17,2
749997,749997,35,68,59,4,2,6,11,29,10-26-26,0
749998,749998,31,68,29,3,1,9,11,12,20-20,3


In [30]:
# Modelling inputs. `train_model` / `test_model` are plain aliases of the
# loaded frames (no copies are made).
train_model, test_model = train_df, test_df

# Target vector: the integer-encoded fertilizer label.
Y_target = train_model['Fertilizer_name_encoded']

# Feature matrix: every remaining column once both target columns are removed.
# NOTE(review): the 'id' column stays in both train_X and test_X — confirm
# that using the row identifier as a feature is intended.
train_X = train_model.drop(['Fertilizer Name', 'Fertilizer_name_encoded'], axis=1)
test_X = test_model