In [275]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

In [276]:
# Read the soil dataset (path is relative to the notebook's working directory)
# and preview up to the first 30 rows.
df = pd.read_csv(filepath_or_buffer="../soil.csv")
df.head(n=30)

Unnamed: 0,Location,Soil_Type,Fertility_Index,Land_Use_Type,Average_Rainfall(mm),Temperature(°C),Crop_Suitability,Season,Satellite_Observation_Date,Remarks
0,Sylhet,Loamy,62,Agricultural,72,28.6,Wheat,Monsoon,2024-09-24,Requires attention
1,Dhaka,Sandy,63,Unused,118,23.8,Maize,Autumn,2024-01-31,Moderate potential
2,Rangpur,Peaty,51,Agricultural,106,32.0,Maize,Autumn,2024-03-11,Requires attention
3,Khulna,Sandy,67,Barren,336,31.6,Wheat,Autumn,2024-09-29,Low potential
4,Rangpur,Peaty,63,Agricultural,237,20.1,Rice,Winter,2024-04-01,Moderate potential
5,Rajshahi,Clay,58,Barren,345,29.3,Vegetables,Monsoon,2024-01-12,High potential
6,Rajshahi,Silt,68,Barren,324,33.8,Tea,Winter,2024-09-06,Low potential
7,Chattogram,Loamy,49,Agricultural,71,27.0,Jute,Summer,2024-08-05,Moderate potential
8,Barishal,Silt,64,Agricultural,157,22.1,Vegetables,Monsoon,2024-12-21,High potential
9,Dhaka,Sandy,61,Residential,301,27.9,Rice,Autumn,2024-03-01,Requires attention


In [277]:
# Parse the observation date once (unparseable entries become NaT because of
# errors='coerce'), store it back, then expose its calendar parts as columns.
observation_dates = pd.to_datetime(df['Satellite_Observation_Date'], errors='coerce')
df['Satellite_Observation_Date'] = observation_dates
date_parts = observation_dates.dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day

In [278]:
# Remove the 'Remarks' column, which is not used in the rest of the analysis.
df = df.drop(columns=['Remarks'])

In [280]:
# Encode the categorical text columns as integer codes.
#
# The previous implementation used Series.replace, which depends on pandas
# silently downcasting the object column to integers afterwards. pandas 2.2
# deprecates that downcast (FutureWarning); without it the columns would stay
# object dtype and no longer be treated as numeric downstream. Mapping each
# value through an explicit lookup lets pandas infer an integer dtype directly.
categorical_columns = ['Land_Use_Type', 'Season', 'Crop_Suitability']

# One lookup table per encoded column: original label -> integer code.
category_codes = {
    'Soil_Type': {'Clay': 1, 'Loamy': 2, 'Peaty': 3, 'Sandy': 4, 'Silt': 5},
    'Season': {'Summer': 1, 'Autumn': 2, 'Winter': 3, 'Monsoon': 4},
    'Land_Use_Type': {'Barren': 1, 'Residential': 2, 'Unused': 3, 'Agricultural': 4},
    'Crop_Suitability': {'Rice': 0, 'Tea': 1, 'Spices': 2, 'Wheat': 3, 'Jute': 4, 'Vegetables': 5, 'Maize': 6},
}

for column, codes in category_codes.items():
    # Values missing from the lookup (including already-encoded integers) are
    # passed through unchanged, so re-running this cell is harmless - the same
    # pass-through behaviour Series.replace had.
    df[column] = df[column].map(lambda label, codes=codes: codes.get(label, label))

In [281]:
# Preview up to the first 40 rows after date expansion and label encoding.
df.head(n=40)

Unnamed: 0,Location,Soil_Type,Fertility_Index,Land_Use_Type,Average_Rainfall(mm),Temperature(°C),Crop_Suitability,Season,Satellite_Observation_Date,Year,Month,Day
0,Sylhet,2,62,4,72,28.6,3,4,2024-09-24,2024,9,24
1,Dhaka,4,63,3,118,23.8,6,2,2024-01-31,2024,1,31
2,Rangpur,3,51,4,106,32.0,6,2,2024-03-11,2024,3,11
3,Khulna,4,67,1,336,31.6,3,2,2024-09-29,2024,9,29
4,Rangpur,3,63,4,237,20.1,0,3,2024-04-01,2024,4,1
5,Rajshahi,1,58,1,345,29.3,5,4,2024-01-12,2024,1,12
6,Rajshahi,5,68,1,324,33.8,1,3,2024-09-06,2024,9,6
7,Chattogram,2,49,4,71,27.0,4,1,2024-08-05,2024,8,5
8,Barishal,5,64,4,157,22.1,5,4,2024-12-21,2024,12,21
9,Dhaka,4,61,2,301,27.9,0,2,2024-03-01,2024,3,1


In [282]:
# Summary statistics (count, mean, min, quartiles, max, std) for the columns
# pandas includes by default; used here as a sanity check on the encoded data.
df.describe()

Unnamed: 0,Soil_Type,Fertility_Index,Land_Use_Type,Average_Rainfall(mm),Temperature(°C),Crop_Suitability,Season,Satellite_Observation_Date,Year,Month,Day
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000,2000.0,2000.0,2000.0
mean,2.9745,70.1045,2.501,223.136,27.33025,3.0195,2.4515,2024-07-02 02:06:43.200000,2024.0,6.55,15.2315
min,1.0,40.0,1.0,50.0,20.0,0.0,1.0,2024-01-01 00:00:00,2024.0,1.0,1.0
25%,2.0,54.0,1.0,137.0,23.5,1.0,1.0,2024-04-04 00:00:00,2024.0,4.0,8.0
50%,3.0,70.0,3.0,222.5,27.3,3.0,2.0,2024-07-03 00:00:00,2024.0,7.0,15.0
75%,4.0,86.0,3.0,308.0,31.0,5.0,3.0,2024-09-30 00:00:00,2024.0,9.0,23.0
max,5.0,100.0,4.0,400.0,35.0,6.0,4.0,2024-12-31 00:00:00,2024.0,12.0,31.0
std,1.397436,17.97699,1.111583,100.548543,4.341251,2.025135,1.122621,,0.0,3.412964,8.788488


In [283]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from sklearn.model_selection import cross_val_score

In [284]:
# Basic Random Forest Classifier
#
# Predicts the encoded Crop_Suitability label from six numeric / encoded
# feature columns.

X = df[['Fertility_Index', 'Average_Rainfall(mm)', 'Temperature(°C)', 'Month', 'Soil_Type', 'Season']]
y = df['Crop_Suitability']

# Hold out 20% of the rows as the test set, then split 20% of the remainder
# off as a validation set (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the model.
# max_features=None considers every feature at each split. The earlier value
# of 25 exceeded the six columns in X: scikit-learn releases that bounds-check
# this parameter reject it with a ValueError, and it can never mean more than
# "all features" anyway.
model = RandomForestClassifier(
    random_state=42,
    max_depth=9,
    n_estimators=100,
    max_leaf_nodes=9,
    max_features=None,
)
model.fit(X_train, y_train)

# Make predictions on the held-out test and validation sets
test_predictions = model.predict(X_test)
val_predictions = model.predict(X_val)

# Calculate accuracy (printed in the order: test, then validation)
test_accuracy = accuracy_score(y_test, test_predictions)
val_accuracy = accuracy_score(y_val, val_predictions)
print(test_accuracy)
print(val_accuracy)

# 10-fold cross-validation on the training portion only; cross_val_score fits
# fresh clones of the estimator, so the fitted `model` above is not modified.
scores = cross_val_score(model, X_train, y_train, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
print("Standard deviation of cross-validation scores: {:.2f}".format(scores.std()))



0.1025
0.121875
Cross-validation scores: [0.1640625 0.15625   0.125     0.171875  0.1640625 0.140625  0.1875
 0.203125  0.0859375 0.171875 ]
Average cross-validation score: 0.16
Standard deviation of cross-validation scores: 0.03


In [285]:
# SVM (Support Vector Machine) classifier
#
# Reuses the train / validation / test splits created for the Random Forest.

# Only the SVC class is imported. The previous `from sklearn import svm` was
# dead code: the module name was overwritten by the estimator assigned to
# `svm` below before it was ever used.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# NOTE(review): the features are passed unscaled and C is very small; confirm
# these hyperparameters are intentional.
svm = SVC(gamma=0.1, C=0.0002)

svm.fit(X_train, y_train)
val_predictions = svm.predict(X_val)
test_predictions = svm.predict(X_test)

val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Printed in the order: validation, then test
print(val_accuracy)
print(test_accuracy)

# 10-fold cross-validation on the training portion only
scores = cross_val_score(svm, X_train, y_train, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
print("Standard deviation of cross-validation scores: {:.2f}".format(scores.std()))

0.1125
0.16
Cross-validation scores: [0.1640625 0.1640625 0.1640625 0.15625   0.1640625 0.1640625 0.1640625
 0.1640625 0.1640625 0.1640625]
Average cross-validation score: 0.16
Standard deviation of cross-validation scores: 0.00


In [288]:
# Feed-forward neural network (Keras) for the 7-class crop-suitability task
import gc
from tensorflow.keras.regularizers import l2

# Seed the Python, NumPy and TensorFlow random generators so each run starts
# from the same random state, in line with the random_state=42 used for the
# train/test splits and the Random Forest. This is a global side effect.
tf.keras.utils.set_random_seed(42)

print(X_train.shape)

# Six input features -> three hidden ReLU layers with dropout -> 7 raw scores.
# The last layer has no activation, so the network outputs logits.
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape =(6,)),
  tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(7)
])

# from_logits=True matches the activation-free output layer above; the labels
# are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


class garbage_collect_callback(tf.keras.callbacks.Callback):
  """Keras callback that runs Python's garbage collector after every epoch."""

  def on_epoch_end(self, epoch, logs=None):
    gc.collect()

history = model.fit(X_train, # Train examples
          y_train, # Train labels
          epochs=50, # number of epochs (passes through data during training)
          batch_size= 200, # number of points to consider in each optimizer iteration
          callbacks = [garbage_collect_callback()],
          validation_data=(X_val, y_val), #data to use for validation
          verbose=1) #will print information about optimization process

# Despite its name, test_binary_pred holds the model's raw 7-way output
# (logits) for each test row, not binary predictions.
test_binary_pred = model.predict(X_test)
# evaluate() returns [loss, accuracy] given the metrics configured in compile()
scores = model.evaluate(X_test, y_test, verbose = 0)
print('\nTesting model on mock_test set:')
print(f'Model Loss: {scores[0]:.3f}, Model Accuracy: {scores[1]:.3f}')

(1280, 6)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Testing model on mock_test set:
Model Loss: 1.999, Model Accuracy: 0.112
