In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.svm import LinearSVR
from sklearn import linear_model
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from sklearn.metrics import (mean_squared_error as mse, r2_score, explained_variance_score,
                             max_error, mean_absolute_error, median_absolute_error)

REGRESSION

In [None]:
dataset = pd.read_csv("CO22339_Soil_Profile.csv")
X = dataset.iloc[:, 0:-1].values
y = dataset.iloc[:, -1].values

In [None]:
dataset.head()

Unnamed: 0,monocult,RYT_N,NUE
0,,1.141,377.234
1,,1.912,360.497
2,,1.568,307.339
3,,1.1,415.966
4,,1.066,409.18


In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X = imputer.fit_transform(X)
y = y.reshape(-1, 1)
y = imputer.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

In [None]:
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

In [None]:
# Final stacking layer
final_layer = StackingRegressor(
    estimators=[
        ('xgbr', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8)),
        ('sgdr', SGDRegressor(max_iter=3000, tol=1e-3)),
        ('dtr', DecisionTreeRegressor(max_depth=10, random_state=42)),
        ('adbr', AdaBoostRegressor(n_estimators=100, random_state=42)),
        ('gbrt', HistGradientBoostingRegressor(max_iter=300, learning_rate=0.1, random_state=42)),
        ('rf', RandomForestRegressor(n_estimators=200, max_depth=12, max_features='sqrt', random_state=42)),
        ('etr', ExtraTreesRegressor(n_estimators=100, max_depth=5, max_features=0.5, random_state=42))
    ],
    final_estimator=RidgeCV(),
    passthrough=True,
    n_jobs=-1
)

# Full model
multi_layer_regressor = StackingRegressor(
    estimators=[
        ('lr', RidgeCV()),
        ('lasso', LassoCV(random_state=42)),
        ('lrr', LinearRegression()),
        ('nn', MLPRegressor(hidden_layer_sizes=(250, 700, 102, 51),
                            max_iter=3500, activation='relu', solver='adam', alpha=0.01, random_state=42)),
        ('svr', SVR(C=1, gamma=1e-6, kernel='rbf'))
    ],
    final_estimator=final_layer,
    passthrough=False,
    n_jobs=-1
)

# Train and evaluate
multi_layer_regressor.fit(X_train, y_train)
y_pred = multi_layer_regressor.predict(X_test)

# De-normalize
y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()
y_true = scaler_y.inverse_transform(y_test.reshape(-1, 1)).ravel()

# Metrics
print("Mean Squared Error:", mse(y_true, y_pred))
print("R2 Score:", r2_score(y_true, y_pred))

Mean Squared Error: 1.1117931894745297
R2 Score: 0.009808028314353234


In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
n_samples = X_train.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
score=cross_val_score(multi_layer_regressor, X_train, y_train, cv=cv,scoring='neg_mean_squared_error', n_jobs=-1)
print(score)
print("Error: {:.2f} ".format(score.mean()))
print("Standard Deviation: {:.2f} %".format(score.std()*100))

[-0.92531307 -1.1548448  -1.08346811 -0.94802431 -0.87655476]
Error: -1.00 
Standard Deviation: 10.43 %


In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from time import time

cv = KFold(n_splits=5, shuffle=True, random_state=42)

param_dist = {
    'final_estimator__xgbr__max_depth': [5, 7, 10, 15],
    'final_estimator__xgbr__n_estimators': [100, 200, 300],
    'final_estimator__xgbr__learning_rate': [0.01, 0.05, 0.1],

    'final_estimator__etr__max_features': [0.5, 0.75, 1.0],
    'final_estimator__etr__max_depth': [5, 10, 15],
    'final_estimator__etr__n_estimators': [100, 150],

    'final_estimator__adbr__n_estimators': [50, 100, 200],

    'svr__gamma': ['scale', 'auto', 1e-6, 1e-5],
    'svr__C': [0.1, 1, 10]
}

n_iter_search = 15
random_search = RandomizedSearchCV(
    estimator=multi_layer_regressor,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42

)

In [None]:
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
      % ((time() - start), n_iter_search))

# 5. Show best parameters and score
print("Best parameters found:\n", random_search.best_params_)
print("Best CV score (negative MSE):", random_search.best_score_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
RandomizedSearchCV took 3391.65 seconds for 15 candidates parameter settings.
Best parameters found:
 {'svr__gamma': 1e-05, 'svr__C': 10, 'final_estimator__xgbr__n_estimators': 200, 'final_estimator__xgbr__max_depth': 15, 'final_estimator__xgbr__learning_rate': 0.01, 'final_estimator__etr__n_estimators': 100, 'final_estimator__etr__max_features': 0.75, 'final_estimator__etr__max_depth': 10, 'final_estimator__adbr__n_estimators': 100}
Best CV score (negative MSE): -0.9894397157836157


In [None]:
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_pred_best = scaler_y.inverse_transform(y_pred_best.reshape(-1, 1)).ravel()
y_true = scaler_y.inverse_transform(y_test.reshape(-1, 1)).ravel()

print("Test MSE:", mse(y_true, y_pred_best))
print("Test R2:", r2_score(y_true, y_pred_best))
print(f"Mean Squared Error is {mse(y_test,y_pred_best)}")
print(f"R2 Score is {r2_score(y_test,y_pred_best)}")
print(f"Explained Variance Score is {explained_variance_score(y_test,y_pred_best)}")
print(f"Maximum Error is {max_error(y_test,y_pred_best)}")
print(f"Mean Absolute Error is {mean_absolute_error(y_test,y_pred_best)}")
print(f"median_absolute_error is {median_absolute_error(y_test,y_pred_best)}")

Test MSE: 1.1273349363949834
Test R2: -0.004033856284595405
Mean Squared Error is 1.1273349363949834
R2 Score is -0.004033856284595405
Explained Variance Score is 0.0017279758973095527
Maximum Error is 3.3752702658677127
Mean Absolute Error is 0.8676402122554224
median_absolute_error is 0.760008202633063


In [None]:
y_pred0 = multi_layer_regressor.predict(X_test)
y_pred0_restored=scaler_y.inverse_transform(y_pred0.reshape(-1, 1))
#mse(y_test, y_pred)
from sklearn.metrics import r2_score
print(f"Mean Squared Error is {mse(y_test,y_pred0_restored)}")
print(f"R2 Score is {r2_score(y_test,y_pred0)}")
print(f"Expained Variance Score is {explained_variance_score(y_test,y_pred0_restored)}")
print(f"Maximum Error is {max_error(y_test,y_pred0_restored)}")
print(f"Mean Absolute Error is {mean_absolute_error(y_test,y_pred0_restored)}")
print(f"median_absolute_error is {median_absolute_error(y_test,y_pred0_restored)}")

Mean Squared Error is 1.1117931894745297
R2 Score is 0.009808028314353234
Expained Variance Score is 0.018665340818070253
Maximum Error is 3.340248756777224
Mean Absolute Error is 0.8635835357151185
median_absolute_error is 0.7585298876366676


In [None]:
# Modified Model
final_layer = StackingRegressor(
    estimators=[
        ('xgbr', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5,
                              subsample=0.8, colsample_bytree=0.8, random_state=42)),
        ('sgdr', SGDRegressor(max_iter=3000, tol=1e-3)),
        ('dtr', DecisionTreeRegressor(max_depth=10, random_state=42)),
        ('adbr', AdaBoostRegressor(n_estimators=100, random_state=42)),
        ('gbrt', HistGradientBoostingRegressor(max_iter=300, learning_rate=0.1, random_state=42)),
        ('rf', RandomForestRegressor(n_estimators=200, max_depth=12, max_features='sqrt', random_state=42)),
        ('etr', ExtraTreesRegressor(n_estimators=100, max_depth=5, max_features=0.5, random_state=42))
    ],
    final_estimator=RidgeCV(),
    passthrough=True,
    n_jobs=-1
)

multi_layer_regressor = StackingRegressor(
    estimators=[
        ('lr', RidgeCV()),
        ('lasso', LassoCV(random_state=42)),
        ('lrr', LinearRegression()),
        ('nn', MLPRegressor(hidden_layer_sizes=(250, 700, 102, 51),
                            max_iter=3500, activation='relu', solver='adam', alpha=0.01, random_state=42)),
        ('svr', SVR(C=1, gamma=1e-6, kernel='rbf'))
    ],
    final_estimator=final_layer,
    passthrough=False,
    n_jobs=-1
)

multi_layer_regressor.fit(X_train, y_train)


In [None]:
y_pred = multi_layer_regressor.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()
y_true = scaler_y.inverse_transform(y_test.reshape(-1, 1)).ravel()
print(f"Mean Squared Error is {mse(y_test,y_pred)}")
print(f"R2 Score is {r2_score(y_test,y_pred)}")
print(f"Expained Variance Score is {explained_variance_score(y_test,y_pred)}")
print(f"Maximum Error is {max_error(y_test,y_pred)}")
print(f"Mean Absolute Error is {mean_absolute_error(y_test,y_pred)}")
print(f"median_absolute_error is {median_absolute_error(y_test,y_pred)}")

Mean Squared Error is 1.1154170493328706
R2 Score is 0.006580524339500737
Expained Variance Score is 0.014724699433956712
Maximum Error is 3.3789382392701826
Mean Absolute Error is 0.8647588081542701
median_absolute_error is 0.7700148414465199


CLASSIFICATION

In [None]:
import pandas as pd
from sklearn.ensemble import (
    StackingClassifier, RandomForestClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifierCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_score, recall_score, f1_score
)

In [None]:
dataset1 = pd.read_csv('CO22339soiLMEASURES.csv')
X1 = dataset1.iloc[:, 0:-1].values
y1 = dataset1.iloc[:, -1].values

In [None]:
dataset1.head()

Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [None]:
from sklearn.preprocessing import LabelEncoder
Labelencoder = LabelEncoder()
y1 = Labelencoder.fit_transform(y1)

In [None]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X1_train = sc.fit_transform(X1_train)
X1_test = sc.transform(X1_test)

In [None]:
final_layer = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5,
                              subsample=0.8, colsample_bytree=0.8,
                              use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('sgd', SGDClassifier(max_iter=3000, tol=1e-3, random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=10, random_state=42)),
        ('ada', AdaBoostClassifier(n_estimators=100, random_state=42)),
        ('hgb', HistGradientBoostingClassifier(max_iter=300, learning_rate=0.1, random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=12, max_features='sqrt', random_state=42)),
        ('et', ExtraTreesClassifier(n_estimators=100, max_depth=5, max_features=0.5, random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=True,
    n_jobs=-1
)


multi_layer_classifier = StackingClassifier(
    estimators=[
        ('ridge', RidgeClassifierCV()),
        ('logreg', LogisticRegression(max_iter=1000)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(250, 700, 102, 51),
                              max_iter=3000, activation='relu', solver='adam', alpha=0.01, random_state=42)),
        ('svc', SVC(C=1, gamma=1e-6, kernel='rbf', probability=True))
    ],
    final_estimator=final_layer,
    passthrough=False,
    n_jobs=-1
)

multi_layer_classifier.fit(X1_train, y1_train)
y1_pred = multi_layer_classifier.predict(X1_test)

In [None]:
print("Accuracy:", accuracy_score(y1_test, y1_pred))
print("Precision:", precision_score(y1_test, y1_pred, average='weighted', zero_division=0))
print("Recall:", recall_score(y1_test, y1_pred, average='weighted'))
print("F1 Score:", f1_score(y1_test, y1_pred, average='weighted'))
print("\nConfusion Matrix:\n", confusion_matrix(y1_test, y1_pred))
print("\nClassification Report:\n", classification_report(y1_test, y1_pred))

Accuracy: 0.9688
Precision: 0.9696479464882944
Recall: 0.9688
F1 Score: 0.9687893416074405

Confusion Matrix:
 [[314   0   0   0]
 [  0 296   0  10]
 [  0   0 312   0]
 [  0  29   0 289]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       314
           1       0.91      0.97      0.94       306
           2       1.00      1.00      1.00       312
           3       0.97      0.91      0.94       318

    accuracy                           0.97      1250
   macro avg       0.97      0.97      0.97      1250
weighted avg       0.97      0.97      0.97      1250



In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_dist = {
    'mlp__hidden_layer_sizes': [(100,), (250, 150), (250, 700, 100)],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [1e-3, 1e-4, 1e-6]
}

search = RandomizedSearchCV(
    estimator=multi_layer_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=cv,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

search.fit(X1_train, y1_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [None]:
print(search.best_params_)

best_model = search.best_estimator_

y1_pred = best_model.predict(X1_test)

# print(classification_report(y1_test, y1_pred))
# print("Accuracy:", accuracy_score(y1_test, y1_pred))

{'svc__gamma': 0.0001, 'svc__C': 1, 'mlp__hidden_layer_sizes': (250, 150)}


In [None]:
# Final estimator layer
final_layer = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5,
                              subsample=0.8, colsample_bytree=0.8,
                              use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('sgd', SGDClassifier(max_iter=3000, tol=1e-3, random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=10, random_state=42)),
        ('ada', AdaBoostClassifier(n_estimators=100, random_state=42)),
        ('hgb', HistGradientBoostingClassifier(max_iter=300, learning_rate=0.1, random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=12, max_features='sqrt', random_state=42)),
        ('et', ExtraTreesClassifier(n_estimators=100, max_depth=5, max_features=0.5, random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=True,
    n_jobs=-1
)

# Updated multi-layer stack with tuned hyperparameters
multi_layer_classifier = StackingClassifier(
    estimators=[
        ('ridge', RidgeClassifierCV()),
        ('logreg', LogisticRegression(max_iter=1000)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(250, 150),  # <-- Updated here
                              max_iter=3000, activation='relu', solver='adam', alpha=0.01, random_state=42)),
        ('svc', SVC(C=1, gamma=0.0001, kernel='rbf', probability=True))  # <-- Updated here
    ],
    final_estimator=final_layer,
    passthrough=False,
    n_jobs=-1
)

# Fit the model and predict
multi_layer_classifier.fit(X1_train, y1_train)
y1_pred0 = multi_layer_classifier.predict(X1_test)


In [None]:
print("Accuracy:", accuracy_score(y1_test, y1_pred0))
print("Precision:", precision_score(y1_test, y1_pred0, average='weighted', zero_division=0))
print("Recall:", recall_score(y1_test, y1_pred0, average='weighted'))
print("F1 Score:", f1_score(y1_test, y1_pred0, average='weighted'))
print("\nConfusion Matrix:\n", confusion_matrix(y1_test, y1_pred0))
print("\nClassification Report:\n", classification_report(y1_test, y1_pred0))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Confusion Matrix:
 [[314   0   0   0]
 [  0 306   0   0]
 [  0   0 312   0]
 [  0   0   0 318]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       314
           1       1.00      1.00      1.00       306
           2       1.00      1.00      1.00       312
           3       1.00      1.00      1.00       318

    accuracy                           1.00      1250
   macro avg       1.00      1.00      1.00      1250
weighted avg       1.00      1.00      1.00      1250

