In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
# Load your data
data = pd.read_excel('/Users/likhithkanigolla/IIITH/code-files/Digital-Twin/ZF/Soil_test/Soil Data.xlsx')

# Handle missing values (if any)
# data.fillna(method='ffill', inplace=True) # Example method to fill missing values

# Separate features and target
X = data.drop(columns=['entry_id', 'tdsValue', 'Voltage','tdsValue_without_temp'])
y = data['tdsValue']

# Encode categorical variables if necessary
# X = pd.get_dummies(X, drop_first=True) # Example for one-hot encoding

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train the model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluate the model
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print(f'Linear Regression - MAE: {lr_mae}, MSE: {lr_mse}, R²: {lr_r2}')

Linear Regression - MAE: 10.291481541374573, MSE: 1096.150488183664, R²: 0.34554971002965584


## Random Forest Regressor

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate the model
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print(f'Random Forest - MAE: {rf_mae}, MSE: {rf_mse}, R²: {rf_r2}')

Random Forest - MAE: 10.016005797262086, MSE: 1043.0483177500867, R²: 0.377254052830155


## Gradient Boosting Regressor

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Train the model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_gb = gb_model.predict(X_test_scaled)

# Evaluate the model
gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_mse = mean_squared_error(y_test, y_pred_gb)
gb_r2 = r2_score(y_test, y_pred_gb)

print(f'Gradient Boosting - MAE: {gb_mae}, MSE: {gb_mse}, R²: {gb_r2}')

Gradient Boosting - MAE: 9.6912412581883, MSE: 1102.4847677608416, R²: 0.34176786515459123


## Support Vector Regressor

In [8]:
from sklearn.svm import SVR

# Train the model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_svr = svr_model.predict(X_test_scaled)

# Evaluate the model
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_r2 = r2_score(y_test, y_pred_svr)

print(f'Support Vector Regressor - MAE: {svr_mae}, MSE: {svr_mse}, R²: {svr_r2}')

Support Vector Regressor - MAE: 9.130894346435898, MSE: 1090.042209654508, R²: 0.3491966222900791


## Neural Network MLP Regressor

In [9]:
from sklearn.neural_network import MLPRegressor

# Train the model
nn_model = MLPRegressor(random_state=42, max_iter=200)
nn_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_nn = nn_model.predict(X_test_scaled)

# Evaluate the model
nn_mae = mean_absolute_error(y_test, y_pred_nn)
nn_mse = mean_squared_error(y_test, y_pred_nn)
nn_r2 = r2_score(y_test, y_pred_nn)

print(f'Neural Network - MAE: {nn_mae}, MSE: {nn_mse}, R²: {nn_r2}')

Neural Network - MAE: 76.52659275443665, MSE: 7720.153421776161, R²: -3.609272814236429




## XG Boost Regressor

In [10]:
import xgboost as xgb

# Train the model
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=200)
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Evaluate the model
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)

print(f'XGBoost - MAE: {xgb_mae}, MSE: {xgb_mse}, R²: {xgb_r2}')

XGBoost - MAE: 9.913790434120974, MSE: 995.336952276266, R²: 0.40573984680262576


## Lightgbm Regressor

In [12]:
import lightgbm as lgb

# Train the model
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_lgb = lgb_model.predict(X_test_scaled)

# Evaluate the model
lgb_mae = mean_absolute_error(y_test, y_pred_lgb)
lgb_mse = mean_squared_error(y_test, y_pred_lgb)
lgb_r2 = r2_score(y_test, y_pred_lgb)

print(f'LightGBM - MAE: {lgb_mae}, MSE: {lgb_mse}, R²: {lgb_r2}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 1300, number of used features: 3
[LightGBM] [Info] Start training from score 341.548041
LightGBM - MAE: 10.082362792070422, MSE: 1082.3796628939037, R²: 0.3537715013814511


## Model Results

In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb

# Assuming you have preprocessed your data as shown earlier
# X_train_scaled, X_test_scaled, y_train, y_test

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'Neural Network': MLPRegressor(random_state=42, max_iter=200),
    'XGBoost': xgb.XGBRegressor(random_state=42, n_estimators=200),
    'LightGBM': lgb.LGBMRegressor(random_state=42)
}

# Dictionary to store the results
results = {}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'R²': r2
    }

# Convert results to a DataFrame
results_df = pd.DataFrame(results).T

# Sort the results by R² score
results_df = results_df.sort_values(by='R²', ascending=False)

# Save the results to a CSV file for sharing
results_df.to_csv('model_results.csv')

# Print the results
print(results_df)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 1300, number of used features: 3
[LightGBM] [Info] Start training from score 341.548041
                                MAE          MSE        R²
XGBoost                    9.913790   995.336952  0.405740
Random Forest             10.016006  1043.048318  0.377254
LightGBM                  10.082363  1082.379663  0.353772
Support Vector Regressor   9.130894  1090.042210  0.349197
Linear Regression         10.291482  1096.150488  0.345550
Gradient Boosting          9.691241  1102.484768  0.341768
Neural Network            76.526593  7720.153422 -3.609273


## Polynominal Regression 

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [14]:
# Define the degree of the polynomial
degree = 1  # You can experiment with different degrees

# Generate polynomial features
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train_scaled)
X_poly_test = poly.transform(X_test_scaled)

# Fit a Linear Regression model
poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)

# Make predictions
y_pred_poly = poly_model.predict(X_poly_test)

# Evaluate the model
poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)

print(f'Polynomial Regression (degree {degree}) - MAE: {poly_mae}, MSE: {poly_mse}, R²: {poly_r2}')




Polynomial Regression (degree 1) - MAE: 10.291481541374573, MSE: 1096.1504881836636, R²: 0.34554971002965607
Polynomial Regression (degree 2) - MAE: 9.518190104006903, MSE: 1084.9482317941583, R²: 0.352237952220363
Polynomial Regression (degree 3) - MAE: 9.603115983512268, MSE: 1101.6987526049156, R²: 0.3422371509435974
Polynomial Regression (degree 4) - MAE: 10.855994647239264, MSE: 1127.3631720627259, R²: 0.3269143581910112


In [15]:
# Polynomial Regression with degree 2
degree = 2
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train_scaled)
X_poly_test = poly.transform(X_test_scaled)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)
y_pred_poly = poly_model.predict(X_poly_test)

poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)

print(f'Polynomial Regression (degree {degree}) - MAE: {poly_mae}, MSE: {poly_mse}, R²: {poly_r2}')

Polynomial Regression (degree 2) - MAE: 9.518190104006903, MSE: 1084.9482317941583, R²: 0.352237952220363


In [None]:
# Polynomial Regression with degree 3
degree = 3
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train_scaled)
X_poly_test = poly.transform(X_test_scaled)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)
y_pred_poly = poly_model.predict(X_poly_test)

poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)

print(f'Polynomial Regression (degree {degree}) - MAE: {poly_mae}, MSE: {poly_mse}, R²: {poly_r2}')

In [None]:
# Polynomial Regression with degree 4
degree = 4
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train_scaled)
X_poly_test = poly.transform(X_test_scaled)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)
y_pred_poly = poly_model.predict(X_poly_test)

poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)

print(f'Polynomial Regression (degree {degree}) - MAE: {poly_mae}, MSE: {poly_mse}, R²: {poly_r2}')

In [39]:
import numpy as np

# Create the input array
input_data = np.array([[27.125, 2, 200]])

input_features_normalized = scaler.transform(input_data)
# print(input_features_normalized)

# Transform the input data using polynomial features
input_data_poly = poly.transform(input_features_normalized)
# print(input_data_poly)

# Predict using the polynomial regression model
prediction = poly_model.predict(input_data_poly)
print(f'TDS Value: {prediction[0]:.2f} (mg/L)')

TDS Value: 350.29 (mg/L)




In [40]:
from sklearn.preprocessing import PolynomialFeatures

# Define the polynomial features object
print("Degree:",degree)
poly_features = PolynomialFeatures(degree=degree)

# Get the feature names
feature_names = poly.get_feature_names_out(input_features=['x1', 'x2', 'x3'])

# Rest of the code remains the same
intercept = poly_model.intercept_
coefficients = poly_model.coef_
equation = f"y = {intercept:.4f}"
for coef, feature in zip(coefficients, feature_names):
    equation += f" + ({coef:.4f}) * {feature}"
print("Polynomial Regression Equation:")
print(equation)

Degree: 2
Polynomial Regression Equation:
y = 832900019706.9500 + (0.0000) * 1 + (1.4046) * x1 + (-48713432252.4062) * x2 + (20.7860) * x3 + (-2.9069) * x1^2 + (4.7718) * x1 x2 + (-3.6989) * x1 x3 + (-832900019366.2006) * x2^2 + (0.6818) * x2 x3 + (-0.9937) * x3^2


### Internal Testing(Optinoal)

In [30]:
input_data = np.array([[27.125, 2, 200]])

print(input_features_normalized[0])
x1=input_features_normalized[0][0]
x2=input_features_normalized[0][1]
x3=input_features_normalized[0][2]
print(x1,x2,x3)
result=832900019706.9500 + (0.0000) * 1 + (1.4046) * x1 + (-48713432252.4062) * x2 + (20.7860) * x3 + (-2.9069) * (x1*x1) + (4.7718) * (x1*x2) + (-3.6989) * (x1*x3) + (-832900019366.2006) * (x2*x2) + (0.6818) * (x2*x3) + (-0.9937) * (x3*x3)
print(result)

[ 0.74913464  0.97118423 -0.28561575]
0.7491346376514885 0.9711842277268257 -0.2856157453678386
350.2948796148556


## FINAL MODEL

### Export Model

In [56]:
import joblib

# Assuming your XGBoost model is stored in the variable 'xgboost_model'
model =  xgb.XGBRegressor(random_state=42, n_estimators=200)

# Fit the model
model.fit(X_train_scaled, y_train)

# Define the file path for saving the model
file_path = 'Output/soil_XGBoost.pkl'

# Save the model to the file
joblib.dump(model, file_path)

['Output/soil_XGBoost.pkl']

In [58]:
# Load the saved model
model = joblib.load('/Users/likhithkanigolla/IIITH/code-files/Digital-Twin/ZF/backend/ml-models/Output/soil_XGBoost.pkl')

# Prepare the input data for prediction
input_data = [27.125,2,200]
input_features = np.array(input_data).reshape(1, -1)

input_features_normalized = scaler.transform(input_features)
print(input_features_normalized)
# Make predictions
predictions = model.predict(input_features_normalized)

# Print the predictions
print(predictions)

[[ 0.74913464  0.97118423 -0.28561575]]
[343.0992]


