In [None]:
%pip install pandas numpy seaborn matplotlib plotly scikit-learn

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report, mean_squared_error


In [2]:
df = pd.read_excel("/Users/likhithkanigolla/IIITH/code-files/Digital-Twin/ZF/Soil_test/Soil Data.xlsx")
df = df.drop(columns=["entry_id","tdsValue_without_temp","Voltage"])
df.head()

Unnamed: 0,Temp,tdsValue,Quantity,Soil
0,29.9375,275.95312,1,0
1,29.875,278.30084,1,0
2,29.75,287.14365,1,0
3,29.6875,279.20163,1,0
4,29.1875,291.01865,1,0


In [3]:
df.columns
print("Columns:", df.columns)

df.describe()
print("Description: ",df.describe())

df.info()
print("Info", df.info())

df.isnull().sum()
print("Null Values:", df.isnull().sum())


Columns: Index(['Temp', 'tdsValue', 'Quantity', 'Soil '], dtype='object')
Description:                Temp     tdsValue     Quantity        Soil 
count  1626.000000  1626.000000  1626.000000  1626.000000
mean     26.278137   341.108962     1.511685   246.186962
std       1.111388    36.820004     0.500017   164.702733
min      24.562500     2.671130     1.000000     0.000000
25%      25.437500   328.106810     1.000000   100.000000
50%      25.937500   346.682250     2.000000   300.000000
75%      27.187500   361.401142     2.000000   400.000000
max      29.937500   433.114870     2.000000   500.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1626 entries, 0 to 1625
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Temp      1626 non-null   float64
 1   tdsValue  1626 non-null   float64
 2   Quantity  1626 non-null   int64  
 3   Soil      1626 non-null   int64  
dtypes: float64(2), int64(2)
memory usage: 50.9 KB

In [None]:
print("Heat Map: ")
plt.figure(figsize= (12,8))
sns.heatmap(df.corr(), annot=True)

In [None]:
sns.countplot(x="tdsValue", data=df)

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(12, 8))

for i, col in enumerate(df.columns):
    sns.boxplot(x=col, data=df, ax=axes[i // 3, i % 3])
    axes[i // 3, i % 3].set_title(col)

plt.tight_layout()
plt.show()


In [None]:
sns.pairplot(df)

In [4]:
x = df.drop("tdsValue", axis=1)
y = df["tdsValue"]

In [5]:
x.shape, y.shape

((1626, 3), (1626,))

In [None]:
# scaler = StandardScaler()
# x = scaler.fit_transform(x)
# x

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [8]:
X_train_scaled.shape, X_test_scaled.shape

((1300, 3), (326, 3))

In [9]:
plt.figure(figsize=(10, 10))

<Figure size 1000x1000 with 0 Axes>

<Figure size 1000x1000 with 0 Axes>

## **Logistic Regression**

In [None]:
#Logistic Regression
#Take Value of Sand to test this
x = df.drop("tdsValue", axis=1)
y = df["tdsValue"]
#object creation
model_log_r= LogisticRegression()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
model_log_r.fit(x_train, y_train)

In [None]:
#Making Prediction
pred_log_r = model_log_r.predict(x_test)

In [None]:
#accuracy score
accuracy_score_log = accuracy_score(y_test,pred_log_r)
accuracy_score_log

In [None]:
#Decision Tree

## **Decision Tree**


In [None]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier(max_depth = 8)

In [None]:
model_dt.fit(x_train, y_train)

In [None]:
pred_dt = model_dt.predict(x_test)

In [None]:
accuracy_score_dt = accuracy_score(y_test, pred_dt)
accuracy_score_dt

In [None]:
cm2 = confusion_matrix(y_test, pred_dt)
cm2
# sns.heatmap(cm2/np.sum(cm2))

## **Linear Regression**

In [None]:
model_lin_r = LinearRegression()
model_lin_r.fit(x_train, y_train)
y_pred_lr = model_lin_r.predict(x_test)
# print(pred_lin_r)

mse_lr = mean_squared_error(y_test,y_pred_lr)
intercept = model_lin_r.intercept_
coefficients = model_lin_r.coef_
print("Linear Regression Equation:")
# print(f"y = {intercept} + {coefficients[0]} * x1 + {coefficients[1]} * x2 + ...")
print(len(coefficients))
print(coefficients)
print(intercept)

In [None]:
# Input feature values
input_features = [25.5625, 2, 400]

# Reshape the input features to match the shape expected by the model
input_features = np.array(input_features).reshape(1, -1)

# Scale the input features using the scaler used during training
input_features = scaler.transform(input_features)

# Predict the values using the trained model
predicted_tds = model_lin_r.predict(input_features)

# Print the predicted values
print("Predicted TDS values:", predicted_tds)

## **Polynomial Regression**

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming x_train, x_test, y_train, y_test are already defined

degree = 2
poly_features = PolynomialFeatures(degree=degree)

x_poly_train = poly_features.fit_transform(X_train_scaled)
x_poly_test = poly_features.transform(X_test_scaled)

poly_regressor = LinearRegression()
poly_regressor.fit(x_poly_train, y_train)
y_pred_pr = poly_regressor.predict(x_poly_test)

mse_pr = mean_squared_error(y_test, y_pred_pr)
print("Mean Squared Error:", mse_pr)

intercept = poly_regressor.intercept_
coefficients = poly_regressor.coef_
print("Linear Regression Equation:")
# print(f"y = {intercept} + {coefficients[0]} * x1 + {coefficients[1]} * x2 + ...")
print(intercept)
print(coefficients)



Mean Squared Error: 1084.9482317941583
Linear Regression Equation:
832900019706.95
[ 0.00000000e+00  1.40455774e+00 -4.87134323e+10  2.07860492e+01
 -2.90692165e+00  4.77178550e+00 -3.69891308e+00 -8.32900019e+11
  6.81802084e-01 -9.93731546e-01]


In [12]:
# Assuming you have the feature_values as input
	# 350.90677	
feature_values = [27.125,2,200]

# Reshape the feature values to match the shape expected by the scaler
input_features = np.array(feature_values).reshape(1, -1)

# Normalize the input features using the StandardScaler
input_features_normalized = scaler.transform(input_features)
print(input_features_normalized)

test_values=poly_features.transform(input_features)
print("Test Values:", test_values)
# Use the normalized input features for prediction
predicted_tds = poly_regressor.predict(poly_features.transform(input_features_normalized))

# Print the predicted tds value
print("Predicted TDS:", predicted_tds)

[[ 0.74913464  0.97118423 -0.28561575]]
Test Values: [[1.00000000e+00 2.71250000e+01 2.00000000e+00 2.00000000e+02
  7.35765625e+02 5.42500000e+01 5.42500000e+03 4.00000000e+00
  4.00000000e+02 4.00000000e+04]]
Predicted TDS: [350.2947998]




## **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

#TESTED TILL 30
rf_regressor = RandomForestRegressor(random_state=42, max_depth = 16)
rf_regressor.fit(x_train, y_train)
y_pred_rf = rf_regressor.predict(x_test)

mse_rfr = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error:", mse_rfr)

plt.subplot(1, 3, 3)
plt.scatter(y_test, y_pred_rf, color='orange')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Random Forest Regression')
plt.grid(True)

plt.tight_layout()
plt.show()

## **Decision Tree Regression**

In [None]:
from sklearn.tree import DecisionTreeRegressor

#TESTED TILL 30
dt_regressor = DecisionTreeRegressor(random_state=99, max_depth = 16)
dt_regressor.fit(x_train, y_train)
y_pred_dt = dt_regressor.predict(x_test)
mse_dtr = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error:", mse_dtr)

plt.subplot(1, 3, 2)
plt.scatter(y_test, y_pred_dt, color='green')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Decision Tree Regression')
plt.grid(True)

## **KNeighborsRegressor**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
k = 1
#TESTED TILL 15
knn_regressor = KNeighborsRegressor(n_neighbors=1)
knn_regressor.fit(x_train, y_train)
y_pred_knn = knn_regressor.predict(x_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print("Mean Squared Error:", mse_knn)

plt.subplot(1, 3, 1)
plt.scatter(y_test, y_pred_knn, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('KNN Regression')
plt.grid(True)

## **TEST RESULTS**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from tabulate import tabulate

# Calculate MSE for each model
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mse_knn = mean_squared_error(y_test, y_pred_knn)
mse_pr = mean_squared_error(y_test, y_pred_pr)

# Calculate RMSE for each model
rmse_lr = np.sqrt(mse_lr)
rmse_rf = np.sqrt(mse_rf)
rmse_dt = np.sqrt(mse_dt)
rmse_knn = np.sqrt(mse_knn)
rmse_pr = np.sqrt(mse_pr)

# Calculate R2 score for each model
r2_lr = r2_score(y_test, y_pred_lr)
r2_rf = r2_score(y_test, y_pred_rf)
r2_dt = r2_score(y_test, y_pred_dt)
r2_knn = r2_score(y_test, y_pred_knn)
r2_pr = r2_score(y_test, y_pred_pr)

data = [
    ["Linear Regression", mse_lr, rmse_lr, r2_lr],
    ["Random Forest Regression", mse_rf, rmse_rf, r2_rf],
    ["Decision Tree Regression", mse_dt, rmse_dt, r2_dt],
    ["KNN Regression", mse_knn, rmse_knn, r2_knn],
    ["Polynomial Regression", mse_pr, rmse_pr, r2_pr]
]

# Define table headers
headers = ["Regression Model", "MSE", "RMSE", "R-squared"]

# Print the table using tabulate
print(tabulate(data, headers=headers, tablefmt="pretty"))

