In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
warnings.filterwarnings('ignore')

In [2]:
# Remote CSV with the concrete mixture measurements used throughout the notebook.
DATASET_URL = "https://raw.githubusercontent.com/rashakil-ds/Public-Datasets/main/concrete.csv"
data = pd.read_csv(DATASET_URL)

In [3]:
# Preview the first five raw rows (five is the default for head()).
data.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
# Data Preprocessing

In [5]:
## Null Values

In [6]:
# Work on an independent deep copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)

In [7]:
# Per-column count of missing values.
df.isnull().sum()

Cement (component 1)(kg in a m^3 mixture)                0
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    0
Fly Ash (component 3)(kg in a m^3 mixture)               0
Water  (component 4)(kg in a m^3 mixture)                0
Superplasticizer (component 5)(kg in a m^3 mixture)      0
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     0
Fine Aggregate (component 7)(kg in a m^3 mixture)        0
Age (day)                                                0
strength                                                 0
dtype: int64

In [8]:
## Duplicate Values

In [9]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

25

In [10]:
# Keep the first occurrence of each row; rebinding avoids in-place mutation.
df = df.drop_duplicates()

In [11]:
# Sanity check: no duplicated rows should remain after the drop above.
df.duplicated().sum()

0

In [12]:
## Scaling

In [13]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Min-max scaler with default settings; it is fitted in the scaling cell below.
scaler = MinMaxScaler()

In [15]:
# Inspect the column dtypes before scaling.
df.dtypes

Cement (component 1)(kg in a m^3 mixture)                float64
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    float64
Fly Ash (component 3)(kg in a m^3 mixture)               float64
Water  (component 4)(kg in a m^3 mixture)                float64
Superplasticizer (component 5)(kg in a m^3 mixture)      float64
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     float64
Fine Aggregate (component 7)(kg in a m^3 mixture)        float64
Age (day)                                                  int64
strength                                                 float64
dtype: object

In [16]:
# Scale every predictor with the min-max scaler; the target column "strength"
# keeps its original values.
feature_cols = [col_name for col_name in df.columns if col_name != "strength"]

# Fit once on all predictors together. MinMaxScaler computes its range per column,
# so the scaled values match a column-by-column fit. The difference is that `scaler`
# now remembers every feature's range (refitting inside a per-column loop leaves it
# fitted on the last column only), so it can be reused on new samples.
# NOTE(review): the scaler is fitted before the train/test split further down, so
# test-row ranges leak into preprocessing -- consider fitting on training rows only.
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [17]:
# Preview the first five rows after scaling (five is the default for head()).
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),strength
0,1.0,0.0,0.0,0.321086,0.07764,0.694767,0.20572,0.074176,79.99
1,1.0,0.0,0.0,0.321086,0.07764,0.738372,0.20572,0.074176,61.89
2,0.526256,0.396494,0.0,0.848243,0.0,0.380814,0.0,0.739011,40.27
3,0.526256,0.396494,0.0,0.848243,0.0,0.380814,0.0,1.0,41.05
4,0.220548,0.368392,0.0,0.560703,0.0,0.515698,0.580783,0.986264,44.3


In [18]:
# Data Splitting

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Feature matrix: every column except the target.
X = df.drop(columns="strength")

In [21]:
# Target vector: the "strength" column.
y = df.loc[:, "strength"]

In [22]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Model Training

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor

In [25]:
# Instantiate the three candidate regressors.
lr = LinearRegression()
# The forest draws bootstrap samples at random, so pin the seed (same value as the
# train/test split) to keep the reported metrics reproducible across re-runs.
rfr = RandomForestRegressor(random_state=42)
# NOTE(review): this rebinds `xgb`, shadowing the `import xgboost as xgb` module alias.
# The name is kept because later cells call `xgb.fit` / `xgb.predict` on this estimator.
xgb = XGBRegressor()

In [26]:
## Fit

In [27]:
# Train the linear regression model on the training split.
lr.fit(X_train,y_train)

In [28]:
# Train the random forest regressor on the training split.
rfr.fit(X_train,y_train)

In [29]:
# Train the XGBoost regressor on the training split (`xgb` is the estimator here, not the module).
xgb.fit(X_train,y_train)

In [30]:
## Predict

In [31]:
# Linear regression predictions on the rows it was trained on.
predicted_lr_train = lr.predict(X_train)

In [32]:
# Linear regression predictions on the held-out test rows.
predicted_lr_test = lr.predict(X_test)

In [33]:
# Random forest predictions on the rows it was trained on.
predicted_rfr_train = rfr.predict(X_train)

In [34]:
# Random forest predictions on the held-out test rows.
predicted_rfr_test = rfr.predict(X_test)

In [35]:
# XGBoost predictions on the rows it was trained on.
predicted_xgb_train = xgb.predict(X_train)

In [36]:
# XGBoost predictions on the held-out test rows.
predicted_xgb_test = xgb.predict(X_test)

In [37]:
## Evaluation

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [39]:
# Linear regression: error metrics on the held-out test set.
mae = mean_absolute_error(y_test, predicted_lr_test)
mse = mean_squared_error(y_test, predicted_lr_test)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, predicted_lr_test)

# Report all four metrics, one per line.
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    f'R-squared (R2) score: {r2}',
    sep='\n',
)

Mean Absolute Error (MAE): 8.984068581174695
Mean Squared Error (MSE): 126.20929107403164
Root Mean Squared Error (RMSE): 11.234290857639019
R-squared (R2) score: 0.560915111117331


In [40]:
# Random forest: error metrics on the held-out test set.
mae = mean_absolute_error(y_test, predicted_rfr_test)
mse = mean_squared_error(y_test, predicted_rfr_test)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, predicted_rfr_test)

# Report all four metrics, one per line.
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    f'R-squared (R2) score: {r2}',
    sep='\n',
)

Mean Absolute Error (MAE): 3.829094409492275
Mean Squared Error (MSE): 28.821520105856223
Root Mean Squared Error (RMSE): 5.368567789071515
R-squared (R2) score: 0.8997293000743798


In [41]:
# XGBoost (default hyperparameters): error metrics on the held-out test set.
mae = mean_absolute_error(y_test, predicted_xgb_test)
mse = mean_squared_error(y_test, predicted_xgb_test)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, predicted_xgb_test)

# Report all four metrics, one per line.
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    f'R-squared (R2) score: {r2}',
    sep='\n',
)

Mean Absolute Error (MAE): 3.1672845013725834
Mean Squared Error (MSE): 23.230438446360438
Root Mean Squared Error (RMSE): 4.819796515036754
R-squared (R2) score: 0.9191807956679459


In [42]:
# Hyperparameter Tuning

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
# Search space for the XGBoost grid search: 3 x 3 x 3 = 27 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)


In [45]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by R^2,
# run on the training split only.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [46]:
# Hyperparameter combination that scored best in the cross-validated search.
best_params = grid_search.best_params_

In [47]:
# Build a fresh XGBoost regressor configured with the winning hyperparameters.
# NOTE(review): GridSearchCV refits the best configuration by default (refit=True),
# so `grid_search.best_estimator_` should be an equivalent, already-trained model.
best_xgb_regressor = XGBRegressor(**best_params)

In [48]:
# Train the tuned regressor on the full training split.
best_xgb_regressor.fit(X_train,y_train)

In [49]:
# Tuned-model predictions on the held-out test rows.
predicted = best_xgb_regressor.predict(X_test)

In [50]:
# Tuned XGBoost: error metrics on the held-out test set.
mae = mean_absolute_error(y_test, predicted)
mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, predicted)

# Report all four metrics, one per line.
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    f'R-squared (R2) score: {r2}',
    sep='\n',
)

Mean Absolute Error (MAE): 3.0564892466968256
Mean Squared Error (MSE): 22.040202464981448
Root Mean Squared Error (RMSE): 4.694699400918172
R-squared (R2) score: 0.9233216527251445


In [51]:
# Conclusion