In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

# Load the CSV file into a pandas DataFrame
df = pd.read_csv("concrete_data.csv")

# Split the data into features (X) and target variable (y)
X = df.drop(columns=['concrete_compressive_strength'])
y = df['concrete_compressive_strength']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

# Instantiate all models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Support Vector Machine": SVR(),
    "Random Forest": RandomForestRegressor(),
    "XGBoost": XGBRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "KNN": KNeighborsRegressor(),
    "ANN": MLPRegressor(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net Regression": ElasticNet()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, predictions)
    print(f"{name}: Mean Absolute Error = {mae}")


Linear Regression: Mean Absolute Error = 7.745559243921434
Decision Tree: Mean Absolute Error = 4.241626213592233
Support Vector Machine: Mean Absolute Error = 7.51494696620429
Random Forest: Mean Absolute Error = 3.7794801982200665
XGBoost: Mean Absolute Error = 2.996374957538346
AdaBoost: Mean Absolute Error = 6.415728567746745
Gradient Boosting: Mean Absolute Error = 4.138748889593647
KNN: Mean Absolute Error = 6.800514563106796
ANN: Mean Absolute Error = 9.02367941260824
Ridge Regression: Mean Absolute Error = 7.751966725393744
Lasso Regression: Mean Absolute Error = 8.716246800286958
Elastic Net Regression: Mean Absolute Error = 9.232295635942508




In [15]:
df.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [2]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [1000, 1500, 2000],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.3, 0.1, 0.2]
}

# Initialize the XGBoost regressor
xgb_reg = xgb.XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_xgb = grid_search.best_estimator_
predictions = best_xgb.predict(X_test_scaled)

# Calculate and print the mean absolute error
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error:", mae)
mse = mean_squared_error(y_test, predictions)
print("XGBoost - Mean Squared Error:", mse)

# Fit the best XGBoost estimator to the training data
best_xgb.fit(X_train_scaled, y_train)

# Calculate and print the R^2 score on the training and test data
train_score = best_xgb.score(X_train_scaled, y_train)
test_score = best_xgb.score(X_test_scaled, y_test)

print("XGBoost - Training R^2 Score:", train_score)
print("XGBoost - Test R^2 Score:", test_score)

# Evaluate the model using accuracy

print('MSE:', mean_squared_error(y_test, predictions))


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
Mean Absolute Error: 2.796709563153461
XGBoost - Mean Squared Error: 17.234885287026657
XGBoost - Training R^2 Score: 0.9936956586664092
XGBoost - Test R^2 Score: 0.93311435497694
MSE: 17.234885287026657


Best Parameters: {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 1000}
Mean Absolute Error: 2.756640979896471

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
Mean Absolute Error: 2.796709563153461
XGBoost - Mean Squared Error: 17.234885287026657
XGBoost - Training R^2 Score: 0.9936956586664092
XGBoost - Test R^2 Score: 0.93311435497694
Accuracy DT: 0.93311435497694
MSE: 17.234885287026657

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump

# Load the CSV file into a pandas DataFrame
df = pd.read_csv("concrete_data.csv")

# Split the data into features (X) and target variable (y)
X = df.drop(columns=['concrete_compressive_strength'])
y = df['concrete_compressive_strength']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [1000, 1500, 2000],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.3, 0.1, 0.2]
}

# Initialize the XGBoost regressor
xgb_reg = xgb.XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_xgb = grid_search.best_estimator_
predictions = best_xgb.predict(X_test_scaled)

# Calculate and print the mean absolute error
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error:", mae)

# Calculate and print the mean squared error
mse = mean_squared_error(y_test, predictions)
print("XGBoost - Mean Squared Error:", mse)

# Fit the best XGBoost estimator to the training data
best_xgb.fit(X_train_scaled, y_train)

# Calculate and print the R^2 score on the training and test data
train_score = best_xgb.score(X_train_scaled, y_train)
test_score = best_xgb.score(X_test_scaled, y_test)

print("XGBoost - Training R^2 Score:", train_score)
print("XGBoost - Test R^2 Score:", test_score)

# Save the trained model to a .pkl file
dump(best_xgb, 'model.pkl')

# Save the scaler
dump(scaler, 'scaler.pkl')


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
Mean Absolute Error: 2.796709563153461
XGBoost - Mean Squared Error: 17.234885287026657
XGBoost - Training R^2 Score: 0.9936956586664092
XGBoost - Test R^2 Score: 0.93311435497694


['scaler.pkl']

In [5]:
X_train

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
995,158.6,148.9,116.0,175.1,15.0,953.3,719.7,28
507,424.0,22.0,132.0,178.0,8.5,822.0,750.0,28
334,275.1,0.0,121.4,159.5,9.9,1053.6,777.5,3
848,252.0,97.0,76.0,194.0,8.0,835.0,821.0,28
294,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,3
...,...,...,...,...,...,...,...,...
87,286.3,200.9,0.0,144.7,11.2,1004.6,803.7,3
330,246.8,0.0,125.1,143.3,12.0,1086.8,800.9,14
466,190.3,0.0,125.2,166.6,9.9,1079.0,798.9,100
121,475.0,118.8,0.0,181.1,8.9,852.1,781.5,28


In [6]:
print(X_train.head())
print("Number of features in X_train:", X_train.shape[1])

     cement  blast_furnace_slag  fly_ash  water  superplasticizer  \
995   158.6               148.9    116.0  175.1              15.0   
507   424.0                22.0    132.0  178.0               8.5   
334   275.1                 0.0    121.4  159.5               9.9   
848   252.0                97.0     76.0  194.0               8.0   
294   168.9                42.2    124.3  158.3              10.8   

     coarse_aggregate  fine_aggregate   age  
995             953.3            719.7   28  
507             822.0            750.0   28  
334            1053.6            777.5    3  
848             835.0            821.0   28  
294            1080.8            796.2    3  
Number of features in X_train: 8


In [7]:
input_data = {
    'cement': [350, 400, 450, 500, 550, 300, 380, 420, 470, 510],
    'blast_furnace_slag': [0, 20, 40, 60, 80, 100, 120, 140, 160, 180],
    'fly_ash': [0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
    'water': [150, 160, 170, 180, 190, 200, 210, 220, 230, 240],
    'superplasticizer': [0, 5, 10, 15, 20, 25, 30, 35, 40, 45],
    'coarse_aggregate': [1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450],
    'fine_aggregate': [700, 720, 740, 760, 780, 800, 820, 840, 860, 880],
    'age': [28, 28, 28, 28, 28, 28, 28, 28, 28, 28]
}

# Convert input data into a DataFrame
input_df = pd.DataFrame(input_data)

# Scale the input data
input_df_scaled = scaler.transform(input_df)

# Make predictions using the trained XGBoost model
predictions = best_xgb.predict(input_df_scaled)

# Print predictions
for i, pred in enumerate(predictions):
    print(f"Prediction for input {i+1}: {pred}")

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- fine_aggregate
Feature names seen at fit time, yet now missing:
- fine_aggregate 


In [None]:
# Check the feature names used during training
print("Feature names used during training:", X_train.columns.tolist())

# Ensure that the input DataFrame for prediction has the same column names in the same order
input_df = pd.DataFrame(input_data, columns=X_train.columns)

# Scale the input data
input_df_scaled = scaler.transform(input_df)

# Make predictions using the trained XGBoost model
predictions = best_xgb.predict(input_df_scaled)

# Print predictions
for i, pred in enumerate(predictions):
    print(f"Prediction for input {i+1}: {pred}")


Feature names used during training: ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate ', 'age']
Prediction for input 1: 50.67439651489258
Prediction for input 2: 51.64189529418945
Prediction for input 3: 45.82851028442383
Prediction for input 4: 68.34516143798828
Prediction for input 5: 63.9072380065918
Prediction for input 6: 38.337215423583984
Prediction for input 7: 43.74569320678711
Prediction for input 8: 40.13999938964844
Prediction for input 9: 60.53358459472656
Prediction for input 10: 66.7478256225586


In [8]:
import time


# Record start time
start_time = time.time()

# Make predictions
prediction = best_xgb.predict(X_train_scaled)

# Record end time
end_time = time.time()

# Calculate latency
latency = end_time - start_time
print("Latency (Response Time):", latency, "seconds")


Latency (Response Time): 0.011525869369506836 seconds


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump
import mlflow

# Load the CSV file into a pandas DataFrame
df = pd.read_csv("concrete_data.csv")

# Split the data into features (X) and target variable (y)
X = df.drop(columns=['concrete_compressive_strength'])
y = df['concrete_compressive_strength']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [1000, 1500, 2000],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.3, 0.1, 0.2]
}

# Initialize the XGBoost regressor
xgb_reg = xgb.XGBRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator to make predictions on the test data
best_xgb = grid_search.best_estimator_
predictions = best_xgb.predict(X_test_scaled)

# Calculate and print the mean absolute error
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error:", mae)

# Calculate and print the mean squared error
mse = mean_squared_error(y_test, predictions)
print("XGBoost - Mean Squared Error:", mse)

# Fit the best XGBoost estimator to the training data
best_xgb.fit(X_train_scaled, y_train)

# Calculate and print the R^2 score on the training and test data
train_score = best_xgb.score(X_train_scaled, y_train)
test_score = best_xgb.score(X_test_scaled, y_test)

print("XGBoost - Training R^2 Score:", train_score)
print("XGBoost - Test R^2 Score:", test_score)

# Log metrics with MLflow
with mlflow.start_run():
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("Training_R2_Score", train_score)
    mlflow.log_metric("Test_R2_Score", test_score)

    # Save the trained model to a .pkl file
    dump(best_xgb, 'model.pkl')

    # Save the scaler
    dump(scaler, 'scaler.pkl')


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 2000}
Mean Absolute Error: 2.796709563153461
XGBoost - Mean Squared Error: 17.234885287026657
XGBoost - Training R^2 Score: 0.9936956586664092
XGBoost - Test R^2 Score: 0.93311435497694
