In [2]:
import pandas as pd

# Read cement_data.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("cement_data.csv")

In [2]:
# Show the column labels exactly as stored, since later selections must match them character for character.
df.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [None]:
# Scratch cell: bare string expressions that assign nothing. If run, only the
# last line (a 2-tuple of labels) would be echoed as the cell output.
# NOTE(review): the first two labels contain a doubled space; presumably they were
# pasted here as a reminder of irregular column names. Confirm, or delete this cell.
'Water  (component 4)(kg in a m^3 mixture)'
'Coarse Aggregate  (component 6)(kg in a m^3 mixture)'
'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)'


In [5]:
# Count missing entries per column; a non-zero count would mean imputation or row removal is needed.
df.isna().sum()

Cement (component 1)(kg in a m^3 mixture)                0
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    0
Fly Ash (component 3)(kg in a m^3 mixture)               0
Water  (component 4)(kg in a m^3 mixture)                0
Superplasticizer (component 5)(kg in a m^3 mixture)      0
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     0
Fine Aggregate (component 7)(kg in a m^3 mixture)        0
Age (day)                                                0
Concrete compressive strength(MPa, megapascals)          0
dtype: int64

In [12]:
# Look for the literal placeholder string "na", which df.isna() does not count
# as missing. A single vectorised comparison replaces the chained
# df[column][row] scalar lookups; hits are still reported column by column,
# in row order, with the same message text.
na_mask = df.eq("na")
for column, column_hits in na_mask.items():
    # Boolean-index the row labels so only matching rows are visited.
    for row in df.index[column_hits.to_numpy()]:
        print(f"na is present in {column} column in row no {row}")

In [5]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Cement (component 1)(kg in a m^3 mixture)              1030 non-null   float64
 1   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  1030 non-null   float64
 2   Fly Ash (component 3)(kg in a m^3 mixture)             1030 non-null   float64
 3   Water  (component 4)(kg in a m^3 mixture)              1030 non-null   float64
 4   Superplasticizer (component 5)(kg in a m^3 mixture)    1030 non-null   float64
 5   Coarse Aggregate  (component 6)(kg in a m^3 mixture)   1030 non-null   float64
 6   Fine Aggregate (component 7)(kg in a m^3 mixture)      1030 non-null   float64
 7   Age (day)                                              1030 non-null   int64  
 8   Concrete compressive strength(MPa, megapascals)  

In [None]:
# Import libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load data
df = pd.read_csv('cement_data.csv')

# One seed shared by the split and every stochastic model, so a rerun
# reproduces the same scores.
RANDOM_STATE = 42

# Split data into features and target variable.
# The CSV has no 'strength' column: the target header is
# 'Concrete compressive strength(MPa, megapascals) ' (note the trailing space).
# It is resolved by prefix so the exact whitespace does not have to be hard-coded.
TARGET_PREFIX = 'Concrete compressive strength'
target_candidates = [col for col in df.columns if str(col).strip().startswith(TARGET_PREFIX)]
if len(target_candidates) != 1:
    raise KeyError(
        f"Expected exactly one column starting with {TARGET_PREFIX!r}, found {target_candidates}"
    )
target_column = target_candidates[0]
X = df.drop(columns=[target_column])
y = df[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Define the pre-processing steps.
# GridSearchCV clones the pipeline before every fit, so sharing this single
# unfitted scaler instance across pipelines is safe.
preprocessor = StandardScaler()

# Define the models (stochastic estimators are seeded for reproducibility)
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Random Forest Regression', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('Gradient Boosting Regression', GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ('Neural Network Regression', MLPRegressor(random_state=RANDOM_STATE))
]

# Define the pipeline for each model; the final step is named after the model
pipelines = []
for name, model in models:
    pipelines.append((name, Pipeline([('preprocessor', preprocessor), (name, model)])))

# Define the parameter grid for each model, keyed by model name.
# Parameter names are written bare here and prefixed with the pipeline step
# name ("<step>__<param>") when the search is built.
param_grid = {
    'Linear Regression': {},
    'Ridge Regression': {'alpha': [0.01, 0.1, 1, 10]},
    'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
    'Random Forest Regression': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]},
    'Gradient Boosting Regression': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10], 'learning_rate': [0.01, 0.1, 1]},
    'Neural Network Regression': {'hidden_layer_sizes': [(50, 50), (100, 50, 25)], 'alpha': [0.0001, 0.001, 0.01]}
}

# GridSearchCV takes a single estimator, not a list of pipelines, so each
# pipeline gets its own search over its own grid.
searches = {}
for name, pipeline in pipelines:
    step_grid = {f'{name}__{param}': values for param, values in param_grid[name].items()}
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=step_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    # Fit the grid search object to the training data
    search.fit(X_train, y_train)
    searches[name] = search
    print(f"{name}: best CV MSE = {-search.best_score_:.3f}")

# The score is negated MSE, so the largest best_score_ is the best model.
best_name = max(searches, key=lambda model_name: searches[model_name].best_score_)
grid = searches[best_name]

# Print the best model and its hyperparameters
print("Best Model: ", grid.best_estimator_)
print("Best Parameters: ", grid.best_params_)

# Evaluate the best model on the testing data
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
