In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm


# Read the project records from a CSV addressed by a relative path.
df = pd.read_csv('Civil_Engineering_Regression_Dataset.csv')

# Preview the first rows and the header exactly as read from the file
# (i.e. before any whitespace is trimmed from the column names).
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nColumn Names in Dataset:", df.columns.tolist(), sep="\n")

# Trim leading/trailing whitespace from the header so the name lookups
# further down are not defeated by stray spaces.
df.columns = df.columns.str.strip()

# Target column and the predictor columns used by the regression models.
dependent_var = 'Construction_Cost'
independent_vars = [
    'Building_Height',
    'Material_Quality_Index',
    'Labor_Cost',
    'Concrete_Strength',
    'Foundation_Depth',
]


# Confirm that every column the analysis relies on is present in the frame.
missing_columns = [col for col in independent_vars + [dependent_var] if col not in df.columns]

if missing_columns:
    print(f"\nError: The following required columns are missing: {missing_columns}")
    print("Check column names and update accordingly.")
else:
    # Rule-of-thumb cut-off above which a VIF is reported as problematic.
    VIF_THRESHOLD = 10

    # Work on explicit copies so that imputation never writes back into df
    # and never raises SettingWithCopyWarning.
    X = df[independent_vars].copy()
    y = df[dependent_var].copy()

    if X.isnull().values.any() or y.isnull().any():
        print("\nHandling missing values...")
        # Reassign instead of fillna(inplace=True): in-place filling of an
        # object derived from df is version-dependent in pandas.
        # NOTE: the means are taken over the full data set, i.e. before the
        # train/test split below.
        X = X.fillna(X.mean())
        y = y.fillna(y.mean())

    # Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Simple linear regression: Building_Height as the only predictor.
    simple_model = LinearRegression()
    simple_model.fit(X_train[['Building_Height']], y_train)
    simple_r2 = r2_score(y_test, simple_model.predict(X_test[['Building_Height']]))

    # Multiple linear regression: all predictors.
    multiple_model = LinearRegression()
    multiple_model.fit(X_train, y_train)
    multiple_r2 = r2_score(y_test, multiple_model.predict(X_test))

    # Adjusted R² on the held-out rows. The formula needs n - p - 1 > 0;
    # report NaN instead of raising ZeroDivisionError when the test split is
    # too small for the number of predictors.
    n = X_test.shape[0]
    p = X_test.shape[1]
    residual_dof = n - p - 1
    if residual_dof > 0:
        adjusted_r2 = 1 - (1 - multiple_r2) * ((n - 1) / residual_dof)
    else:
        adjusted_r2 = np.nan

    print("\nComparison of R-squared values:")
    print(f"Simple Linear Regression R²: {simple_r2:.4f}")
    print(f"Multiple Linear Regression R²: {multiple_r2:.4f}")
    print(f"Adjusted R² for Multiple Regression: {adjusted_r2:.4f}")

    # variance_inflation_factor regresses one column on all the others and
    # does not add an intercept itself, so the design matrix must carry an
    # explicit constant. has_constant="add" guarantees the constant sits at
    # position 0, which is then skipped when reporting.
    design = sm.add_constant(X, has_constant="add").to_numpy(dtype=float)
    vif_data = pd.DataFrame({
        "Feature": X.columns,
        "VIF": [variance_inflation_factor(design, idx) for idx in range(1, design.shape[1])],
    })

    print("\nVariance Inflation Factor (VIF) for each independent variable:")
    print(vif_data)

    multiple_is_better = multiple_r2 > simple_r2
    high_multicollinearity = vif_data["VIF"].max() > VIF_THRESHOLD

    if multiple_is_better:
        print("\nMultiple Linear Regression performs better than Simple Linear Regression as R² is higher.")
    else:
        print("\nSimple Linear Regression performs better than Multiple Linear Regression.")

    # Adjusted R² is below R² whenever p >= 1 and R² < 1, so this message is
    # expected in practice; it is skipped when adjusted R² is NaN.
    if adjusted_r2 < multiple_r2:
        print("However, the Adjusted R² is lower, indicating that some independent variables may not significantly contribute.")

    if high_multicollinearity:
        print("\nWarning: High multicollinearity detected! Consider removing highly correlated variables.")

    # The conclusions are derived from the computed metrics rather than being
    # fixed text, so they cannot contradict the numbers printed above.
    print("\nConclusion on Model Performance:")
    if multiple_is_better:
        print("1. Multiple regression explains more variance, but the adjusted R² suggests that not all features contribute significantly.")
    else:
        print("1. Multiple regression does not explain more variance than simple regression on the test set; the additional features do not improve the fit.")
    if high_multicollinearity:
        print("2. Multicollinearity detected; consider removing or transforming variables with high VIF.")
    else:
        print(f"2. No severe multicollinearity detected (no VIF above {VIF_THRESHOLD}).")
    print("3. This model can be used for predicting construction costs, but further improvement could be made by considering additional features.")



First 5 rows of the dataset:
   Project_ID  Building_Height  Material_Quality_Index  Labor_Cost  \
0           1        21.854305                       9   70.213332   
1           2        47.782144                       9  142.413614   
2           3        37.939727                       3  110.539985   
3           4        31.939632                       6  250.784939   
4           5        12.020839                       7  167.575159   

   Concrete_Strength  Foundation_Depth  Weather_Index  Construction_Cost  
0          45.326394          8.804790              4        2400.287931  
1          47.900505          6.727632              6        3705.461312  
2          22.112484          8.208544              8        2653.631004  
3          26.267562          7.094515              4        2534.099466  
4          40.134306          6.160303              6        1741.179333  

Column Names in Dataset:
['Project_ID', 'Building_Height', 'Material_Quality_Index', 'Labor_Cost', 