In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS

In [3]:
# Load the Statlog heart-disease table from the local capstone data folder.
statlog_csv_path = 'capstone datasets/Heart Disease Statlog/Heart_disease_statlog.csv'
df_statlog = pd.read_csv(statlog_csv_path)

In [4]:
# Quick structural check: column names, non-null counts and dtypes, so that
# missing values or unexpected types are caught before modelling.
df_statlog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       270 non-null    int64  
 1   sex       270 non-null    int64  
 2   cp        270 non-null    int64  
 3   trestbps  270 non-null    int64  
 4   chol      270 non-null    int64  
 5   fbs       270 non-null    int64  
 6   restecg   270 non-null    int64  
 7   thalach   270 non-null    int64  
 8   exang     270 non-null    int64  
 9   oldpeak   270 non-null    float64
 10  slope     270 non-null    int64  
 11  ca        270 non-null    int64  
 12  thal      270 non-null    int64  
 13  target    270 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 29.7 KB


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) to
# get a feel for the scale and range of each variable.
df_statlog.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,2.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,0.585185,0.67037,1.822222,0.444444
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,0.95914,0.497827
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,2.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,55.0,1.0,2.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,1.0,0.0,1.0,0.0
75%,61.0,1.0,3.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,1.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,3.0,3.0,1.0


In [10]:
# Step 1: Separate the response from the predictors.
# NOTE(review): describe() shows target ranging over 0..1, so this looks like
# a linear-probability fit on a binary outcome - confirm that is intended.
y = df_statlog['target']
X = df_statlog.drop(columns='target')

# Step 2: Prepend an intercept column to the predictor matrix.
X_const = sm.add_constant(X)

# Step 3: Fit ordinary least squares of target on all predictors.
model = sm.OLS(endog=y, exog=X_const)
results = model.fit()

# Step 4: View summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.523
Method:                 Least Squares   F-statistic:                     23.72
Date:                Sun, 01 Jun 2025   Prob (F-statistic):           6.48e-37
Time:                        11:01:17   Log-Likelihood:                -87.553
No. Observations:                 270   AIC:                             203.1
Df Residuals:                     256   BIC:                             253.5
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2241      0.309     -0.726      0.4

#### Adding polynomial terms to the above feature dataset

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Step 1: Create polynomial features (degree=2): the original predictors plus
# their squares and pairwise products. No bias column is generated here
# because the intercept is added separately in Step 2.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Step 2: Add constant term for intercept.
# The transformed matrix is wrapped in a DataFrame carrying the generated
# feature names (e.g. "age^2", "age sex"); a bare array would make the
# summary label every term x1, x2, ... and the coefficient table unreadable.
# Reusing X.index keeps the rows aligned with y.
X_poly_const = sm.add_constant(
    pd.DataFrame(
        X_poly,
        columns=poly.get_feature_names_out(X.columns),
        index=X.index,
    )
)

# Step 3: Fit the OLS model
model = sm.OLS(y, X_poly_const)
results = model.fit()

# Step 4: View summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.713
Model:                            OLS   Adj. R-squared:                  0.540
Method:                 Least Squares   F-statistic:                     4.128
Date:                Sun, 01 Jun 2025   Prob (F-statistic):           2.56e-16
Time:                        11:00:53   Log-Likelihood:                -25.868
No. Observations:                 270   AIC:                             255.7
Df Residuals:                     168   BIC:                             622.8
Df Model:                         101                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.7518      4.182      1.375      0.1

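#### Now let's raise the polynomial degree to 3 and see whether R-squared improves

Note: in-sample R-squared can never decrease when more terms are added to an OLS model, so adjusted R-squared and the residual degrees of freedom are the fairer things to compare.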

In [12]:
# Step 1: Create polynomial features (degree=3): all monomials of the
# original predictors up to third order, without a bias column (the intercept
# is added separately in Step 2).
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(X)

# Step 2: Add constant term for intercept.
# The transformed matrix is wrapped in a DataFrame carrying the generated
# feature names so the summary shows real term names instead of x1, x2, ...
# Reusing X.index keeps the rows aligned with y.
X_poly_const = sm.add_constant(
    pd.DataFrame(
        X_poly,
        columns=poly.get_feature_names_out(X.columns),
        index=X.index,
    )
)

# Report the size of the design matrix before fitting. When the columns
# outnumber the observations the fit has no residual degrees of freedom, so
# R-squared is trivially 1 and adjusted R-squared / the F-statistic come out
# as nan (with divide warnings) in the summary below.
n_obs, n_params = X_poly_const.shape
print(f"Design matrix: {n_params} columns for {n_obs} observations")

# Step 3: Fit the OLS model
model = sm.OLS(y, X_poly_const)
results = model.fit()

# Step 4: View summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 01 Jun 2025   Prob (F-statistic):                nan
Time:                        11:04:31   Log-Likelihood:                 5999.9
No. Observations:                 270   AIC:                        -1.146e+04
Df Residuals:                       0   BIC:                        -1.049e+04
Df Model:                         269                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0001        inf         -0        n

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


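#### This is a clear sign of overfitting: the degree-3 model uses up every degree of freedom in the 270 observations (Df Model = 269, Df Residuals = 0), so it reproduces the training data exactly. The resulting R-squared of 1.000 is not evidence of a better model, and adjusted R-squared and the F-statistic are undefined (nan).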