In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import sklearn.metrics as metrics
from statsmodels.tools.eval_measures import rmse

In [75]:
df =  pd.read_csv('Cleaned_Data.csv')
df = df.drop(columns=['DEATH_EVENT','AgeRange'])
train, test = train_test_split(df, test_size=0.2)
train = pd.get_dummies(train, columns=['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'], dtype=int, drop_first=True)
test = pd.get_dummies(test, columns=['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'], dtype=int, drop_first=True)
test.head(3)

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time,anaemia_True,diabetes_True,high_blood_pressure_True,sex_True,smoking_True
96,63,514,25,254000.0,1.3,134,83,1,1,1,1,0
112,50,369,25,252000.0,1.6,136,90,0,1,0,1,0
238,65,720,40,257000.0,1.0,136,210,1,1,0,0,0


In [123]:
X = train.drop(columns='time')
y_model = train['time']
model = LinearRegression()
model.fit(X, y_model)
model_p = sm.OLS.from_formula('time ~ ' + '+'.join(train.columns.difference(['time'])), train)
model_p = model_p.fit()
model_p.summary()

0,1,2,3
Dep. Variable:,time,R-squared:,0.124
Model:,OLS,Adj. R-squared:,0.082
Method:,Least Squares,F-statistic:,2.928
Date:,"Thu, 18 Jan 2024",Prob (F-statistic):,0.00123
Time:,19:55:34,Log-Likelihood:,-1363.6
No. Observations:,239,AIC:,2751.0
Df Residuals:,227,BIC:,2793.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-46.3002,154.919,-0.299,0.765,-351.564,258.963
age,-0.7982,0.439,-1.820,0.070,-1.662,0.066
anaemia_True,-26.0735,10.130,-2.574,0.011,-46.035,-6.112
creatinine_phosphokinase,-0.0002,0.006,-0.027,0.979,-0.012,0.012
diabetes_True,8.0682,10.129,0.797,0.427,-11.890,28.027
ejection_fraction,0.1942,0.420,0.462,0.644,-0.634,1.022
high_blood_pressure_True,-35.3537,10.427,-3.390,0.001,-55.901,-14.807
platelets,-3.744e-05,5.03e-05,-0.745,0.457,-0.000,6.16e-05
serum_creatinine,-3.3357,5.163,-0.646,0.519,-13.509,6.838

0,1,2,3
Omnibus:,70.217,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12.149
Skew:,0.011,Prob(JB):,0.0023
Kurtosis:,1.896,Cond. No.,9040000.0


In [124]:
validation_data = test.drop(columns='time')
y_pred = model.predict(validation_data)
y_true = test['time']


def regression_results(y_true, y_pred):

    # Regression metrics
    mse=metrics.mean_squared_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred)
    
    
    print('Mean Squared Error: ', round(mse,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))

regression_results(y_true, y_pred)

Mean Squared Error:  5741.9979
r2:  0.0133
MAE:  67.2662
