### Get the dataset from: https://www.kaggle.com/datasets/dileep070/heart-disease-prediction-using-logistic-regression?select=framingham.csv


### Meaning of each feature:

#### Demographic:
`male` (Categorical): whether or not the patient is male

`age` (Numerical): Age of the patient

#### Behavioral:
`education` (Categorical):    
    1: 0-11 years.
    2: High School Diploma, GED.
    3: Some College, Vocational School.
    4: College (BS, BA) degree or more.

`currentSmoker` (Categorical): whether or not the patient is a current smoker

`cigsPerDay`(Numerical): the number of cigarettes that the person smoked on average in one day

#### Medical (history):
`BPMeds` (Categorical): whether or not the patient was on blood pressure medication

`prevalentStroke` (Categorical): whether or not the patient had previously had a stroke

`prevalentHyp` (Categorical): whether or not the patient was hypertensive 

`diabetes` (Categorical): whether or not the patient had diabetes
    
#### Medical (current):
`totChol` (Numerical): total cholesterol level

`sysBP` (Numerical): systolic blood pressure

`diaBP` (Numerical): diastolic blood pressure

`BMI` (Numerical): Body Mass Index

`heartRate` (Numerical): heart rate

`glucose` (Numerical): glucose level

#### Target:

`TenYearCHD` (Categorical): ten year risk of coronary heart disease (1: Yes, 0: No)

In [83]:
import pandas as pd
df = pd.read_csv('framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [84]:
df.columns.values

array(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay',
       'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype=object)

In [85]:
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4238.0,4238.0,4133.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,1.97895,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,1.019791,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [86]:
len(df)

4238

In [87]:
df.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [88]:
df.nunique()

male                  2
age                  39
education             4
currentSmoker         2
cigsPerDay           33
BPMeds                2
prevalentStroke       2
prevalentHyp          2
diabetes              2
totChol             248
sysBP               234
diaBP               146
BMI                1363
heartRate            73
glucose             143
TenYearCHD            2
dtype: int64

In [89]:
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [90]:
df['education'] = df['education'].fillna(df['education'].mode()[0])

df.loc[df['currentSmoker']==1, 'cigsPerDay'] = df.loc[df['currentSmoker']==1, 'cigsPerDay'].fillna(df.loc[df['currentSmoker']==1, 'cigsPerDay'].mode()[0])
df.loc[df['currentSmoker']==0, 'cigsPerDay'] = df.loc[df['currentSmoker']==0, 'cigsPerDay'].fillna(0)


df['BPMeds'] = df['BPMeds'].fillna(0)
df['totChol'] = df['totChol'].fillna(df['totChol'].median())
df['BMI'] = df['BMI'].fillna(df['BMI'].mode()[0])
df['heartRate'] = df['heartRate'].fillna(df['heartRate'].median())

df.loc[df['diabetes']==1, 'glucose'] = df.loc[df['diabetes']==1, 'glucose'].fillna(df.loc[df['diabetes']==1, 'glucose'].mode()[0])
df.loc[df['diabetes']==0, 'glucose'] = df.loc[df['diabetes']==0, 'glucose'].fillna(0)

df.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

Mapping the categorical columns to meaningful category.

In [91]:
df['male'] = df['male'].map({1:'male', 0:'female'})
df['education'] = df['education'].map({1:'less_than_high_school', 2:'high_school', 3:'some_college_or_vocational_school', 4:'college_or_above'}) 
df['currentSmoker'] = df['currentSmoker'].map({1:'smoker', 0:'non_smoker'})
df['BPMeds'] = df['BPMeds'].map({1:'on_bp_meds', 0:'not_on_bp_meds'})
df['diabetes'] = df['diabetes'].map({1:'diabetic', 0:'non_diabetic'})

In [92]:
df.describe()

Unnamed: 0,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0
mean,49.584946,9.078339,0.005899,0.310524,236.689476,132.352407,82.893464,25.785814,75.878716,74.575743,0.151958
std,8.57216,11.91378,0.076587,0.462763,44.327427,22.038097,11.91085,4.0781,12.025185,32.8198,0.359023
min,32.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,0.0,0.0
25%,42.0,0.0,0.0,0.0,206.0,117.0,75.0,23.05,68.0,68.0,0.0
50%,49.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,77.0,0.0
75%,56.0,20.0,0.0,1.0,262.0,144.0,89.875,28.0375,83.0,85.0,0.0
max,70.0,70.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


Formatting the columns to become more reabable.

In [93]:
numericals_rename = {'age':'age',
                    'cigsPerDay':'cigarettes_per_day',
                    'totChol':'total_cholesterol',
                    'sysBP':'systolic_blood_pressure',
                    'diaBP':'diasolic_blood_pressure',
                    'BMI':'bmi',
                    'heartRate':'heart_rate',
                    'glucose':'glucose_level'
}

categoricals_rename = {'male':'gender',
               'education':'education_level',
               'currentSmoker':'smoker',
               'BPMeds':'blood_pressure_medication',
               'prevalentStroke':'had_a_stroke',
               'prevalentHyp':'hypertensive',
               'diabetes':'diabetes',
               'TenYearCHD':'10year_chd_risk'}

numericals = list(numericals_rename.values())
categoricals = list(categoricals_rename.values())
categoricals.remove('10year_chd_risk')

df = df.rename(columns=numericals_rename)
df = df.rename(columns=categoricals_rename)
df.head()


Unnamed: 0,gender,age,education_level,smoker,cigarettes_per_day,blood_pressure_medication,had_a_stroke,hypertensive,diabetes,total_cholesterol,systolic_blood_pressure,diasolic_blood_pressure,bmi,heart_rate,glucose_level,10year_chd_risk
0,male,39,college_or_above,non_smoker,0.0,not_on_bp_meds,0,0,non_diabetic,195.0,106.0,70.0,26.97,80.0,77.0,0
1,female,46,high_school,non_smoker,0.0,not_on_bp_meds,0,0,non_diabetic,250.0,121.0,81.0,28.73,95.0,76.0,0
2,male,48,less_than_high_school,smoker,20.0,not_on_bp_meds,0,0,non_diabetic,245.0,127.5,80.0,25.34,75.0,70.0,0
3,female,61,some_college_or_vocational_school,smoker,30.0,not_on_bp_meds,0,1,non_diabetic,225.0,150.0,95.0,28.58,65.0,103.0,1
4,female,46,some_college_or_vocational_school,smoker,23.0,not_on_bp_meds,0,0,non_diabetic,285.0,130.0,84.0,23.1,85.0,85.0,0


Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution. Use `train_test_split` function for that with `random_state=1`

In [94]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

y_train = df_train['10year_chd_risk'].values
y_val = df_val['10year_chd_risk'].values
y_test = df_test['10year_chd_risk'].values

del df_train['10year_chd_risk']
del df_val['10year_chd_risk']
del df_test['10year_chd_risk']

print(f'Data for total: {len(df)}, train: {len(df_train)}, val: {len(df_val)}, test: {len(df_test)}')

Data for total: 4238, train: 2542, val: 848, test: 848


In [95]:
train_dict = df_train[categoricals + numericals].to_dict(orient='records')
train_dict[0]

{'gender': 'female',
 'education_level': 'college_or_above',
 'smoker': 'smoker',
 'blood_pressure_medication': 'not_on_bp_meds',
 'had_a_stroke': 0,
 'hypertensive': 0,
 'diabetes': 'non_diabetic',
 'age': 41,
 'cigarettes_per_day': 20.0,
 'total_cholesterol': 243.0,
 'systolic_blood_pressure': 97.0,
 'diasolic_blood_pressure': 63.0,
 'bmi': 22.53,
 'heart_rate': 76.0,
 'glucose_level': 64.0}

In [96]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)
dv.get_feature_names_out()

array(['age', 'blood_pressure_medication=not_on_bp_meds',
       'blood_pressure_medication=on_bp_meds', 'bmi',
       'cigarettes_per_day', 'diabetes=diabetic', 'diabetes=non_diabetic',
       'diasolic_blood_pressure', 'education_level=college_or_above',
       'education_level=high_school',
       'education_level=less_than_high_school',
       'education_level=some_college_or_vocational_school',
       'gender=female', 'gender=male', 'glucose_level', 'had_a_stroke',
       'heart_rate', 'hypertensive', 'smoker=non_smoker', 'smoker=smoker',
       'systolic_blood_pressure', 'total_cholesterol'], dtype=object)

In [97]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [98]:
val_dict = df_val[categoricals + numericals].to_dict(orient='records')
X_val = dv.transform(val_dict)
X_val.shape

(848, 22)

In [99]:
y_pred = model.predict_proba(X_val)[:,1]
from sklearn.metrics import roc_auc_score
auc_val = roc_auc_score(y_val, y_pred)
round(auc_val, 3)

0.712