In [32]:
#Basic import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Modelling
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso,LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from  sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostRegressor
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import xgboost as xgb

from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

## 1) Import the model training data as Pandas DataFrame

In [33]:
df = pd.read_csv('Data\\preprocessing.csv')
df.head()


Unnamed: 0,male,age,education,currentsmoker,cigsperday,bpmeds,prevalentstroke,prevalenthyp,diabetes,totchol,bmi,heartrate,glucose,tenyearchd
0,1,39,4.0,0,0.0,0,0,0,0,195.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0,0,0,0,250.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0,0,0,0,245.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0,0,1,0,225.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0,0,0,0,285.0,23.1,85.0,85.0,0


## 2)  Show top 5 records

In [34]:
df.head()

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bpmeds,prevalentstroke,prevalenthyp,diabetes,totchol,bmi,heartrate,glucose,tenyearchd
0,1,39,4.0,0,0.0,0,0,0,0,195.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0,0,0,0,250.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0,0,0,0,245.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0,0,1,0,225.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0,0,0,0,285.0,23.1,85.0,85.0,0


## 3 ) Model Training

### 3.1 ) Data Splitting

In [35]:
#splitting the dataset
from sklearn.model_selection import train_test_split
df_copy=df.copy()
df_full_train,df_test=train_test_split(df_copy,test_size=0.20,random_state=1)
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=1)

In [36]:
len(df),len(df_train),len(df_test),len(df_val)

(4240, 2544, 848, 848)

In [37]:
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
y_train=df_train.tenyearchd.values
y_test=df_test.tenyearchd.values
y_val=df_val.tenyearchd.values
del df_train['tenyearchd']
del df_test['tenyearchd']
del df_val['tenyearchd']

In [None]:
df.head()

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bpmeds,prevalentstroke,prevalenthyp,diabetes,totchol,bmi,heartrate,glucose,tenyearchd
0,1,39,4.0,0,0.0,0,0,0,0,195.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0,0,0,0,250.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0,0,0,0,245.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0,0,1,0,225.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0,0,0,0,285.0,23.1,85.0,85.0,0


In [28]:
numerical_columns=['age','education','cigsperday','totchol','bmi','heartrate','glucose']
categorical_columns=['male','currentsmoker','bpmeds','prevalentstroke','prevalenthyp','diabetes']

### 3.2) Training Logistic Regression

In [29]:
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
dv=DictVectorizer(sparse=False)
X_train=dv.fit_transform(train_dicts)
X_val=dv.transform(val_dicts)

In [30]:
regression=LogisticRegression(solver='liblinear',class_weight='balanced',random_state=1, C=1.0, max_iter=1000)
regression.fit(X_train,y_train)

y_pred=regression.predict(X_val)
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
accuracy_score_model=accuracy_score(y_val,y_pred)
print(f'acc:{accuracy_score_model}')
auc=roc_auc_score(y_val,y_pred)
print(f'auc:{auc}')

acc:0.6651917404129793
auc:0.6750949767834529


#### 3.2.1 ) Tuning the Logistic Regression(K-Fold-Validation)

In [31]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

best_C = None
best_auc = 0

# Hyperparameter tuning
for C in [0.001, 0.01, 0.1, 1, 5, 10]:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    scores = []
    
    for train_idx, val_idx in kfold.split(df_full_train, df_full_train['tenyearchd']):
        df_train = df_full_train.iloc[train_idx].copy()
        df_val = df_full_train.iloc[val_idx].copy()

        y_train = df_train.pop('tenyearchd')
        y_val = df_val.pop('tenyearchd')

        if len(y_val.unique()) < 2:
            print(f"Skipping fold for C={C} due to single-class validation data.")
            continue

        model = LogisticRegression(C=C, solver='liblinear', random_state=1, class_weight='balanced')
        model.fit(df_train, y_train)
        y_pred = model.predict_proba(df_val)[:, 1]
        
        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)

    if scores:  # Ensure there are valid scores
        mean_auc = np.mean(scores)
        print(f"C={C} AUC: {mean_auc:.3f} ± {np.std(scores):.3f}")
        
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_C = C
    else:
        print(f"No valid scores for C={C}.")

print(f"Best C: {best_C}, Best AUC: {best_auc:.3f}")


C=0.001 AUC: 0.667 ± 0.016
C=0.01 AUC: 0.696 ± 0.019
C=0.1 AUC: 0.709 ± 0.024
C=1 AUC: 0.717 ± 0.025
C=5 AUC: 0.717 ± 0.026
C=10 AUC: 0.717 ± 0.026
Best C: 10, Best AUC: 0.717


### 3.3 ) Decision Tree


In [13]:

numerical_columns=['age','education','cigsperday','totchol','bmi','heartrate','glucose']
categorical_columns=['male','currentsmoker','bpmeds','prevalentstroke','prevalenthyp','diabetes']
dv=DictVectorizer(sparse=True)

train_dict=df_train[numerical_columns+categorical_columns].to_dict(orient='records')
X_train=dv.fit_transform(train_dict)

val_dict=df_val[numerical_columns+categorical_columns].to_dict(orient='records')
X_val=dv.transform(val_dict)

dt=DecisionTreeClassifier(random_state=1)
dt.fit(X_train,y_train)
y_pred_prob=dt.predict_proba(X_val)[:,1]
y_pred = (y_pred_prob >= 0.5).astype(int)
acc = round(accuracy_score(y_val, y_pred), 5)
print(f'acc:{acc}')
auc= round(roc_auc_score(y_val, y_pred), 5)
print(f'auc:{auc}')



acc:0.77434
auc:0.59379


#### 3.3.1) Tuning Decision Tree

In [14]:
mean_score=[]
for depth in [8,10,15,20,None]:
    for s in [1,2,5,10,15]:
        kfold=KFold(n_splits=5,random_state=1,shuffle=True)
        score=[]
        for train_idx,val_idx in kfold.split(df_full_train):
            df_train=df_full_train.iloc[train_idx]
            df_val=df_full_train.iloc[val_idx]
            y_train=df_train.tenyearchd.values
            y_val=df_val.tenyearchd.values
            
            del df_train['tenyearchd']
            del df_val['tenyearchd']
            
            dv=DictVectorizer(sparse=False)
            X_train=dv.fit_transform(df_train.to_dict(orient='records'))
            X_val=dv.transform(df_val.to_dict(orient='records'))
            
            dt=DecisionTreeClassifier(max_depth=depth,min_samples_leaf=s,random_state=1)
            dt.fit(X_train,y_train)
            y_pred=dt.predict_proba(df_val)[:,1]
            auc=roc_auc_score(y_val,y_pred)
            score.append(auc)
            mean_score.append((depth,s ,np.mean(score)))
        


In [15]:
columns=['max_depth','min_samples_leaf','auc']
df_scores=pd.DataFrame(mean_score,columns=columns)

In [16]:
df_scores=df_scores.sort_values(by=['auc'],ascending=False)
df_scores

Unnamed: 0,max_depth,min_samples_leaf,auc
1,8.0,1,0.500442
6,8.0,2,0.500442
26,10.0,1,0.500442
2,8.0,1,0.500294
7,8.0,2,0.500294
...,...,...,...
107,,2,0.499706
106,,2,0.499558
56,15.0,2,0.499558
31,10.0,2,0.499558


### 3.4 ) Random forest

In [17]:
from  sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
numerical_columns=['age','education','cigsperday','totchol','bmi','heartrate','glucose']
categorical_columns=['male','currentsmoker','bpmeds','prevalentstroke','prevalenthyp','diabetes']
dv=DictVectorizer(sparse=False)

train_dict=df_train[numerical_columns+categorical_columns].to_dict(orient='records')
X_train=dv.fit_transform(train_dict)
val_dict=df_val[numerical_columns+categorical_columns].to_dict(orient='records')
X_val=dv.transform(val_dict)
rf=RandomForestClassifier(n_estimators=5,random_state=1,n_jobs=-1)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_val)
acc=accuracy_score(y_val,y_pred)
print(f'acc:{acc}')
auc=roc_auc_score(y_val,y_pred)
print(f'auc:{auc}')

acc:0.8112094395280236
auc:0.5133586148909518


#### 3.4.1) Random Forest-Tuning


In [18]:
mean_score=[]
for depth in [8,10,15,20,None]:
     for s in [1,2,5,10,15]:
         for n in [5,10,15,20,30,60,100,150,200]:
             kfold=KFold(n_splits=5,shuffle=True,random_state=1)
             scores=[]
             for train_idx,val_idx in kfold.split(df_full_train):
                 df_train=df_full_train.iloc[train_idx]
                 df_val=df_full_train.iloc[val_idx]
                 
                 df_val=df_full_train.iloc[val_idx]
                 y_train=df_train.tenyearchd.values
                 y_val=df_val.tenyearchd.values
            
                 del df_train['tenyearchd']
                 del df_val['tenyearchd']
                 dv=DictVectorizer(sparse=False)
                 X_train=dv.fit_transform(df_train.to_dict(orient='records'))
                 X_val=dv.transform(df_val.to_dict(orient='records'))
                 rf=RandomForestClassifier(n_estimators=n,max_depth=depth,min_samples_leaf=s,random_state=1)
                 rf.fit(X_train,y_train)
                 
                 y_pred=rf.predict(X_val)
                 auc=roc_auc_score(y_val,y_pred)
                 print(f'auc:{auc}')
                 mean_score.append((depth,s,n,np.mean(score)))
                #  acc=accuracy_score(y_val,y_pred)
                #  print(f'acc:{acc}')

auc:0.5178414913700108
auc:0.5194893523875043
auc:0.5004873294346979
auc:0.5221476125980046
auc:0.524503663537931
auc:0.5150771979503775
auc:0.502665811939085
auc:0.5011208576998051
auc:0.5134583468457298
auc:0.5180688779449869
auc:0.5159452535059331
auc:0.5035492041652335
auc:0.49736842105263157
auc:0.5199252208084546
auc:0.506434785592944
auc:0.5185494201725998
auc:0.5088573751524438
auc:0.4956140350877193
auc:0.5160050361496348
auc:0.5023508416699556
auc:0.5273901024811218
auc:0.5123909440570374
auc:0.5066276803118909
auc:0.5056179775280899
auc:0.508167887845977
auc:0.5273901024811218
auc:0.5088495575221239
auc:0.5028752436647174
auc:0.5
auc:0.5099009900990099
auc:0.5097087378640777
auc:0.5088495575221239
auc:0.5037524366471735
auc:0.5
auc:0.504950495049505
auc:0.5097087378640777
auc:0.5132743362831859
auc:0.5046296296296297
auc:0.5
auc:0.504950495049505
auc:0.5097087378640777
auc:0.5132743362831859
auc:0.5037524366471735
auc:0.5
auc:0.504950495049505
auc:0.500514091154261
auc:0.516

### 3.5 ) Xgboost

In [20]:
import xgboost as xgb
numerical_columns=['age','education','cigsperday','totchol','bmi','heartrate','glucose']
categorical_columns=['male','currentsmoker','bpmeds','prevalentstroke','prevalenthyp','diabetes']
kfold=KFold(n_splits=5,shuffle=True,random_state=1)
for train_idx,val_idx in kfold.split(df_full_train):
    df_train=df_full_train.iloc[train_idx]
    df_val=df_full_train.iloc[val_idx]
    y_train=df_train.tenyearchd.values
    y_val=df_val.tenyearchd.values
                
    del df_train['tenyearchd']
    del df_val['tenyearchd']
                
    dv=DictVectorizer(sparse=False)
    X_train=dv.fit_transform(df_train.to_dict(orient='records'))
    X_val=dv.transform(df_val.to_dict(orient='records'))
    
    dtrain=xgb.DMatrix(X_train,label=y_train)
    dval=xgb.DMatrix(X_val,label=y_val)
    xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': -1,
    'seed': 1,
    }
    
    xgb_model=xgb.train(xgb_params,dtrain)
    y_pred=xgb_model.predict(dval)
    y_pred_binary = (y_pred >= 0.5).astype(int)
    auc=roc_auc_score(y_val,y_pred_binary)
    acc=accuracy_score(y_val,y_pred_binary)
    print(f'auc:{auc}')
    print(f'acc:{acc}')

auc:0.5310224514563107
acc:0.8468335787923417
auc:0.5256730979705432
acc:0.8350515463917526
auc:0.5207602339181286
acc:0.8377581120943953
auc:0.5204498197287346
acc:0.8628318584070797
auc:0.5242548518283371
acc:0.8436578171091446


#### 3.5.1) Xgboost Tuning

In [21]:
from sklearn.model_selection import ParameterGrid
param_grid={
    'eta':[0.01,0.1,0.2,0.3],
    'max_depth':[3,5,7],
    'min_child_weight':[1,3,5],
    'subsample':[0.8,0.9,1.0],
    'colsample_bytree':[0.3,0.6,0.9],
}
numerical_columns=['age','education','cigsperday','totchol','bmi','heartrate','glucose']
categorical_columns=['male','currentsmoker','bpmeds','prevalentstroke','prevalenthyp','diabetes']

kfold=KFold(n_splits=5,shuffle=True,random_state=1)
best_auc = 0
best_params = None
for param in ParameterGrid(param_grid):
    fold_aucs = []
    for train_idx,val_idx in kfold.split(df_full_train):
        df_train=df_full_train.iloc[train_idx]
        df_val=df_full_train.iloc[val_idx]
        
        y_train=df_train.tenyearchd.values
    
        y_val=df_val.tenyearchd.values
        
        del df_train['tenyearchd']
    
        del df_val['tenyearchd']
        
        dv=DictVectorizer(sparse=False)
        X_train=dv.fit_transform(df_train.to_dict(orient='records'))
        X_val=dv.transform(df_val.to_dict(orient='records'))
        
        dtrain=xgb.DMatrix(X_train,label=y_train)
        dval=xgb.DMatrix(X_val,label=y_val)
        
        model=xgb.train(param,dtrain,num_boost_round=100)
    
        
        y_pred=model.predict(dval)
        auc=roc_auc_score(y_val,y_pred)
        fold_aucs.append(auc)

In [22]:
avg_auc=sum(fold_aucs)/len(fold_aucs)
print(f"Params: {param}, AUC: {avg_auc}")

Params: {'colsample_bytree': 0.9, 'eta': 0.3, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 1.0}, AUC: 0.6296921351155738


In [23]:
if avg_auc>best_auc:
    best_auc=avg_auc
    best_params=param
print("Best Parameters:", best_params)
print("Best AUC:", best_auc)    

Best Parameters: {'colsample_bytree': 0.9, 'eta': 0.3, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 1.0}
Best AUC: 0.6296921351155738
