In [63]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [75]:
#!python -m pip freeze > requirements.txt

# Import Data

In [76]:
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
HEADERS = ['age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df = pd.read_csv(URL, names = HEADERS)
train_test_df, val_df = train_test_split(df, test_size=0.1, stratify=df.target, random_state=1234)

# Preprocess Data

In [77]:
def preprocessing(preproc_df):
    bin_cols = {'thal': 3.0, 'cp': 4.0, 'restecg':0.0}
    cat_cols = ['slope', 'ca']
    
    preproc_df = preproc_df.apply(lambda x: x.replace(['?'], np.nan), axis=1)
    preproc_df.dropna()
    
    for col in cat_cols:
        new_cols = pd.get_dummies(preproc_df[col], prefix=col)
        preproc_df = pd.concat([preproc_df, new_cols], axis=1)
        preproc_df = preproc_df.drop(col, axis=1)
        
    for col in bin_cols.keys():
        preproc_df[col] = preproc_df[col].apply(lambda x: 0 if float(x) == bin_cols[col] else 1)
    
    preproc_df.target = preproc_df.target.apply(lambda x: 0 if x == 0 else 1)
    return preproc_df

In [78]:
train_test_df = preprocessing(train_test_df)
val_df = preprocessing(val_df)

# Create Dataset

In [79]:
def create_set(set_df, minutes):         
    X = set_df[get_feat_list(minutes)]
    y = set_df.target

    return X, y

In [80]:
def get_feat_list(minutes):
    times_available = time_constraints.keys()
    feat_list = []
    
    for limit in times_available:
        if limit <= minutes: feat_list += time_constraints[limit]
    
    return feat_list

# Build Model

In [81]:
def build_model(X_train, X_test, y_train, y_test, X_val, y_val, minutes):
    model = LogisticRegression(C=0.5, solver='liblinear')
    model.fit(X_train, y_train)
    
    importance = model.coef_[0]
    import_by_featname = sorted(list(zip(X_train.columns, importance)), key=lambda x: x[1])
    
    print('----Test----')
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions, target_names=['absense', 'presence']))
    
    print('----Validation----')
    predictions = model.predict(X_val)
    print(classification_report(y_val, predictions, target_names=['absense', 'presence']))
    
    pickle.dump(model, open(f'models/model.{minutes}.sav', 'wb'))
    
    return import_by_featname

In [82]:
def save_feat_importance(importance, minutes):
    features = [x[0] for x in importance]
    importance = [x[1] for x in importance]

    plt.bar(features, importance)
    plt.rcParams["figure.figsize"] = (15,5)
    plt.title(f'Feature importance for diagnostic tests under {minutes} minutes')
    plt.savefig(f'images/feature_importance_<{minutes}.png', dpi=100)
    plt.clf()

# Driver

In [83]:
time_constraints = {
    60:['age', 'sex', 'cp', 'restbp', 'chol', 'restecg', 'thalach', 'thal'], 
    120:['ca_0.0', 'ca_1.0', 'ca_2.0', 'ca_3.0'], 
    600:['fbs'], 
    720:['exang', 'oldpeak', 'slope_1.0', 'slope_2.0', 'slope_3.0']
}

In [84]:
for time_limit in time_constraints.keys():
    print(f'-----Building model for diagnostic tools that take under {time_limit} minutes to perform----')
    
    X, y = create_set(set_df=train_test_df, minutes=time_limit)
    X_val, y_val = create_set(set_df=val_df, minutes=time_limit)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=train_test_df.target, random_state=1234)
    
    feature_weights = build_model(X_train, X_test, y_train, y_test, X_val, y_val, time_limit)
    save_feat_importance(feature_weights, time_limit)
    

-----Building model for diagnostic tools that take under 60 minutes to perform----
----Test----
              precision    recall  f1-score   support

     absense       0.83      0.83      0.83        30
    presence       0.80      0.80      0.80        25

    accuracy                           0.82        55
   macro avg       0.82      0.82      0.82        55
weighted avg       0.82      0.82      0.82        55

----Validation----
              precision    recall  f1-score   support

     absense       0.78      0.82      0.80        17
    presence       0.77      0.71      0.74        14

    accuracy                           0.77        31
   macro avg       0.77      0.77      0.77        31
weighted avg       0.77      0.77      0.77        31

-----Building model for diagnostic tools that take under 120 minutes to perform----
----Test----
              precision    recall  f1-score   support

     absense       0.86      0.83      0.85        30
    presence       0.81  

<Figure size 1080x360 with 0 Axes>