**Imports and data preprocessing**

In [28]:
#Import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

myNames = ["age", # column 1
        "sex", # col 2
        "chestPainType", # col 3
        "restingBP", # col 4
        "serumCholesterol", # col 5
        "fastingBloodSugar", # col 6
        "restingEcg", # col 7
        "maxHeartRate", # col 8
        "exerciseInducedAngina", # col 9
        "stDepression", # col 10
        "stSlope", # col 11
        "majorVessels", # col 12
        "thalassemia", # col 13
        "diagnosis"] # col 14
assert len(myNames) == 14


data=pd.read_csv("data/processed.cleveland.data", sep=',', names=myNames, na_values=["?", '?'])
data = data.dropna() # Drop rows with missing data
data.iloc[:, -1] = data.iloc[:, -1].clip(0, 1) # make last column is 0's and 1's

# Convert all columns to float
for name in myNames:
        data[name] = pd.to_numeric(data[name],errors = 'coerce')
        
num_ones = data.iloc[:, -1].value_counts()[1]
print("len(y==1): ", num_ones)
num_zeros = data.iloc[:, -1].value_counts()[0]
print("len(y==0):",  num_zeros)

len(y==1):  137
len(y==0): 160


Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler

    
X_train, X_test, y_train, y_test = train_test_split(data.drop("diagnosis", axis=1), data["diagnosis"], test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)   

def train(argPenalty=None, argC=[0.01, 0.05, 0.1, 0.3, 0.5,  0.8, 1, 2, 5]):
    if not argPenalty:
        model = LogisticRegression(max_iter=10000)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print("Log Reg Unregularized")
        print("No penalty")
        print("Accuracy (% correct predicts)", round(accuracy, 2))
    else:
        bestAcc = 0
        bestC = None
        for myC in argC:
            model = LogisticRegression(max_iter=10000, penalty=argPenalty, C=myC)
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            accuracy = accuracy_score(y_test, y_pred)
            if accuracy > bestAcc:
                bestAcc = accuracy
                bestC = myC
        print("Log Reg Regularized")
        print("Penalty " + argPenalty)
        print("Best C", bestC)
        print("Accuracy (% correct predicts)", round(bestAcc,2))

    """
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    print("Balanced accuracy:", round(balanced_accuracy,2))
    print("Precision:", round(precision, 2))
    print("Recall:", round(recall, 2))
    print("F1-score:", round(f1, 2))
    """
    
train()
print()
train("l2")

Log Reg Unregularized
No penalty
Accuracy (% correct predicts) 0.87

Log Reg Regularized
Penalty l2
Best C 0.05
Accuracy (% correct predicts) 0.9


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report

# Encode the categorical variables
# It transforms each unique value of the categorical variable into a separate binary variable.
# Multiple columns of True/False values
data_encoded = pd.get_dummies(data)

# Split the data into training and testing sets
X = data_encoded.drop('diagnosis', axis=1)
y = data_encoded['diagnosis']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

# Fit a logistic regression model to the training set
# increase the maximum number of iterations and specify the solver
model = LogisticRegression(max_iter=10000, solver='lbfgs')
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluating the performance of the model
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

# Create decision tree classifier with default settings
dt = DecisionTreeClassifier(random_state=60)

# Fit the decision tree on the training data
dt.fit(X_train, y_train)

# Predict on the testing data
y_pred = dt.predict(X_test)

# Evaluate the model performance
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Convert categorical variables to dummies
data_encoded = pd.get_dummies(data, columns=["chestPainType", "restingEcg", "stSlope", "thalassemia"])

# Split the data into training and testing sets
X = data_encoded.drop('diagnosis', axis=1)
y = data_encoded['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=60)
rf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf.predict(X_test)
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report

# Encode the categorical variables
data_encoded = pd.get_dummies(data)

# Split the data into training and testing sets
X = data_encoded.drop('diagnosis', axis=1)
y = data_encoded['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

# Fit a GBM model to the training set
model = GradientBoostingClassifier(random_state=60, learning_rate=0.1, n_estimators=100)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluating the performance of the model
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#import tensorflow as tf
#from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

# Split the data into features and target
X = data.drop("diagnosis", axis=1)
y = data["diagnosis"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network
model = Sequential()
model.add(Dense(units=32, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(units=16, activation="relu"))
model.add(Dense(units=1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the neural network
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the neural network
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")