In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import time

In [2]:
df = pd.read_csv("heart.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [4]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(723)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,68,0,2,120,211,0,0,115,0,1.5,1,0,2,1
733,44,0,2,108,141,0,1,175,0,0.6,1,0,2,1
739,52,1,0,128,255,0,1,161,1,0.0,2,1,3,0
843,59,1,3,160,273,0,0,125,0,0.0,2,0,2,0


In [8]:
df['age'].unique()
df['sex'].unique()
df['cp'].unique()
df['trestbps'].unique()
df['chol'].unique()
df['fbs'].unique()
df['restecg'].unique()
df['thalach'].unique()
df['exang'].unique()    
df['oldpeak'].unique()
df['slope'].unique()
df['ca'].unique()
df['thal'].unique()
df['target'].unique()

array([0, 1])

## Raw Data Training

In [9]:
X_raw = df.drop('target', axis=1)
y_raw = df['target']

In [10]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)

In [11]:
start_time = time.time()
model_raw = LogisticRegression(random_state=42, max_iter=1000)
model_raw.fit(X_train_raw, y_train_raw)
training_time_raw = time.time() - start_time

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
y_pred_raw = model_raw.predict(X_test_raw)

In [13]:
accuracy_raw = accuracy_score(y_test_raw, y_pred_raw)
precision_raw = precision_score(y_test_raw, y_pred_raw)
recall_raw = recall_score(y_test_raw, y_pred_raw)
f1_raw = f1_score(y_test_raw, y_pred_raw)

In [14]:
print("Raw Data Results:")
print(f"Training Time: {training_time_raw:.4f} seconds")
print(f"Accuracy: {accuracy_raw:.4f}")
print(f"Precision: {precision_raw:.4f}")
print(f"Recall: {recall_raw:.4f}")
print(f"F1 Score: {f1_raw:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_raw, y_pred_raw))

Raw Data Results:
Training Time: 0.7220 seconds
Accuracy: 0.8033
Precision: 0.7429
Recall: 0.8966
F1 Score: 0.8125

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.72      0.79        32
           1       0.74      0.90      0.81        29

    accuracy                           0.80        61
   macro avg       0.81      0.81      0.80        61
weighted avg       0.82      0.80      0.80        61



## Normalized Data Training

In [15]:
scaler_norm = MinMaxScaler()
X_normalized = scaler_norm.fit_transform(X_raw)
X_normalized = pd.DataFrame(X_normalized, columns=X_raw.columns)

In [16]:
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_normalized, y_raw, test_size=0.2, random_state=42)

In [17]:
start_time = time.time()
model_norm = LogisticRegression(random_state=42, max_iter=1000)
model_norm.fit(X_train_norm, y_train_norm)
training_time_norm = time.time() - start_time

In [18]:
y_pred_norm = model_norm.predict(X_test_norm)

In [19]:
accuracy_norm = accuracy_score(y_test_norm, y_pred_norm)
precision_norm = precision_score(y_test_norm, y_pred_norm)
recall_norm = recall_score(y_test_norm, y_pred_norm)
f1_norm = f1_score(y_test_norm, y_pred_norm)

In [20]:
print("Normalized Data Results:")
print(f"Training Time: {training_time_norm:.4f} seconds")
print(f"Accuracy: {accuracy_norm:.4f}")
print(f"Precision: {precision_norm:.4f}")
print(f"Recall: {recall_norm:.4f}")
print(f"F1 Score: {f1_norm:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_norm, y_pred_norm))

Normalized Data Results:
Training Time: 0.0127 seconds
Accuracy: 0.8033
Precision: 0.7429
Recall: 0.8966
F1 Score: 0.8125

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.72      0.79        32
           1       0.74      0.90      0.81        29

    accuracy                           0.80        61
   macro avg       0.81      0.81      0.80        61
weighted avg       0.82      0.80      0.80        61



## Standardized Data Training

In [21]:
scaler_std = StandardScaler()
X_standardized = scaler_std.fit_transform(X_raw)
X_standardized = pd.DataFrame(X_standardized, columns=X_raw.columns)

In [22]:
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(X_standardized, y_raw, test_size=0.2, random_state=42)

In [23]:
start_time = time.time()
model_std = LogisticRegression(random_state=42, max_iter=1000)
model_std.fit(X_train_std, y_train_std)
training_time_std = time.time() - start_time

In [24]:
y_pred_std = model_std.predict(X_test_std)

In [25]:
accuracy_std = accuracy_score(y_test_std, y_pred_std)
precision_std = precision_score(y_test_std, y_pred_std)
recall_std = recall_score(y_test_std, y_pred_std)
f1_std = f1_score(y_test_std, y_pred_std)

In [26]:
print("Standardized Data Results:")
print(f"Training Time: {training_time_std:.4f} seconds")
print(f"Accuracy: {accuracy_std:.4f}")
print(f"Precision: {precision_std:.4f}")
print(f"Recall: {recall_std:.4f}")
print(f"F1 Score: {f1_std:.4f}")

Standardized Data Results:
Training Time: 0.0069 seconds
Accuracy: 0.7705
Precision: 0.7027
Recall: 0.8966
F1 Score: 0.7879


## Comparison of Preprocessing Techniques

Compare the performance metrics and training times across raw, normalized, and standardized data:

In [27]:
import pandas as pd

comparison_data = {
    'Preprocessing': ['Raw Data', 'Normalized Data', 'Standardized Data'],
    'Training Time (s)': [training_time_raw, training_time_norm, training_time_std],
    'Accuracy': [accuracy_raw, accuracy_norm, accuracy_std],
    'Precision': [precision_raw, precision_norm, precision_std],
    'Recall': [recall_raw, recall_norm, recall_std],
    'F1 Score': [f1_raw, f1_norm, f1_std]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

    Preprocessing  Training Time (s)  Accuracy  Precision   Recall  F1 Score
         Raw Data           0.722019  0.803279   0.742857 0.896552  0.812500
  Normalized Data           0.012664  0.803279   0.742857 0.896552  0.812500
Standardized Data           0.006912  0.770492   0.702703 0.896552  0.787879


## Insights and Questions

### How does training change if I don't scale features at all?
Without scaling, big numbers will ruin the training which then ruins everything.

### Which scaling method (MinMaxScaler vs StandardScaler) works better on this dataset?
StandardScaler since our number are too small already.

### Do categorical features need to be one-hot encoded, and how does that affect performance?
Yes, we dont want the the computer to treat 2 bigger than 1 since thats not what they mean.

### How sensitive is the neural network to changes in learning rate when features are scaled vs unscaled?
too sensitive and the learning rate is much faster.