In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn import neighbors
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
# Location of the assignment's heart-disease CSV (raw file served from GitHub).
url = "https://raw.githubusercontent.com/PratheepaJ/datasets/refs/heads/master/ass6-dataset.csv"

# Read the remote CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=url)

In [5]:
# Preview the first rows to check that the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


### Question 1    
According to the dataset description, the data contain 303 observations and 13 features, including age, sex, chest pain type, resting blood pressure, cholesterol level, fasting blood sugar, resting electrocardiographic results, and others. The target variable is `num`, which takes integer values from 0 to 4; we transform it into a binary label:

`1`: presence of heart disease(1-4)

`0`: no heart disease(0) 


### Question 2

In [8]:
# List every column's dtype and non-null count to spot type issues and missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB


In [None]:
# Columns that hold coded labels rather than measurements.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Convert each coded column to pandas' categorical dtype, one column at a time.
# NOTE(review): this cell is saved as `In [None]`, and the df.info() output recorded
# right after it still reports int64/float64 for these columns, so it appears the cast
# was never executed in the saved session. Re-run with Restart Kernel -> Run All.
for col in categorical_cols:
    df[col] = pd.Categorical(df[col])

In [10]:
# Re-inspect the dtypes: once the categorical cast has run, the coded columns
# should be reported as `category` here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB


In [11]:
# Numeric measurements that get standardized; coded columns are left untouched.
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Learn each column's mean and standard deviation, then overwrite the columns
# in `df` with their standardized values.
scaler = StandardScaler()
scaler.fit(df[continuous_cols])
df[continuous_cols] = scaler.transform(df[continuous_cols])
# NOTE(review): the scaler is fitted on every row of `df`. If a train/test split
# happens later in the notebook, the test rows have influenced these statistics;
# confirm whether fitting on the training split only is required.

In [12]:
# Confirm that the standardized columns are now stored as float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    float64
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(7), int64(7)
memory usage: 33.3 KB


### Question 3

In [13]:
# Preview the frame after preprocessing; the continuous columns now hold standardized values.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.948726,1,1,0.757525,-0.2649,1,2,0.017197,0,1.087338,3,0.0,6.0,0
1,1.392002,1,4,1.61122,0.760415,0,2,-1.821905,1,0.397182,2,3.0,3.0,2
2,1.392002,1,4,-0.6653,-0.342283,0,2,-0.902354,1,1.346147,2,2.0,7.0,1
3,-1.932564,1,3,-0.09617,0.063974,0,0,1.637359,0,2.122573,3,0.0,3.0,0
4,-1.489288,0,2,-0.09617,-0.825922,0,2,0.980537,0,0.310912,1,0.0,3.0,0


In [14]:
# (number of rows, number of columns) of the working frame.
df.shape

(303, 14)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_stats = df.describe()

# End the cell with the bare expression instead of print(): Jupyter then renders
# the frame as a table rather than wrapped, truncated plain text.
df_stats

                age         sex          cp      trestbps          chol  \
count  3.030000e+02  303.000000  303.000000  3.030000e+02  3.030000e+02   
mean  -1.465641e-18    0.679868    3.158416  4.426236e-16  2.345026e-16   
std    1.001654e+00    0.467299    0.960126  1.001654e+00  1.001654e+00   
min   -2.819115e+00    0.000000    1.000000 -2.145037e+00 -2.334877e+00   
25%   -7.135564e-01    0.000000    3.000000 -6.652997e-01 -6.905030e-01   
50%    1.729945e-01    1.000000    3.000000 -9.616980e-02 -1.101357e-01   
75%    7.270888e-01    1.000000    4.000000  4.729601e-01  5.476139e-01   
max    2.500191e+00    1.000000    4.000000  3.887739e+00  6.138485e+00   

              fbs     restecg       thalach       exang       oldpeak  \
count  303.000000  303.000000  3.030000e+02  303.000000  3.030000e+02   
mean     0.148515    0.990099 -1.172513e-16    0.326733  2.345026e-17   
std      0.356198    0.994971  1.001654e+00    0.469794  1.001654e+00   
min      0.000000    0.000000 -3

In [16]:
# Count the missing entries in each column (isna marks NaN cells; sum tallies them per column).
missing_values = df.isna().sum()
missing_values

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

The dataset is from the UCI Heart Disease Database and it includes 303 observations with 13 features and 1 target variable (num), which shows whether the patient has heart disease. 

There are missing values in variables `ca` (4 missing) and `thal` (2 missing).

The target variable `num` takes values in the range 0-4 and needs to be transformed into a binary label (0 = no heart disease, 1 = heart disease present).

| Variable Name | Data Type             | Description                                                      |
|---------------|-----------------------|------------------------------------------------------------------|
| age           | Continuous (Integer -> scaled)  | Age (years)                                                      |
| sex           | Categorical           | Gender (1 = male; 0 = female)                                             |
| cp            | Categorical           | Chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)                                                |
| trestbps      | Continuous (Integer -> scaled)  | Resting blood pressure (in mm Hg on admission to the hospital)                                   |
| chol          | Continuous (Integer -> scaled)  | Serum cholesterol in mg/dl                                       |
| fbs           | Categorical           | Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)                        |
| restecg       | Categorical           | Resting electrocardiographic results (0 = normal; 1 = having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV); 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)                             |
| thalach       | Continuous (Integer -> scaled)  | Maximum heart rate achieved                                      |
| exang         | Categorical           | Exercise-induced angina (1 = yes; 0 = no)                                 |
| oldpeak       | Continuous (Float -> scaled)    | ST depression induced by exercise relative to rest                |
| slope         | Categorical           | Slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)                           |
| ca            | Integer count, treated as categorical (with missing values; not scaled) | Number of major vessels (0-3) colored by fluoroscopy     |
| thal          | Categorical (with missing values) | Thalassemia type (3 = normal; 6 = fixed defect; 7 = reversible defect)                                       |
| num           | Integer               | Diagnosis of heart disease (0: no disease, 1-4: varying severity)|


### Question 4

In [17]:
# Collapse the 0-4 diagnosis code into a binary label:
# 0 stays 0 (no disease); any value above 0 becomes 1 (disease present).
df['num'] = (df['num'] > 0).astype('int64')

# Show the class balance of the new binary target, then preview the frame.
print(df['num'].value_counts())
df.head()

num
0    164
1    139
Name: count, dtype: int64


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.948726,1,1,0.757525,-0.2649,1,2,0.017197,0,1.087338,3,0.0,6.0,0
1,1.392002,1,4,1.61122,0.760415,0,2,-1.821905,1,0.397182,2,3.0,3.0,1
2,1.392002,1,4,-0.6653,-0.342283,0,2,-0.902354,1,1.346147,2,2.0,7.0,1
3,-1.932564,1,3,-0.09617,0.063974,0,0,1.637359,0,2.122573,3,0.0,3.0,0
4,-1.489288,0,2,-0.09617,-0.825922,0,2,0.980537,0,0.310912,1,0.0,3.0,0
