In [89]:
import pandas as pd
import numpy as np

# Read the dataset from disk (adjust the path if the file lives elsewhere)
df = pd.read_csv('heart.csv')

# Preview the leading five records to get a feel for the layout
print(df.head(n=5))

# List the exact column labels; later cells refer to columns by these names
print("Columns in dataset:", df.columns, sep="\n")


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  
Columns in dataset:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [90]:
# Restrict the analysis to the columns that have a numeric dtype
numerical_cols = df.select_dtypes(include='number').columns

# Per-column spread measures: variance and standard deviation
variance = df.loc[:, numerical_cols].var()
std_dev = df.loc[:, numerical_cols].std()

# Each report is a heading line followed by the per-column values
print("\nVariance of numerical attributes:", variance, sep="\n")
print("\nStandard deviation of numerical attributes:", std_dev, sep="\n")



Variance of numerical attributes:
age           82.484558
sex            0.217166
cp             1.065132
trestbps     307.586453
chol        2686.426748
fbs            0.126877
restecg        0.276528
thalach      524.646406
exang          0.220707
oldpeak        1.348095
slope          0.379735
ca             1.045724
thal           0.374883
target         0.248836
dtype: float64

Standard deviation of numerical attributes:
age          9.082101
sex          0.466011
cp           1.032052
trestbps    17.538143
chol        51.830751
fbs          0.356198
restecg      0.525860
thalach     22.905161
exang        0.469794
oldpeak      1.161075
slope        0.616226
ca           1.022606
thal         0.612277
target       0.498835
dtype: float64


In [91]:
# Pairwise covariance between the numeric columns
cov_matrix = df.loc[:, numerical_cols].cov()
print("\nCovariance matrix:", cov_matrix, sep="\n")

# Pairwise correlation between the numeric columns (DataFrame.corr defaults)
corr_matrix = df.loc[:, numerical_cols].corr()
print("\nCorrelation matrix:", corr_matrix, sep="\n")



Covariance matrix:
                 age       sex        cp    trestbps         chol       fbs  \
age        82.484558 -0.416661 -0.643499   44.495902   100.585076  0.392433   
sex        -0.416661  0.217166 -0.023736   -0.463970    -4.780309  0.007475   
cp         -0.643499 -0.023736  1.065132    0.861714    -4.113774  0.034719   
trestbps   44.495902 -0.463970  0.861714  307.586453   111.967215  1.109042   
chol      100.585076 -4.780309 -4.113774  111.967215  2686.426748  0.245427   
fbs         0.392433  0.007475  0.034719    1.109042     0.245427  0.126877   
restecg    -0.555013 -0.014261  0.024108   -1.052324    -4.116703 -0.015769   
thalach   -82.903318 -0.469871  6.991618  -18.759131   -11.800494 -0.069897   
exang       0.413022  0.031014 -0.191168    0.557111     1.631991  0.004295   
oldpeak     2.214583  0.051993 -0.178821    3.934486     3.246794  0.002377   
slope      -0.944791 -0.008819  0.076137   -1.312832    -0.128964 -0.013147   
ca          2.566356  0.056357 -

In [92]:
# Absolute-correlation cutoff: a pair is reported when |corr| is strictly above it
threshold = 0.8
high_corr_pairs = []

# Walk the strict upper triangle of the correlation matrix so that each
# unordered pair is visited once and the diagonal (self-correlation) is skipped.
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        corr_val = corr_matrix.iloc[i, j]
        # NaN correlations compare False here and are therefore never reported
        if abs(corr_val) > threshold:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))

# Build the headline from `threshold` so the text cannot drift from the cutoff in use
print(f"\nHighly correlated feature pairs (|corr| > {threshold}):")
for pair in high_corr_pairs:
    print(pair)



Highly correlated feature pairs (|corr| > 0.8):


In [93]:
# Collect the names of features whose variance is below 0.01
low_variance_features = variance.index[variance < 0.01].tolist()
print("\nLow variance (near-constant) features:", low_variance_features, sep="\n")



Low variance (near-constant) features:
[]


In [94]:
# Number of quantile-based buckets to cut `age` into
bins = 4

# labels=False stores the integer bucket index rather than an interval label
df['age_binned'] = pd.qcut(df.loc[:, 'age'], q=bins, labels=False)

# Show the raw age next to its bucket for the first ten rows
print("\nAge discretization sample:", df.loc[:, ['age', 'age_binned']].head(10), sep="\n")



Age discretization sample:
   age  age_binned
0   63           3
1   37           0
2   41           0
3   56           2
4   57           2
5   57           2
6   56           2
7   44           0
8   52           1
9   57           2


In [95]:
# Define normalization functions

def min_max_norm(col):
    """Rescale `col` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters:
        col: numeric column exposing `.min()` / `.max()` and elementwise
            arithmetic (the notebook passes pandas Series).

    Returns:
        A column of the same length holding (col - min) / (max - min).
        When every value is identical the range is zero; the result is then
        0.0 for each value instead of the NaN produced by dividing 0 by 0.
    """
    low = col.min()
    span = col.max() - low
    if span == 0:
        # Constant column: multiplying the (all-zero) offsets by 0.0 yields
        # float zeros while keeping any missing entries missing.
        return (col - low) * 0.0
    return (col - low) / span

def z_score_norm(col):
    """Standardize `col` by subtracting its mean and dividing by its std.

    Parameters:
        col: numeric column exposing `.mean()` / `.std()` and elementwise
            arithmetic (the notebook passes pandas Series, whose `.std()`
            defaults to the sample standard deviation).

    Returns:
        A column of the same length holding (col - mean) / std.
        When the standard deviation is exactly zero (all values identical)
        the result is 0.0 for each value instead of the NaN produced by
        dividing 0 by 0. An undefined (NaN) standard deviation is not
        special-cased and still propagates NaN.
    """
    centre = col.mean()
    spread = col.std()
    if spread == 0:
        # Constant column: every offset is zero, so report a z-score of 0.0
        return (col - centre) * 0.0
    return (col - centre) / spread

def decimal_scaling_norm(col):
    """Normalize `col` by decimal scaling: divide by 10**j.

    `j` is the smallest non-negative integer for which the largest absolute
    value of the result is below 1, i.e. the number of digits in the integer
    part of max(|col|), or 0 when that maximum is already below 1.

    Parameters:
        col: numeric column exposing `.min()` / `.max()` and elementwise
            division (the notebook passes pandas Series).

    Returns:
        A column of the same length holding col / 10**j.
    """
    max_abs = max(abs(col.min()), abs(col.max()))
    if not max_abs >= 1:
        # Values already lie strictly inside (-1, 1), so no shift is needed.
        # The negated comparison also catches a NaN maximum (all-missing or
        # empty input), which int() below would reject with an error.
        j = 0
    else:
        # Count the digits of the integer part, e.g. 564 -> 3 -> divide by 1000
        j = len(str(int(max_abs)))
    return col / (10 ** j)

# Columns to rescale; these must match the dataset's column labels exactly
attributes = ['trestbps', 'chol', 'thalach']

# (column-name suffix, normalizer) pairs, in the order the new columns are added
scalers = (
    ('minmax', min_max_norm),
    ('zscore', z_score_norm),
    ('decimal', decimal_scaling_norm),
)

# For every attribute, append one derived column per normalization method
for attr in attributes:
    for suffix, scale in scalers:
        df[f'{attr}_{suffix}'] = scale(df[attr])

# Display the originals first, then the derived columns grouped by method
shown_cols = list(attributes) + [f'{a}_{tag}' for tag, _ in scalers for a in attributes]
print("\nNormalized values sample:")
print(df[shown_cols].head())



Normalized values sample:
   trestbps  chol  thalach  trestbps_minmax  chol_minmax  thalach_minmax  \
0       145   233      150         0.481132     0.244292        0.603053   
1       130   250      187         0.339623     0.283105        0.885496   
2       130   204      172         0.339623     0.178082        0.770992   
3       120   236      178         0.245283     0.251142        0.816794   
4       120   354      163         0.245283     0.520548        0.702290   

   trestbps_zscore  chol_zscore  thalach_zscore  trestbps_decimal  \
0         0.762694    -0.255910        0.015417             0.145   
1        -0.092585     0.072080        1.630774             0.130   
2        -0.092585    -0.815424        0.975900             0.130   
3        -0.662770    -0.198030        1.237849             0.120   
4        -0.662770     2.078611        0.582975             0.120   

   chol_decimal  thalach_decimal  
0         0.233            0.150  
1         0.250            0.18