In [3]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the seed of NumPy's global random generator so reruns are repeatable
random_seed = 123
np.random.seed(seed=random_seed)

In [None]:
# Read the dataset from the CSV file and report its dimensions
df = pd.read_csv("aami2_rr.csv")
row_total, column_total = df.shape
print(f"Number of rows: {row_total} and Number of Columns: {column_total}")

# Preview the first few rows
print(df.head())

In [None]:
# Show how often each annotation symbol occurs
print(df['annotations'].value_counts())

# Binary target: 0 = normal heartbeat (annotation 'N'), 1 = anything else.
# A vectorized comparison replaces the row-wise apply/lambda. Rows whose
# annotation is missing compare unequal to 'N' and are labelled 1, which is
# what the lambda-based version produced as well.
df['label'] = df['annotations'].ne('N').astype(int)

# Label distribution. Reindexing on both classes guarantees that the lookups
# below succeed (with a count of 0) even when one class is absent from the
# data, instead of raising a KeyError.
label_count = df['label'].value_counts().reindex([0, 1], fill_value=0)
print(f"Normal (0):   {label_count[0]:,} samples ({label_count[0]/len(df)*100:.2f}%)")
print(f"Abnormal (1): {label_count[1]:,} samples ({label_count[1]/len(df)*100:.2f}%)")

# Contamination rate (for Isolation Forest): share of abnormal samples in the
# whole dataset.
# NOTE(review): scikit-learn's IsolationForest only accepts a float
# contamination in (0, 0.5] - confirm the value printed here is in that range.
contamination_rate = label_count[1] / len(df)
print(f"Contamination ({contamination_rate*100:.2f}%)")

The contamination parameter sets Isolation Forest's threshold for classifying samples as anomalies. Here it is set to the known proportion of abnormal samples, computed above from the full labeled dataset.

In [None]:
# Feature columns are the ones whose name begins with 'X'
feature_columns = [name for name in df.columns if name.startswith('X')]
n_features = len(feature_columns)
print(f"Number of features: {n_features}")

# Single selection of the feature columns, reused by the checks below
feature_frame = df[feature_columns]

# Total number of missing cells across all feature columns
missing = feature_frame.isna().sum().sum()
print(f"Number of missing values: {missing}")

# Overall value range across every feature (helps decide whether scaling is needed)
print(f"Minimum: {feature_frame.min().min():.4f}")
print(f"Maximum: {feature_frame.max().max():.4f}")

# Per-column variance for the first ten feature columns
print("Variance per feature (first 10):")
print(feature_frame[feature_columns[:10]].var())

In [None]:

def plot_sample_beat(ax, sample, title, color):
    """Draw one sample's feature values as a line on the given axes.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    sample : pandas.Series
        One row of the feature columns.
    title : str
        Title shown above the panel.
    color : str
        Line colour.
    """
    # Plot against numeric positions rather than passing the Series directly,
    # so the x-axis is numeric instead of being built from the column-name
    # index (which would produce one tick label per feature column).
    positions = np.arange(1, len(sample) + 1)
    ax.plot(positions, sample.to_numpy(), color=color, linewidth=1.5, alpha=0.7)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Feature Index (X1 to X150)')
    ax.set_ylabel('R-R Interval Value')
    ax.grid(True, alpha=0.3)


# First sample of each class; .iloc[0] raises IndexError if a class has no rows
normal_sample = df.loc[df['label'] == 0, feature_columns].iloc[0]
abnormal_sample = df.loc[df['label'] == 1, feature_columns].iloc[0]

# Side-by-side panels: normal beat on the left, abnormal beat on the right
fig, (ax_normal, ax_abnormal) = plt.subplots(1, 2, figsize=(12, 4))
plot_sample_beat(ax_normal, normal_sample, 'Normal Heartbeat Pattern (Sample)', 'green')
plot_sample_beat(ax_abnormal, abnormal_sample, 'Abnormal Heartbeat Pattern (Sample)', 'red')

fig.tight_layout()
plt.show()


In [None]:
# Let's create a summary
print("="*70)
print("SUMMARY - DATA EXPLORATION")
print("="*70)
print(f"\n✓ Total Samples: {len(df):,}")
print(f"✓ Features (R-R intervals): {n_features}")
print(f"✓ Normal heartbeats: {label_count[0]:,} ({label_count[0]/len(df)*100:.1f}%)")
print(f"✓ Abnormal heartbeats: {label_count[1]:,} ({label_count[1]/len(df)*100:.1f}%)")
print(f"✓ Contamination rate: {contamination_rate:.4f}")
print(f"✓ Feature ranges: {df[feature_columns].min().min():.2f} to {df[feature_columns].max().max():.2f}")
print(f"✓ Missing values: {missing}")
print(f"\n⚠ Key insight: Features have different variances → Need standardization!")
print(f"✓ Visual patterns differ between normal and abnormal")
print("\n" + "="*70)

# Step 2: Data Processing and Feature Scaling

## Objectives:
1. Split data into training and testing sets
2. Standardize features (zero mean, unit variance)
3. Understand why we fit scaler on training data only 
4. Prepare data for Random Forest and Isolation Forest 
5. Save processed data for model training

In [22]:
# Split the frame into the feature matrix (X) and the label vector (y),
# both as NumPy arrays
X = df.loc[:, feature_columns].to_numpy()
y = df.loc[:, 'label'].to_numpy()

print(f"Feature (X) shape: {X.shape}")
print(f"Label (y) shape: {y.shape}")

Feature (X) shape: (88788, 150)
Label (y) shape: (88788,)


In [30]:
from sklearn.model_selection import train_test_split 

# Hold out half of the samples for testing. Stratifying on y is requested so
# both halves keep the same class proportions; the shared seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=random_seed,
    stratify=y,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")




Training set: (44394, 150)
Test set: (44394, 150)


In [33]:
from sklearn.preprocessing import StandardScaler 

# Create the scaler and learn its statistics from the training data only
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the overall mean and standard deviation before and after scaling
for heading, matrix in (
    ("Original training data:", X_train),
    ("\nStandardized training data:", X_train_scaled),
    ("\nStandardized test data:", X_test_scaled),
):
    print(heading)
    print(f"  Mean: {matrix.mean():.4f}")
    print(f"  Std:  {matrix.std():.4f}")


Original training data:
  Mean: 0.0097
  Std:  0.3468

Standardized training data:
  Mean: 0.0000
  Std:  1.0000

Standardized test data:
  Mean: 0.0016
  Std:  1.0205


We fit the scaler exclusively on the training data to prevent information leakage from the test set, which is why the test-set statistics deviate slightly from 0 and 1.

In [35]:
import pickle

# Bundle everything the model-training step needs into one dictionary.
# Both label vectors are stored: the earlier version repeated the 'y_train'
# key and never saved y_test, so the test labels were missing from the file.
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'contamination_rate': contamination_rate,
    'feature_cols': feature_columns,
    'random_seed': random_seed
}

# Serialize the bundle to disk; the with-block closes the file afterwards
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)