# Heart Disease Data Preprocessing and Cleaning

This notebook covers the initial data preprocessing steps for the Heart Disease UCI dataset, including:
- Data loading and exploration
- Missing value handling
- Data encoding for categorical variables
- Feature scaling
- Exploratory Data Analysis (EDA)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
# Suppress every warning for the rest of the session. This also hides library
# deprecation notices, so disable this line temporarily when debugging.
warnings.filterwarnings('ignore')

# Set style for better plots: first the matplotlib style sheet, then the
# seaborn colour palette.
# NOTE(review): the order looks intentional - applying a style sheet after
# set_palette() would presumably reset the colour cycle; confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Build the modelling table from the raw Heart Disease UCI export:
# read data/heart_disease_uci.csv, encode the text-valued columns as numbers,
# derive a binary target, impute gaps and write data/heart_disease.csv.
# Defines: df, df_processed, df_final, feature_columns and the *_mapping dicts.
print("Loading Heart Disease UCI Dataset...")

# Load the dataset
df = pd.read_csv('data/heart_disease_uci.csv')
print(f"Original dataset shape: {df.shape}")
print(f"Original columns: {list(df.columns)}")

# Data preprocessing for the real dataset
print("\nPreprocessing the real dataset...")

# Work on a copy so the raw frame `df` stays untouched.
df_processed = df.copy()

# --- Encode categorical variables -------------------------------------------
# Series.map() turns every value that is not a key of the mapping into NaN, so
# unexpected spellings become missing values and are imputed further down.

# Convert sex to binary (Male=1, Female=0)
df_processed['sex'] = df_processed['sex'].map({'Male': 1, 'Female': 0})

# Convert chest pain type to numeric
cp_mapping = {
    'typical angina': 0,
    'atypical angina': 1,
    'non-anginal': 2,
    'asymptomatic': 3,
}
df_processed['cp'] = df_processed['cp'].map(cp_mapping)

# Convert resting ECG to numeric
restecg_mapping = {
    'normal': 0,
    'lv hypertrophy': 1,
    'st-t abnormality': 2,
}
df_processed['restecg'] = df_processed['restecg'].map(restecg_mapping)

# Convert ST slope to numeric
slope_mapping = {
    'upsloping': 0,
    'flat': 1,
    'downsloping': 2,
}
df_processed['slope'] = df_processed['slope'].map(slope_mapping)

# Convert thalassemia to numeric
thal_mapping = {
    'normal': 1,
    'fixed defect': 2,
    'reversable defect': 3,
}
df_processed['thal'] = df_processed['thal'].map(thal_mapping)

# Convert fasting blood sugar and exercise-induced angina to binary.
# A lookup is used instead of astype(int): astype(int) raises as soon as the
# column contains a missing value, whereas map() leaves those entries as NaN
# so they can be imputed together with the other columns.
bool_mapping = {True: 1, False: 0}
df_processed['fbs'] = df_processed['fbs'].map(bool_mapping)
df_processed['exang'] = df_processed['exang'].map(bool_mapping)

# Use 'num' column as target (0=no disease, >0=disease)
# Convert to binary: 0=no disease, 1=disease
df_processed['target'] = (df_processed['num'] > 0).astype(int)

# Select relevant features (exclude id, dataset, and target columns)
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Create final dataset with selected features and target
df_final = df_processed[feature_columns + ['target']].copy()

# Handle missing values (replace with median for numerical columns).
# NOTE(review): the encoded categorical columns (cp, restecg, slope, thal, ...)
# are imputed with their median too, which can fall between two category
# codes; consider using the mode for those columns.
print(f"Missing values before imputation: {int(df_final.isnull().sum().sum())}")
numeric_cols = df_final.select_dtypes(include=[np.number]).columns
df_final[numeric_cols] = df_final[numeric_cols].fillna(df_final[numeric_cols].median())

# Save the processed dataset
df_final.to_csv('data/heart_disease.csv', index=False)

print(f"\nProcessed dataset shape: {df_final.shape}")
print(f"Features: {list(df_final.columns[:-1])}")
print(f"Target distribution: {df_final['target'].value_counts().to_dict()}")
print(f"Missing values: {df_final.isnull().sum().sum()}")
print("\nReal Heart Disease UCI dataset loaded and preprocessed!")

In [None]:
# Load the cleaned dataset from data/heart_disease.csv and print a quick
# overview: shape, column names, first rows, dtypes and summary statistics.
df = pd.read_csv('data/heart_disease.csv')

# Display basic information about the dataset
print("Dataset Overview:")
print("=" * 50)
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

print("\nBasic Statistics:")
print(df.describe())


In [None]:
# Missing-value audit: per-column count and percentage of null cells in `df`,
# followed by a bar chart of the affected columns (if there are any).
print("Missing Values Analysis:")
print("=" * 30)

missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Two-column summary table, indexed by column name.
missing_df = pd.concat(
    [
        missing_values.rename('Missing Count'),
        missing_percentage.rename('Missing Percentage'),
    ],
    axis=1,
)
has_gaps = missing_df['Missing Count'] > 0
print(missing_df.loc[has_gaps])

# Visualize missing values
if missing_values.sum() == 0:
    print("No missing values found in the dataset!")
else:
    plt.figure(figsize=(10, 6))
    # Only the columns that actually contain nulls are drawn.
    missing_values.loc[missing_values > 0].plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.ylabel('Number of Missing Values')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Median-impute every numeric column of `df` that contains nulls.
# The result is stored in a new frame, `df_processed`; `df` itself is not modified.
print("Handling Missing Values:")
print("=" * 25)

# Create a copy for preprocessing
df_processed = df.copy()

# Fill missing values with median for numerical columns
numerical_columns = df_processed.select_dtypes(include=[np.number]).columns
for col in numerical_columns:
    if df_processed[col].isnull().any():
        median_value = df_processed[col].median()
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on the column selection: that chained
        # in-place form operates on a temporary object and does not update
        # df_processed when pandas Copy-on-Write is active.
        df_processed[col] = df_processed[col].fillna(median_value)
        print(f"Filled {col} missing values with median: {median_value}")

# Verify no missing values remain
print(f"\nMissing values after preprocessing: {df_processed.isnull().sum().sum()}")

# Display the processed dataset info
print("\nProcessed Dataset Info:")
# info() prints its own report and returns None; print(info()) would add "None".
df_processed.info()


In [None]:
# Exploratory Data Analysis (EDA): a 2x3 overview figure built from
# `df_processed` - target balance, age / blood pressure / cholesterol box plots
# per target class, and sex / chest-pain counts per target class.
print("Exploratory Data Analysis:")
print("=" * 30)

# Display names for the encoded values. Tick and wedge labels are looked up
# from the values that are actually present instead of being assigned by
# position, so a label can never end up on the wrong group.
TARGET_LABELS = {0: 'No Heart Disease', 1: 'Heart Disease'}
SEX_LABELS = {0: 'Female', 1: 'Male'}
CP_LABELS = {0: 'Typical', 1: 'Atypical', 2: 'Non-anginal', 3: 'Asymptomatic'}


def _label_for(code, names):
    """Return the display name of an encoded value, or the value as text if it is unknown."""
    return names.get(code, str(code))


def _plot_counts_by_target(frame, column, names, title, ax, rotation=0):
    """Draw grouped bars of `column` counts per target class on `ax`.

    Returns the crosstab (rows: values of `column`, columns: target classes)
    that was plotted.
    """
    counts = pd.crosstab(frame[column], frame['target'])
    counts.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xticks(range(len(counts.index)))
    ax.set_xticklabels([_label_for(code, names) for code in counts.index],
                       rotation=rotation)
    return counts


fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_target, ax_age, ax_sex), (ax_cp, ax_bp, ax_chol) = axes

# 1. Target distribution
# value_counts() orders classes by frequency, so sort by class value and derive
# the wedge labels from the index; otherwise the labels are swapped whenever
# class 1 is more frequent than class 0.
target_counts = df_processed['target'].value_counts().sort_index()
ax_target.pie(
    target_counts.values,
    labels=[_label_for(code, TARGET_LABELS) for code in target_counts.index],
    autopct='%1.1f%%',
)
ax_target.set_title('Target Variable Distribution')

# 2. Age distribution by target
df_processed.boxplot(column='age', by='target', ax=ax_age)
ax_age.set_title('Age Distribution by Heart Disease')

# 3. Sex distribution by target
sex_target = _plot_counts_by_target(
    df_processed, 'sex', SEX_LABELS, 'Sex vs Heart Disease', ax_sex, rotation=0)

# 4. Chest pain type distribution
cp_target = _plot_counts_by_target(
    df_processed, 'cp', CP_LABELS, 'Chest Pain Type vs Heart Disease', ax_cp, rotation=45)

# 5. Blood pressure distribution
df_processed.boxplot(column='trestbps', by='target', ax=ax_bp)
ax_bp.set_title('Blood Pressure by Heart Disease')

# 6. Cholesterol distribution
df_processed.boxplot(column='chol', by='target', ax=ax_chol)
ax_chol.set_title('Cholesterol by Heart Disease')

# boxplot(by=...) puts a "Boxplot grouped by ..." super-title on the figure;
# clear it once after all box plots have been drawn.
fig.suptitle('')

fig.tight_layout()
plt.show()


In [None]:
# Correlation Analysis: heatmap of the pairwise correlations between all
# columns of `df_processed`, then a ranking of the features by the strength of
# their correlation with `target`.
correlation_matrix = df_processed.corr()

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the matrix is drawn.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 8))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

# Print correlation with target variable, strongest absolute value first.
print("Correlation with Target Variable:")
print("=" * 35)
feature_corr = correlation_matrix['target'].drop('target')
target_corr = feature_corr.sort_values(key=lambda values: values.abs(), ascending=False)
for feature in target_corr.index:
    corr = target_corr[feature]
    print(f"{feature:12}: {corr:6.3f}")


In [None]:
# Feature Scaling: standardise all predictor columns of `df_processed` and
# export the scaled features and the target as two CSV files.
print("Feature Scaling:")
print("=" * 15)

# Split the frame into the label column and the predictor matrix.
y = df_processed['target']
X = df_processed.drop(columns=['target'])

# Fit the scaler on the predictors, then transform those same rows.
# NOTE(review): the scaler is fitted on all rows; if a train/test split is done
# downstream, confirm this does not leak test-set statistics.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Wrap the scaled array in a DataFrame again, reusing the original column names.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

# Summary statistics before and after scaling.
print("Original features statistics:")
print(X.describe())
print("\nScaled features statistics:")
print(X_scaled_df.describe())

# Persist both parts without the index column.
X_scaled_df.to_csv('data/X_scaled.csv', index=False)
y.to_csv('data/y_target.csv', index=False)

print(f"\nPreprocessed data saved:")
print(f"Features shape: {X_scaled_df.shape}")
print(f"Target shape: {y.shape}")
target_distribution = y.value_counts().to_dict()
print(f"Target distribution: {target_distribution}")
