# Ischemic Heart Disease (IHD) Diagnosis - Data Exploration

This notebook explores the dataset for the IHD diagnosis project.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# first, then make 'viridis' the default seaborn color palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Ignore warnings
# NOTE(review): this filter has no category/module restriction, so it silences
# every warning for the rest of the session (including library deprecation
# notices). Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Load Data

Load the dataset from the SPSS file.

In [None]:
# Read the SPSS study file into a DataFrame
data = pd.read_spss('../data/ASDS_Study_Data.sav')

# Report the (rows, columns) shape, followed by the per-column summary
print("Dataset shape: {}".format(data.shape))
data.info()

## Clean Column Names

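Replace the original column names with descriptive, standardized names. The new names are assigned by position, so their order must match the column order of the loaded file.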

In [None]:
# Descriptive replacement names, listed in the same order as the columns of
# the loaded file (the rename below is positional, not by original name).
new_columns = [
    'Age (Years)', 'Sex (Male/Female)', 'Occupation Type', 'Education Level', 'Economic Status',
    'Height (cm)', 'Weight (kg)', 'Systolic Blood Pressure (mmHg)', 'Diastolic Blood Pressure (mmHg)',
    'Random Blood Sugar (mg/dL)', 'Smoking Status', 'Hypertension (HTN) Status',
    'Diabetes Mellitus (DM) Status', 'Dyslipidemia Status', 'Stroke Status', 'Ischemic Heart Disease (IHD) Status',
    'Age Group', 'Body Mass Index (BMI) Group', 'Hypertension Stage'
]

# Check if the lengths match, and stop with a descriptive error if they do not
# (a positional rename with the wrong number of names cannot be correct).
n_old, n_new = len(data.columns), len(new_columns)
print(f"Old Columns: {n_old}, New Columns: {n_new}")
if n_old != n_new:
    raise ValueError(
        f"Cannot rename columns: the dataset has {n_old} columns but "
        f"{n_new} new names were provided. Existing columns: {list(data.columns)}"
    )

# Rename columns
data.columns = new_columns

# Display column names
data.columns

## Check Missing Values

Check for missing values in the dataset.

In [None]:
# Count the null entries in every column
missing_values = data.isnull().sum()

# Show only the columns that actually contain nulls
print("Missing Values in Each Column:")
has_missing = missing_values > 0
print(missing_values[has_missing])

# Grand total of nulls is the sum of the per-column counts
total_missing = missing_values.sum()
print(f"\nTotal Missing Values in the Dataset: {total_missing}")

## Handle Missing Values

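Impute missing values in the `Random Blood Sugar (mg/dL)` column with the column median, then re-check the dataset for any remaining missing values.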

In [None]:
# Fill missing values
# Impute Random Blood Sugar with its median. The result is assigned back to
# the column instead of calling fillna(..., inplace=True) on the selected
# column: in-place mutation through a column selection is deprecated in
# pandas 2.x and does not update the parent frame under Copy-on-Write.
rbs_col = 'Random Blood Sugar (mg/dL)'
if rbs_col in data.columns:
    data[rbs_col] = data[rbs_col].fillna(data[rbs_col].median())

# Check for missing values again
missing_values = data.isnull().sum()
print("Missing Values in Each Column:")
print(missing_values[missing_values > 0])

# Display the total number of missing values
total_missing = missing_values.sum()
print(f"\nTotal Missing Values in the Dataset: {total_missing}")

## Descriptive Statistics

Calculate descriptive statistics for the dataset.

In [None]:
# Summary statistics for the numeric columns, one row per column
data.describe(include=['number']).transpose()

In [None]:
# Summary statistics for the categorical/object columns, one row per column
data.describe(include=['category', 'object']).transpose()

## Target Variable Distribution

Explore the distribution of the target variable.

In [None]:
# Class counts of the target, and each class's share of the total in percent
target_counts = data['Ischemic Heart Disease (IHD) Status'].value_counts()
target_percentage = target_counts.div(target_counts.sum()).mul(100)

# Combine counts and percentages into one table for display
target_distribution = pd.DataFrame(
    {'Count': target_counts, 'Percentage': target_percentage}
)

print("Target Variable Distribution:")
print(target_distribution)

In [None]:
# Plot target variable distribution
fig, ax = plt.subplots(figsize=(10, 6))

# Create bar plot.
# The bar order is pinned to the order of target_counts. Without `order`,
# seaborn arranges the bars by category order (or sorted value), which can
# differ from the count-sorted order of value_counts() and would place the
# text labels below on the wrong bars.
class_order = list(target_counts.index)
sns.barplot(
    x=target_counts.index,
    y=target_counts.values,
    order=class_order,
    palette='viridis',
    ax=ax,
)

# Add count and percentage labels; position i matches class_order[i]
for i, (count, percentage) in enumerate(zip(target_counts, target_percentage)):
    ax.text(i, count + 5, f'{count} ({percentage:.1f}%)', ha='center')

# Set labels
ax.set_title('Ischemic Heart Disease (IHD) Status Distribution')
ax.set_xlabel('Ischemic Heart Disease (IHD) Status')
ax.set_ylabel('Count')

plt.show()

## Categorical Features Distribution

Explore the distribution of categorical features.

In [None]:
# Categorical features to compare against the target
categorical_cols = [
    'Sex (Male/Female)',
    'Occupation Type',
    'Education Level',
    'Economic Status',
    'Smoking Status',
    'Hypertension (HTN) Status',
    'Diabetes Mellitus (DM) Status',
    'Dyslipidemia Status',
    'Stroke Status',
    'Age Group',
    'Body Mass Index (BMI) Group',
    'Hypertension Stage',
]

# One grouped count plot per feature, split by IHD status
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 6))

    sns.countplot(
        x=data[col],
        hue=data['Ischemic Heart Disease (IHD) Status'],
        palette='viridis',
        ax=ax,
    )

    ax.set_title(f'{col} vs Ischemic Heart Disease (IHD) Status')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

    # Tilt the tick labels when the feature has more than five distinct values
    n_levels = len(data[col].unique())
    if n_levels > 5:
        plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

## Numerical Features Distribution

Explore the distribution of numerical features.

In [None]:
# Numerical features to inspect
numerical_cols = [
    'Age (Years)',
    'Height (cm)',
    'Weight (kg)',
    'Systolic Blood Pressure (mmHg)',
    'Diastolic Blood Pressure (mmHg)',
    'Random Blood Sugar (mg/dL)',
]

# For each feature: histogram with KDE on the left, box plot on the right,
# both split by IHD status
for col in numerical_cols:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: distribution per target class
    sns.histplot(
        data=data,
        x=col,
        hue='Ischemic Heart Disease (IHD) Status',
        kde=True,
        palette='viridis',
        ax=ax_hist,
    )
    ax_hist.set_title(f'Distribution of {col}')
    ax_hist.set_xlabel(col)
    ax_hist.set_ylabel('Count')

    # Right panel: spread per target class
    sns.boxplot(
        data=data,
        x='Ischemic Heart Disease (IHD) Status',
        y=col,
        palette='viridis',
        ax=ax_box,
    )
    ax_box.set_title(f'{col} by Ischemic Heart Disease (IHD) Status')
    ax_box.set_xlabel('Ischemic Heart Disease (IHD) Status')
    ax_box.set_ylabel(col)

    plt.tight_layout()
    plt.show()

## Correlation Analysis

Analyze correlations between features.

In [None]:
# Calculate correlation matrix over the numeric columns only.
# Correlation is undefined for categorical/object columns, and pandas >= 2.0
# raises if they are passed to corr(), so they are filtered out explicitly.
corr = data.select_dtypes(include='number').corr()

# Plot correlation matrix
plt.figure(figsize=(12, 10))
# Hide the upper triangle (including the diagonal): the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    vmin=-1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    fmt=".2f"
)

plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

## Feature Engineering

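Derive three new features from the existing measurements: body mass index (BMI) from height and weight, and pulse pressure and mean arterial pressure from the systolic and diastolic blood pressure.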

In [None]:
# BMI = weight (kg) / height (m)^2; height is stored in centimetres
height_m = data['Height (cm)'] / 100
data['BMI'] = data['Weight (kg)'] / (height_m ** 2)
print(f"BMI statistics:\n{data['BMI'].describe()}")

# Pulse pressure = systolic - diastolic
systolic = data['Systolic Blood Pressure (mmHg)']
diastolic = data['Diastolic Blood Pressure (mmHg)']
data['Pulse Pressure'] = systolic - diastolic
print(f"\nPulse Pressure statistics:\n{data['Pulse Pressure'].describe()}")

# Mean arterial pressure = diastolic + one third of the pulse pressure
data['Mean Arterial Pressure'] = diastolic + (1/3) * (systolic - diastolic)
print(f"\nMean Arterial Pressure statistics:\n{data['Mean Arterial Pressure'].describe()}")

## Outlier Detection

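Detect outliers in the numerical features using the interquartile range (IQR) rule: a value is flagged when it lies below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR.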

In [None]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Flag rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column. Values
    exactly on a fence are not flagged, and missing values are never flagged
    (comparisons with NaN are False).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of a numeric column in ``data``.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 x IQR rule.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of rows of ``data`` outside the fences.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = data[is_outlier]
    return outliers, lower_bound, upper_bound

# Report IQR outliers for every numerical column, and draw a box plot with
# the bounds marked for each column that has at least one outlier
for col in numerical_cols:
    outliers, lower_bound, upper_bound = detect_outliers_iqr(data, col)
    n_outliers = len(outliers)

    print(f"\nOutliers in {col}:")
    print(f"Number of outliers: {n_outliers}")
    print(f"Lower bound: {lower_bound:.2f}")
    print(f"Upper bound: {upper_bound:.2f}")

    # Nothing to plot when the column has no outliers
    if n_outliers == 0:
        continue

    fig, ax = plt.subplots(figsize=(10, 6))

    sns.boxplot(x=data[col], palette='viridis', ax=ax)

    # Mark both bounds with dashed red vertical lines
    for side, bound in (('Lower', lower_bound), ('Upper', upper_bound)):
        ax.axvline(x=bound, color='r', linestyle='--', label=f'{side} Bound: {bound:.2f}')

    ax.set_title(f'Boxplot of {col} with Outlier Bounds')
    ax.set_xlabel(col)
    ax.legend()

    plt.tight_layout()
    plt.show()

## Conclusion

Summarize the findings from the exploratory data analysis.

In [None]:
# Headline numbers for the dataset
n_samples = len(data)
n_features = len(data.columns) - 1  # every column except the target
print("Dataset Summary:")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print("Target variable: Ischemic Heart Disease (IHD) Status")
print(f"Target distribution: {dict(target_counts)}")
print(f"Missing values: {total_missing}")

# Placeholder list of key findings, to be filled in by the analyst
print("\nKey Findings:")
for finding_no in range(1, 4):
    print(f"{finding_no}. [Add your findings here]")