# Handling Missing Data

In [None]:
# SETUP: Import libraries and configure display
import pandas as pd
import numpy as np


# Pandas display settings, chosen so printed tables are easier to read
pd.options.display.max_columns = None  # None: no limit on the number of columns shown
pd.options.display.width = None        # None: let pandas try to detect the display width
pd.options.display.precision = 2       # decimal places shown for floating-point values


In [None]:
# Input file path (origin: www.kaggle.com/)
input_file = "https://rcs.bu.edu/examples/python/DataAnalysis/Heart_Disease_Prediction.csv"

# Read the complete CSV file into a DataFrame (no row limit such as nrows is applied)
df = pd.read_csv(input_file)

# Replace spaces with underscores in all column names that have spaces.
# regex=False makes this a plain literal substitution, independent of the
# pandas version's default for the `regex` argument.
df.columns = df.columns.str.replace(' ', '_', regex=False)

Let's run the `describe()` method again and inspect the output, paying particular attention to the `count` row:

In [None]:
# Summary statistics; the `count` row reports the number of non-missing values per column
df.describe()

We can see that the number of non-missing observations varies between different columns (variables or "features").

---

##  Detecting Missing Data

Missing data (also called **null**, **NA**, or **NaN** values) is common in biological datasets and can arise from:
- Measurement failures or equipment errors
- Samples that were not collected
- Data entry errors or omissions
- Privacy concerns (redacted values)
- Biological constraints (e.g., some tests not applicable to all patients)

Let's explore methods to identify missing data in our dataset.

Using **`isnull()` or `isna()`** 

Both methods are equivalent and return `True` for missing values, `False` for non-missing values.

In [None]:
# Element-wise missing-value check on one column (here 'BP'):
# isnull() yields True where the value is missing; only the first 10 rows are printed
print("Missing values in the BP column:", df['BP'].isnull().head(10), sep="\n")



In [None]:
# Alternative: isna() does the same thing as isnull().
# head() is called without an argument here, so it shows the first 5 rows.
df['BP'].isna().head()

Datasets can be very large, so it is impractical to find every missing value by scanning the output by eye.
Instead, we can select the rows where the value is missing and read their row labels from the `index` attribute:

In [None]:
# Find the indices (row labels) of the rows whose 'BP' value is missing:
# the boolean mask selects those rows, .index returns their labels
df.loc[df['BP'].isnull()].index

In most cases, it's important to count the missing values. We will chain the `sum()` method after `isnull()`.
Because `True` counts as 1 and `False` as 0, `.sum()` returns the number of `True` values (missing data) in each column.

In [None]:
# Count missing values per column: isnull() flags each cell as True/False,
# and sum() adds up the True values column by column
missing_counts = df.isnull().sum()
print("Missing values per column:", missing_counts, sep="\n")

# Adding up the per-column counts gives the total for the whole dataset
print(f"\nTotal missing values in dataset: {missing_counts.sum()}")

Even more informative than raw counts is the **percentage of missing data**, especially for large datasets.

In [None]:
# Percentage of missing data per column:
# (number of missing values / number of rows) * 100
df.isnull().sum().div(len(df)).mul(100)

## Removing rows with missing values

In [None]:
# Build a small demonstration dataset with deliberately missing values.
# The seed fixes NumPy's global random state; nothing in this cell draws random numbers.
np.random.seed(42)

sample_data = dict(
    patient_id=range(1, 11),
    age=[25, 30, np.nan, 45, 50, np.nan, 60, 65, 70, 75],
    cholesterol=[180, np.nan, 220, 230, np.nan, 250, 260, 270, np.nan, 290],
    blood_pressure=[120, 125, 130, np.nan, 140, 145, np.nan, 155, 160, 165],
    treatment_group=['A', 'B', 'A', np.nan, 'B', 'A', 'B', np.nan, 'A', 'B'],
)

df_sample = pd.DataFrame(data=sample_data)

# Show the table itself, followed by the number of missing values in each column
print(
    "Sample dataset with missing values:",
    df_sample,
    "\nMissing value counts:",
    df_sample.isnull().sum(),
    sep="\n",
)

In [None]:
# Drop rows with ANY missing value (how='any' is dropna's default, spelled out here)
df_drop_any = df_sample.dropna(how='any')
print(
    f"Original rows: {len(df_sample)}",
    f"After dropping rows with any NaN: {len(df_drop_any)}",
    "\nResulting DataFrame:",
    df_drop_any,
    sep="\n",
)

In [None]:
# Drop rows only if ALL values are missing; a row with at least one value is kept
df_drop_all = df_sample.dropna(axis=0, how='all')
print(
    f"\nAfter dropping rows where all values are NaN the number of rows is: {len(df_drop_all)}\n",
    df_drop_all,
    sep="\n",
)

In [None]:
# Drop rows with missing values in SPECIFIC columns: only 'age' and 'cholesterol'
# are checked, so NaN in any other column does not cause a row to be dropped
df_drop_subset = df_sample.dropna(axis=0, subset=['age', 'cholesterol'])
print(
    f"\nAfter dropping rows with NaN in age or cholesterol: {len(df_drop_subset)}\n",
    df_drop_subset,
    sep="\n",
)

## Fill Missing Values with Statistical Measures (Mean, Median, Mode)

In [None]:
# Work on a copy so that df_sample keeps its missing values for comparison
df_filled = df_sample.copy()

# Fill with MEAN (for numerical columns, sensitive to outliers).
# mean() skips NaN by default, so the average is taken over the observed ages only.
age_mean = df_sample['age'].mean()
df_filled['age'] = df_sample['age'].fillna(age_mean)

# Show the column before and after imputation side by side, one row per record.
# (Passing both Series to a single print() would join their text with a space,
# which makes the two columns hard to tell apart.)
print("Age column before and after filling with mean:\n")
print(pd.DataFrame({'before': df_sample['age'], 'after': df_filled['age']}))

In [None]:
# Fill with MEDIAN (for numerical columns, robust to outliers).
# The median is computed from the original column and written into the working copy.
chol_median = df_sample['cholesterol'].median()
df_filled['cholesterol'] = df_sample['cholesterol'].fillna(value=chol_median)
print("\nCholesterol after filling with median:\n", df_filled['cholesterol'], sep="\n")

In [None]:
# Fill with MODE (for categorical data - most frequent value).
# mode() returns a Series because several values can tie for most frequent
# (in df_sample, 'A' and 'B' each appear four times); the first entry is used.
mode_value = df_sample['treatment_group'].mode().iloc[0]

df_filled['treatment_group'] = df_sample['treatment_group'].fillna(value=mode_value)
print(
    f"\nTreatment group after filling with mode ('{mode_value}'):",
    df_filled['treatment_group'],
    sep="\n",
)

# Separator line, then the fully imputed table
print(
    f"\n{'=' * 65}",
    "Complete DataFrame after imputation:",
    df_filled,
    sep="\n",
)