# Handling Missing Data

In [1]:
# SETUP: Import libraries and configure display
import pandas as pd
import numpy as np


# Pandas display settings used for the rest of the notebook:
#   display.max_columns = None -> never truncate the list of columns
#   display.width       = None -> let pandas detect the output width
#   display.precision   = 2    -> show floats with two decimal places
display_options = {
    'display.max_columns': None,
    'display.width': None,
    'display.precision': 2,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [2]:
# Input file path (dataset originally published on www.kaggle.com)
input_file = "https://rcs.bu.edu/examples/python/DataAnalysis/Heart_Disease_Prediction.csv"

# Read the complete CSV file into a DataFrame (no row limit is applied;
# pass nrows=... to read_csv if only the first records are wanted)
df = pd.read_csv(input_file)

# Replace spaces with underscores in all column names that have spaces.
# regex=False states explicitly that ' ' is a literal string, not a pattern.
df.columns = df.columns.str.replace(' ', '_', regex=False)

Let's execute the `describe()` method again and inspect the output, paying particular attention to the `count` row:

In [3]:
# Summary statistics per column; by default describe() reports only the
# numeric columns, and its `count` row gives the number of non-missing values.
df.describe()

Unnamed: 0,Age,Sex,Chest_pain_type,BP,Cholesterol,FBS_over_120,EKG_results,Max_HR,Exercise_angina,ST_depression,Slope_of_ST,Number_of_vessels_fluro,Thallium
count,269.0,270.0,269.0,266.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.44,0.68,3.17,131.37,249.66,0.15,1.02,149.68,0.33,1.05,1.59,0.67,4.7
std,9.13,0.47,0.95,17.98,51.69,0.36,1.0,23.17,0.47,1.15,0.61,0.94,1.94
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


We can see that the number of non-missing observations varies between different columns (variables or "features").

---

##  Detecting Missing Data

Missing data (also called **null**, **NA**, or **NaN** values) is common in biological datasets and can arise from:
- Measurement failures or equipment errors
- Samples that were not collected
- Data entry errors or omissions
- Privacy concerns (redacted values)
- Biological constraints (e.g., some tests not applicable to all patients)

Let's explore methods to identify missing data in our dataset.

Using **`isnull()` or `isna()`** 

Both methods are equivalent and return `True` for missing values, `False` for non-missing values.

In [4]:
# Build a boolean mask for one column (e.g., 'BP'): True marks a missing value.
bp_missing_mask = df['BP'].isnull()

# Show the header and the first ten mask entries, one below the other.
print("Missing values in the BP column:", bp_missing_mask.head(10), sep="\n")



Missing values in the BP column:
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: BP, dtype: bool


In [5]:
# isna() is an alias of isnull(); preview the first five entries of the mask
df['BP'].isna().head(n=5)

0    False
1    False
2    False
3    False
4    False
Name: BP, dtype: bool

The dataset can be very large, so it would be hard to find every missing value by visually scanning the output.
Instead, we can use the boolean mask returned by `isnull()` to select the affected rows and read their row labels from the `index` attribute:

In [6]:
# Find the indices of rows with missing values in the 'BP' column:
# select the rows where the mask is True, then read their row labels.
bp_missing_mask = df['BP'].isnull()
df.loc[bp_missing_mask].index

Index([14, 21, 26, 39], dtype='int64')

In most cases, it's important to know how many values are missing. We will use the `sum()` method for this.
Applying `.sum()` to the result of `isnull()` counts the number of `True` values (missing data) in each column, because `True` is treated as 1 and `False` as 0.

In [7]:
# Per-column number of missing values (True counts as 1 when summed)
missing_counts = df.isnull().sum()

# Grand total: add up the per-column counts computed above
total_missing = missing_counts.sum()

print("Missing values per column:")
print(missing_counts)

print(f"\nTotal missing values in dataset: {total_missing}")

Missing values per column:
Age                        1
Sex                        0
Chest_pain_type            1
BP                         4
Cholesterol                0
FBS_over_120               0
EKG_results                0
Max_HR                     0
Exercise_angina            0
ST_depression              0
Slope_of_ST                0
Number_of_vessels_fluro    0
Thallium                   0
Heart_Disease              0
dtype: int64

Total missing values in dataset: 6


Even more informative than simple raw counts is the **percentage of missing data**, especially for large datasets.

In [8]:
# Percentage of missing data per column:
# missing count divided by the number of rows, scaled to percent
df.isna().sum().div(len(df)).mul(100)

Age                        0.37
Sex                        0.00
Chest_pain_type            0.37
BP                         1.48
Cholesterol                0.00
FBS_over_120               0.00
EKG_results                0.00
Max_HR                     0.00
Exercise_angina            0.00
ST_depression              0.00
Slope_of_ST                0.00
Number_of_vessels_fluro    0.00
Thallium                   0.00
Heart_Disease              0.00
dtype: float64

## Removing rows with missing values

In [9]:
# Build a small demonstration dataset with deliberately placed missing values
np.random.seed(42)

# Ten patients; np.nan marks a missing entry in both the numeric columns
# and the categorical treatment_group column.
sample_data = dict(
    patient_id=range(1, 11),
    age=[25, 30, np.nan, 45, 50, np.nan, 60, 65, 70, 75],
    cholesterol=[180, np.nan, 220, 230, np.nan, 250, 260, 270, np.nan, 290],
    blood_pressure=[120, 125, 130, np.nan, 140, 145, np.nan, 155, 160, 165],
    treatment_group=['A', 'B', 'A', np.nan, 'B', 'A', 'B', np.nan, 'A', 'B'],
)

df_sample = pd.DataFrame(sample_data)

# Show the data, then the number of missing values in each column
print("Sample dataset with missing values:", df_sample, sep="\n")
print("\nMissing value counts:", df_sample.isnull().sum(), sep="\n")

Sample dataset with missing values:
   patient_id   age  cholesterol  blood_pressure treatment_group
0           1  25.0        180.0           120.0               A
1           2  30.0          NaN           125.0               B
2           3   NaN        220.0           130.0               A
3           4  45.0        230.0             NaN             NaN
4           5  50.0          NaN           140.0               B
5           6   NaN        250.0           145.0               A
6           7  60.0        260.0             NaN               B
7           8  65.0        270.0           155.0             NaN
8           9  70.0          NaN           160.0               A
9          10  75.0        290.0           165.0               B

Missing value counts:
patient_id         0
age                2
cholesterol        3
blood_pressure     2
treatment_group    2
dtype: int64


In [10]:
# Keep only the rows that are complete, i.e. drop a row as soon as it
# contains at least one missing value (how='any' is dropna's default).
df_drop_any = df_sample.dropna(how='any')

rows_before, rows_after = len(df_sample), len(df_drop_any)
print(f"Original rows: {rows_before}")
print(f"After dropping rows with any NaN: {rows_after}")
print("\nResulting DataFrame:")
print(df_drop_any)

Original rows: 10
After dropping rows with any NaN: 2

Resulting DataFrame:
   patient_id   age  cholesterol  blood_pressure treatment_group
0           1  25.0        180.0           120.0               A
9          10  75.0        290.0           165.0               B


In [11]:
# how='all' removes a row only when every one of its values is missing
df_drop_all = df_sample.dropna(how='all')

rows_remaining = len(df_drop_all)
print(f"\nAfter dropping rows where all values are NaN the number of rows is: {rows_remaining}\n")
print(df_drop_all)


After dropping rows where all values are NaN the number of rows is: 10

   patient_id   age  cholesterol  blood_pressure treatment_group
0           1  25.0        180.0           120.0               A
1           2  30.0          NaN           125.0               B
2           3   NaN        220.0           130.0               A
3           4  45.0        230.0             NaN             NaN
4           5  50.0          NaN           140.0               B
5           6   NaN        250.0           145.0               A
6           7  60.0        260.0             NaN               B
7           8  65.0        270.0           155.0             NaN
8           9  70.0          NaN           160.0               A
9          10  75.0        290.0           165.0               B


In [12]:
# Only these columns are checked for missing values; NaN in any other
# column does not cause a row to be dropped.
required_columns = ['age', 'cholesterol']
df_drop_subset = df_sample.dropna(subset=required_columns)

rows_remaining = len(df_drop_subset)
print(f"\nAfter dropping rows with NaN in age or cholesterol: {rows_remaining}\n")
print(df_drop_subset)


After dropping rows with NaN in age or cholesterol: 5

   patient_id   age  cholesterol  blood_pressure treatment_group
0           1  25.0        180.0           120.0               A
3           4  45.0        230.0             NaN             NaN
6           7  60.0        260.0             NaN               B
7           8  65.0        270.0           155.0             NaN
9          10  75.0        290.0           165.0               B


## Fill Missing Values with Statistical Measures (Mean, Median, Mode)

In [13]:
# Work on a copy so that df_sample keeps its original missing values
df_filled = df_sample.copy()

# Fill with MEAN (for numerical columns, sensitive to outliers).
# mean() skips NaN by default, so the average uses the observed ages only.
age_mean = df_sample['age'].mean()
df_filled['age'] = df_sample['age'].fillna(age_mean)

print("Age column before and after filling with mean:\n")
# Separate the two Series with a blank line. With print's default sep=" "
# the second Series would start on the last line of the first one
# ("Name: age, dtype: float64 0    25.0"), which is hard to read.
print(df_sample['age'], df_filled['age'], sep="\n\n")

Age column before and after filling with mean:

0    25.0
1    30.0
2     NaN
3    45.0
4    50.0
5     NaN
6    60.0
7    65.0
8    70.0
9    75.0
Name: age, dtype: float64 0    25.0
1    30.0
2    52.5
3    45.0
4    50.0
5    52.5
6    60.0
7    65.0
8    70.0
9    75.0
Name: age, dtype: float64


In [14]:
# Impute cholesterol with the MEDIAN of the observed values
# (a numerical measure that is robust to outliers)
cholesterol_raw = df_sample['cholesterol']
chol_median = cholesterol_raw.median()
df_filled['cholesterol'] = cholesterol_raw.fillna(chol_median)

print("\nCholesterol after filling with median:\n")
print(df_filled['cholesterol'])


Cholesterol after filling with median:

0    180.0
1    250.0
2    220.0
3    230.0
4    250.0
5    250.0
6    260.0
7    270.0
8    250.0
9    290.0
Name: cholesterol, dtype: float64


In [15]:
# Impute the categorical column with its MODE (most frequent value).
# mode() returns a Series holding every most-frequent value in sorted order;
# the first entry is taken. NOTE: in this sample 'A' and 'B' both occur four
# times, so 'A' is chosen only because it sorts first.
treatment_raw = df_sample['treatment_group']
mode_value = treatment_raw.mode().iloc[0]

df_filled['treatment_group'] = treatment_raw.fillna(mode_value)
print(f"\nTreatment group after filling with mode ('{mode_value}'):")
print(df_filled['treatment_group'])

# Blank line plus a 65-character separator before the full table
print(f"\n{'=' * 65}")
print("Complete DataFrame after imputation:")
print(df_filled)


Treatment group after filling with mode ('A'):
0    A
1    B
2    A
3    A
4    B
5    A
6    B
7    A
8    A
9    B
Name: treatment_group, dtype: object

Complete DataFrame after imputation:
   patient_id   age  cholesterol  blood_pressure treatment_group
0           1  25.0        180.0           120.0               A
1           2  30.0        250.0           125.0               B
2           3  52.5        220.0           130.0               A
3           4  45.0        230.0             NaN               A
4           5  50.0        250.0           140.0               B
5           6  52.5        250.0           145.0               A
6           7  60.0        260.0             NaN               B
7           8  65.0        270.0           155.0               A
8           9  70.0        250.0           160.0               A
9          10  75.0        290.0           165.0               B
