# Handling missing values
## Drop missing data


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the COVID totals table; the path is relative to the working directory
covid_csv_path = "data/covidtotals.csv"
covid_totals = pd.read_csv(covid_csv_path)


In [2]:
# Count the NaN entries in every column, then list the most incomplete columns first
missing_per_column = covid_totals.isna().sum()
missing_per_column.sort_values(ascending=False)

vac_per_hund       218
hosp_beds           61
hum_dev_ind         44
aged_65_older       43
gdp_per_capita      40
median_age          37
pop_density         22
life_expectancy      4
iso_code             0
lastdate             0
population           0
total_deaths_pm      0
total_cases_pm       0
total_deaths         0
total_cases          0
location             0
region               0
dtype: int64

In [3]:
# Drop every row that is missing life_expectancy or pop_density;
# the result is a new frame, covid_totals itself is left unchanged
required_columns = ['life_expectancy', 'pop_density']
drop_na_df = covid_totals.dropna(subset=required_columns)

In [4]:
# Verify the drop: both columns should now report 0 missing values
checked_columns = ['life_expectancy', 'pop_density']
drop_na_df.loc[:, checked_columns].isna().sum()

life_expectancy    0
pop_density        0
dtype: int64

## Fill missing numerical data


In [5]:
import pandas as pd
import numpy as np

# Five people: person C has no recorded income, person E is an extreme outlier
person_labels = ["A", "B", "C", "D", "E"]
incomes = [50000, 52000, np.nan, 51000, 50000000]
df = pd.DataFrame({"person": person_labels, "income": incomes})

print(df)


  person      income
0      A     50000.0
1      B     52000.0
2      C         NaN
3      D     51000.0
4      E  50000000.0


### Fill with Mean (Sensitive to Outliers):
Because of the $50 million outlier:
- Mean becomes very large
- The missing value becomes unrealistically high

In [6]:

# mean() skips the NaN entry but is pulled far upward by the single outlier
mean_income = df["income"].mean()
print("Mean income:", mean_income)

# assign() works on a copy, so the original df keeps its missing value
df_mean_filled = df.assign(income=df["income"].fillna(mean_income))

print("\nAfter mean filling:", df_mean_filled, sep="\n")


Mean income: 12538250.0

After mean filling:
  person      income
0      A     50000.0
1      B     52000.0
2      C  12538250.0
3      D     51000.0
4      E  50000000.0


### Fill with Median (Safer for Skewed Data)
- The median is not affected by the extreme outlier.
- More appropriate for: Income, House prices, Salary, Highly skewed distributions

In [7]:

# The median depends on the middle of the sorted values, not on the outlier's size
median_income = df["income"].median()
print("Median income:", median_income)

# Build the filled column first, then attach it to a copy of df
income_median_filled = df["income"].fillna(median_income)
df_median_filled = df.assign(income=income_median_filled)

print("\nAfter median filling:", df_median_filled, sep="\n")


Median income: 51500.0

After median filling:
  person      income
0      A     50000.0
1      B     52000.0
2      C     51500.0
3      D     51000.0
4      E  50000000.0


### Forward and backward Fill (Time-Series Example)
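Temperature example: imagine a dataframe that holds a whole year of daily temperatures. Filling a missing temperature with the overall mean or median would be less realistic than forward fill or backward fill, because the temperature on a given day is usually close to that of the neighboring days.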

In [18]:
# Five consecutive daily readings with one gap in the middle.
# Forward/backward fill depend on row order, so sort by date explicitly.
daily_dates = pd.date_range("2024-02-20", periods=5)
daily_temperatures = [30, 32, np.nan, 31, 33]
df_time = pd.DataFrame(
    {"date": daily_dates, "temperature": daily_temperatures}
).sort_values("date")

print(df_time)


        date  temperature
0 2024-02-20         30.0
1 2024-02-21         32.0
2 2024-02-22          NaN
3 2024-02-23         31.0
4 2024-02-24         33.0


In [19]:
# Forward fill: carry the last observed value forward into the gap,
# so the missing Feb 22 reading takes the Feb 21 temperature.
# Suitable when values change slowly, rows are ordered, and the gap is small.
forward_filled = df_time["temperature"].ffill()
df_time["temp_ffill"] = forward_filled

print(df_time)


        date  temperature  temp_ffill
0 2024-02-20         30.0        30.0
1 2024-02-21         32.0        32.0
2 2024-02-22          NaN        32.0
3 2024-02-23         31.0        31.0
4 2024-02-24         33.0        33.0


In [21]:
# Backward fill: pull the next observed value back into the gap,
# so the missing Feb 22 reading takes the Feb 23 temperature.
backward_filled = df_time["temperature"].bfill()
df_time["temp_bfill"] = backward_filled

print(df_time)


        date  temperature  temp_ffill  temp_bfill
0 2024-02-20         30.0        30.0        30.0
1 2024-02-21         32.0        32.0        32.0
2 2024-02-22          NaN        32.0        31.0
3 2024-02-23         31.0        31.0        31.0
4 2024-02-24         33.0        33.0        33.0


## Filling categorical data
### Fill with "Unknown" or "Undefined"
(When Missingness Is Informative)

In [22]:
import pandas as pd
import numpy as np

# Five people; the gender of two of them was never recorded
person_names = ["Alice", "Bob", "Charlie", "David", "Eva"]
gender_labels = ["Female", "Male", np.nan, "Male", np.nan]
df = pd.DataFrame({"name": person_names, "gender": gender_labels})

print("Original Data:", df, sep="\n")


Original Data:
      name  gender
0    Alice  Female
1      Bob    Male
2  Charlie     NaN
3    David    Male
4      Eva     NaN


In [23]:
# Label missing genders as "Unknown" on a copy, so df keeps its NaN entries
df_unknown = df.assign(gender=df["gender"].fillna("Unknown"))

print("After filling with 'Unknown':", df_unknown, sep="\n")


After filling with 'Unknown':
      name   gender
0    Alice   Female
1      Bob     Male
2  Charlie  Unknown
3    David     Male
4      Eva  Unknown


In [24]:
# "Unknown" is now a regular category, so it shows up in the distribution
gender_counts = df_unknown["gender"].value_counts()
print(gender_counts)


gender
Male       2
Unknown    2
Female     1
Name: count, dtype: int64


### Fill with Most Frequent Category (Mode)

In [25]:
# Six people; "Red" is the most common color and two entries are missing
color_owners = list("ABCDEF")
colors = ["Red", "Blue", "Red", np.nan, "Red", np.nan]
df2 = pd.DataFrame({"person": color_owners, "color": colors})

print("Original Data:", df2, sep="\n")


Original Data:
  person color
0      A   Red
1      B  Blue
2      C   Red
3      D   NaN
4      E   Red
5      F   NaN


In [26]:
# Fill with mode.
# mode() returns a Series (there can be ties), so take its first entry.
color_modes = df2["color"].mode()
mode_color = color_modes[0]
print("Most frequent color:", mode_color)

# Replace the missing colors on a copy; df2 keeps its NaN entries
df_mode = df2.assign(color=df2["color"].fillna(mode_color))

print("\nAfter mode filling:", df_mode, sep="\n")


Most frequent color: Red

After mode filling:
  person color
0      A   Red
1      B  Blue
2      C   Red
3      D   Red
4      E   Red
5      F   Red


In [27]:
# Mode filling assigns every missing value to the most common category,
# so "Red" becomes even more dominant. This may exaggerate the majority group.
filled_color_counts = df_mode["color"].value_counts()
print("Distribution after filling:", filled_color_counts, sep="\n")


Distribution after filling:
color
Red     5
Blue    1
Name: count, dtype: int64


In [28]:
# Put the counts before and after imputation next to each other;
# the jump in "Red" makes the distortion visible.
counts_before_fill = df2["color"].value_counts()
counts_after_fill = df_mode["color"].value_counts()

print("Original distribution (excluding NaN):", counts_before_fill, sep="\n")
print("\nAfter mode filling:", counts_after_fill, sep="\n")


Original distribution (excluding NaN):
color
Red     3
Blue    1
Name: count, dtype: int64

After mode filling:
color
Red     5
Blue    1
Name: count, dtype: int64


## Group fill
Example: Delivery Time by Shipping Method

Scenario: Different shipping methods naturally have different delivery times.


In [29]:
import pandas as pd
import numpy as np

# Six orders split across two shipping methods, each method missing one delivery time
order_ids = [1, 2, 3, 4, 5, 6]
methods = ["Standard", "Standard", "Express", "Express", "Standard", "Express"]
days_to_deliver = [5, np.nan, 2, np.nan, 6, 1]
df_delivery = pd.DataFrame(
    {
        "order_id": order_ids,
        "shipping_method": methods,
        "delivery_days": days_to_deliver,
    }
)

print("Original Data:", df_delivery, sep="\n")


Original Data:
   order_id shipping_method  delivery_days
0         1        Standard            5.0
1         2        Standard            NaN
2         3         Express            2.0
3         4         Express            NaN
4         5        Standard            6.0
5         6         Express            1.0


In [30]:
# Fill with the global median. The problem with one value for every order:
# - Express shipments get filled with a slower value
# - Standard shipments get filled with a faster value
global_median = df_delivery["delivery_days"].median()

# Fill on a copy so df_delivery stays available for the group-wise comparison
df_global = df_delivery.assign(
    delivery_days=df_delivery["delivery_days"].fillna(global_median)
)

print("After Global Median Filling:", df_global, sep="\n")


After Global Median Filling:
   order_id shipping_method  delivery_days
0         1        Standard            5.0
1         2        Standard            3.5
2         3         Express            2.0
3         4         Express            3.5
4         5        Standard            6.0
5         6         Express            1.0


In [31]:
# Fill within shipping method: each missing value gets the median of its own group
df_group = df_delivery.copy()

# transform() returns a result aligned to the original rows, so it can be
# assigned straight back to the column
days_by_method = df_group.groupby("shipping_method")["delivery_days"]
df_group["delivery_days"] = days_by_method.transform(
    lambda days: days.fillna(days.median())
)

print("After Shipping-Method Filling:", df_group, sep="\n")


After Shipping-Method Filling:
   order_id shipping_method  delivery_days
0         1        Standard            5.0
1         2        Standard            5.5
2         3         Express            2.0
3         4         Express            1.5
4         5        Standard            6.0
5         6         Express            1.0


## Inferring missing data

This is a simplified version of a real research problem I encountered.

We will use a rule-based approach to infer the missing organization types. You will see that:
- We didn’t randomly fill.
- We used external information (name patterns).
- This is inference-based imputation.
- It’s better than filling with mode.

In reality, I had thousands of missing values, and a rule-based approach built on regular expressions cannot systematically capture all the corner cases, so I used an LLM to infer the organization types.

If you are interested, check my paper *Gender disparity in U.S. patenting*:
- [paper link](https://www.nature.com/articles/s41599-025-06038-6)
- [Appendices](https://static-content.springer.com/esm/art%3A10.1057%2Fs41599-025-06038-6/MediaObjects/41599_2025_6038_MOESM1_ESM.pdf), specifically, check Appendix C

In [34]:
import pandas as pd
import numpy as np

# Eight patent assignees; four of them have no recorded organization type
assignee_names = [
    "International Business Machines Corp",
    "Stony Brook University",
    "Jaff Bazos",
    "U.S. Department of Energy",
    "Tesla Inc",
    "Elun Mosk",
    "MIT",
    "Google LLC",
]
org_types = [
    "Corporation",
    "University",
    np.nan,
    "Government",
    np.nan,
    np.nan,
    np.nan,
    "Corporation",
]
df = pd.DataFrame({"assignee_name": assignee_names, "org_type": org_types})

print("Original Data:", df, sep="\n")


Original Data:
                          assignee_name     org_type
0  International Business Machines Corp  Corporation
1                Stony Brook University   University
2                            Jaff Bazos          NaN
3             U.S. Department of Energy   Government
4                             Tesla Inc          NaN
5                             Elun Mosk          NaN
6                                   MIT          NaN
7                            Google LLC  Corporation


In [35]:
# Count how many assignees have no organization type yet
missing_org_type_count = df["org_type"].isna().sum()
print("Missing org_type count:", missing_org_type_count, sep="\n")


Missing org_type count:
4


In [36]:
# This is NOT the best rule-based approach, just an example
import re

# Short corporate suffixes must match as whole words. A plain substring test
# treats "Vincent" or "Lincoln" as corporations because they contain "inc".
_CORPORATE_SUFFIX = re.compile(
    r"\b(?:corp(?:oration|orate)?|inc(?:orporated)?|llc)\b"
)


def infer_org_type(name):
    """Infer an organization type from an assignee name using keyword rules.

    The rules are checked in priority order (University, Government,
    Corporation); a name that matches none of them is treated as an
    individual. Matching is case-insensitive.

    Parameters
    ----------
    name : str
        Assignee name to classify.

    Returns
    -------
    str or float
        "University", "Government", "Corporation" or "Individual".
        NaN is returned when ``name`` is not a string (for example a missing
        name), because nothing can be inferred from it.
    """
    # A missing name (NaN/None) has no .lower(); keep the value missing
    # instead of raising or guessing a category.
    if not isinstance(name, str):
        return float("nan")

    name_lower = name.strip().lower()

    if "university" in name_lower or name_lower in ["mit"]:
        return "University"

    if "department" in name_lower or "u.s." in name_lower:
        return "Government"

    if _CORPORATE_SUFFIX.search(name_lower):
        return "Corporation"

    return "Individual"


In [37]:
# Infer a type only for rows whose org_type is missing; existing labels are kept
df_filled = df.copy()
mask = df_filled["org_type"].isna()

# Classify just the masked names, then write the results back by index
inferred_types = df_filled.loc[mask, "assignee_name"].apply(infer_org_type)
df_filled.loc[mask, "org_type"] = inferred_types

print("After Inference-Based Filling:", df_filled, sep="\n")


After Inference-Based Filling:
                          assignee_name     org_type
0  International Business Machines Corp  Corporation
1                Stony Brook University   University
2                            Jaff Bazos   Individual
3             U.S. Department of Energy   Government
4                             Tesla Inc  Corporation
5                             Elun Mosk   Individual
6                                   MIT   University
7                            Google LLC  Corporation
