# Dealing with missing data


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raw column data for the example frame: four float columns of three
# entries each, with one missing entry (np.nan) in "C" and one in "D".
data = dict(
    A=[1.0, 5.0, 10.0],
    B=[2.0, 6.0, 11.0],
    C=[3.0, np.nan, 12.0],
    D=[4.0, 8.0, np.nan],
)

In [None]:
# Build the example frame from the column dict and render it
df = pd.DataFrame(data=data)
display(df)

In [None]:
# Count the NaN entries in each column
df.isna().sum()

## Eliminating samples or features with missing values


In [None]:
# Keep only the rows that contain no NaN at all (returns a new frame; df is unchanged)
df.dropna(axis="index", how="any")

In [None]:
# Keep only the columns that contain no NaN at all (returns a new frame; df is unchanged)
df.dropna(axis="columns")

In [None]:
# Set every column of the row labelled 3 to NaN. The frame built above only
# has rows 0-2, so this adds a new all-NaN row, which gives the
# dropna(how="all") example something to remove.
df.loc[3] = np.nan
display(df)

In [None]:
# Remove only the rows in which every column is NaN, and rebind df to the result
df = df.dropna(axis="index", how="all")
display(df)

In [None]:
# Keep only the rows that have at least 4 non-NaN values (rows with fewer are dropped)
df.dropna(axis="index", thresh=4)

In [None]:
# Drop a row only when its value in column 'C' is NaN; other columns are not checked
df.dropna(axis="index", subset=["C"])

## Filling in missing data


In [None]:
# Show the DataFrame that the fill examples below start from
display(df)

In [None]:
# Replace every NaN with the constant 0; the result goes in dft and df is left unchanged
dft = df.fillna(value=0)
display(dft)

In [None]:
# Replace every NaN with the constant -1; the result goes in dft and df is left unchanged
dft = df.fillna(value=-1)
display(dft)

In [None]:
# Per-column means; NaN entries are skipped (skipna=True is the pandas default)
df.mean(axis="index", skipna=True)

In [None]:
# Mean imputation: replace each NaN with the mean of the column it sits in
dft = df.fillna(value=df.mean())
display(dft)