In [None]:
#| default_exp tabular_summaries

# Tabular summaries

> This section introduces tabular summaries of missing values, organized by variable (column) and by case.

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from pandas_missing.Missing import PandasMissingDataFrame, PandasMissingSeries
from fastcore.basics import patch

# Series tabular summaries

# DataFrame tabular summaries

## Variables

In [None]:
# Toy frame used by the examples below: five integer columns of ten rows each.
# Columns "a" and "b" are cast to float up front because they will receive
# missing values; writing None/NaN into an integer column relies on silent
# upcasting, which recent pandas versions deprecate.
df = pd.DataFrame.from_dict(
    {
       "a": range(0, 10),
       "b": range(10, 20),
       "c": range(20, 30),
       "d": range(30, 40),
       "e": range(40, 50)
    }
).astype({"a": "float64", "b": "float64"})

# Inject missing values: rows 1-3 and 9 of column "a", rows 5-6 of column "b".
df.iloc[1:4, 0] = None
df.iloc[9, 0] = None
df.iloc[5:7, 1] = None

df

Unnamed: 0,a,b,c,d,e
0,0.0,10.0,20,30,40
1,,11.0,21,31,41
2,,12.0,22,32,42
3,,13.0,23,33,43
4,4.0,14.0,24,34,44
5,5.0,,25,35,45
6,6.0,,26,36,46
7,7.0,17.0,27,37,47
8,8.0,18.0,28,38,48
9,,19.0,29,39,49


In [None]:
#| export
@patch
def summarize_variable_missing(
    self: PandasMissingDataFrame,
    sort: bool = True, # Indicate whether to sort the result by `number_missing`.
    ascending: bool = False, # Sort ascending vs. descending. Only applicable when `sort` is `True`.
    add_cumsum: bool = False # Indicate whether or not to add the cumulative sum of missings. **Note**: the cumsum follows the column order of the input data, not the sorted order.
) -> pd.DataFrame: # A pandas DataFrame containing the following columns: `variable`, `number_missing`, `proportion_missing`, `percentage_missing`, and optionally `number_missing_cumsum`.
    """Summarize the missingness in each variable.
    The summary always includes the number, proportion and percentage of missings,
    where the proportion is the number of missing values in a variable divided by
    the number of rows. For a frame with zero rows the proportion is NaN.
    Besides, it could include the cumulative sum of missings.
    """
    is_missing = self._df.isna()
    # Denominator for the per-variable proportion: every variable has one value per row.
    n_cases = len(is_missing)

    summary = (
        is_missing
        .sum()
        .reset_index(name="number_missing")
        .rename(columns={"index": "variable"})
        .assign(
            proportion_missing=lambda df: df.number_missing / n_cases,
            percentage_missing=lambda df: df.proportion_missing * 100
        )
    )

    if add_cumsum:
        # Computed before sorting, so it accumulates in the input column order.
        summary = summary.assign(
            number_missing_cumsum=summary.number_missing.cumsum()
        )

    if sort:
        summary = summary.sort_values(
            by="number_missing",
            ascending=ascending
        )

    return summary

In [None]:
# Demo: per-variable summary, sorted ascending by `number_missing`, with the cumulative-sum column included.
df.missing.summarize_variable_missing(sort=True, ascending=True, add_cumsum=True)

Unnamed: 0,variable,number_missing,proportion_missing,percentage_missing,number_missing_cumsum
2,c,0,0.0,0.0,6
3,d,0,0.0,0.0,6
4,e,0,0.0,0.0,6
1,b,2,0.333333,33.333333,6
0,a,4,0.666667,66.666667,4


In [None]:
#| export
@patch
def tabulate_variable_missing(
    self: PandasMissingDataFrame,
    **kwargs # Extra arguments forwarded to `DataFrame.value_counts()`, except `subset` and `normalize`, which are set internally.
) -> pd.DataFrame: # A pandas DataFrame with the columns `number_missing`, `number_variables`, `proportion_variables`, and `percentage_variables`.
    """Tabulate how many variables have 0, 1, 2, up to `n` missing values.
    For each distinct number of missing values, report the number of variables
    with that many missings, plus the proportion and percentage of all
    variables that they make up.
    """
    # Per-variable missing counts, one row per variable.
    variable_summary = self._df.missing.summarize_variable_missing()

    # Count the variables sharing each distinct `number_missing` value.
    counts = variable_summary.value_counts(
        subset="number_missing",
        normalize=False,
        **kwargs
    )
    table = counts.reset_index(name="number_variables")

    n_variables = table.number_variables
    proportion = n_variables / n_variables.sum()

    return table.assign(
        proportion_variables=proportion,
        percentage_variables=proportion * 100
    )

In [None]:
# Demo: count how many variables share each number of missing values.
df.missing.tabulate_variable_missing()

Unnamed: 0,number_missing,number_variables,proportion_variables,percentage_variables
0,0,3,0.6,60.0
1,2,1,0.2,20.0
2,4,1,0.2,20.0


## Cases