This notebook shows how to use the function `make_variable_report` to build a readable, per-column summary report of a pandas DataFrame.

In [1]:
# ================================
# functions for variable report
# ================================

import pandas as pd
import numpy as np


def get_sample_data(acol, num_example=15, cardinality_threshold=20, rndnum=13):
    """
    Summarise one dataframe column: its dtype, NULL / not-NULL counts, a random
    sample of its not-NULL values and (a capped list of) its distinct values.

    acol: a column of pandas dataframe
    num_example: number of examples to display. If the column holds fewer not-NULL
                 values than this, all of them are returned in their original order.
    cardinality_threshold: The maximum number of unique values to display.
                           In some situations there are too many unique values, for instance, id, address, username, etc.,
                           and we don't want to display all of them.
    rndnum: The random number seed used when shuffling the index of the sample to show.
            A private random generator is seeded with it, so numpy's global random
            state is left untouched by this call.

    Returns a dict with the keys:
        'var_type'       -- str(acol.dtypes)
        'total_length'   -- len(acol)
        'cnt_not_NULL'   -- number of not-NULL entries
        'cnt_NULL'       -- number of NULL entries
        'notnull_sample' -- the sampled not-NULL values (a slice of acol)
        'cardinality'    -- number of distinct not-NULL values
        'unique_sample'  -- distinct not-NULL values, at most cardinality_threshold of them

    kittipat@gmail.com
    Dec 30, 2014
    """

    null_mask = pd.isnull(acol)
    cnt_not_NULL = acol.count()
    # Vectorised count of the NULL flags; the builtin sum() would walk the
    # column element by element in Python.
    cnt_NULL = int(null_mask.sum())
    notnull_sample = acol[~null_mask]

    # check cardinality of variables
    unique_sample = notnull_sample.unique()
    cardinality = len(unique_sample)

    # Shuffle the positions and keep the first num_example of them. Only needed
    # when the column has at least num_example not-NULL values.
    if num_example <= len(notnull_sample):
        idx = np.arange(len(notnull_sample))
        # A local RandomState yields the same permutation as seeding the global
        # generator with rndnum, without clobbering the caller's random state.
        np.random.RandomState(rndnum).shuffle(idx)
        notnull_sample = notnull_sample.iloc[idx[:num_example]]

    # Cap the list of distinct values for high-cardinality columns.
    if cardinality <= cardinality_threshold:
        shown_unique = unique_sample
    else:
        shown_unique = unique_sample[:cardinality_threshold]

    # prepare output
    var_report = {
        'var_type': str(acol.dtypes),
        'total_length': len(acol),
        'cnt_not_NULL': cnt_not_NULL,
        'cnt_NULL': cnt_NULL,
        'notnull_sample': notnull_sample,
        'cardinality': cardinality,
        'unique_sample': shown_unique,
    }

    return var_report


def make_variable_report(df, col_name_list, num_example=15, cardinality_threshold=20, rndnum=23):
    """
    Build a one-row-per-column summary table for the selected columns of a dataframe.

    df: pandas dataframe
    col_name_list: list of column names to apply this function

    The remaining arguments are forwarded unchanged to get_sample_data; see that
    function for their meaning:
    num_example
    cardinality_threshold
    rndnum

    Returns a pandas dataframe with the columns
    var_name, var_type, length, not_missing, missing, cardinality, unique_values, sample.
    The 'sample' entry is the sampled values rendered as a single string.

    Original author's note: ran on a dataset of 1.09M x 98 in about 30 seconds.

    Example:

    import df_report as dr
    my_df_report = dr.make_variable_report(df, df.columns.values)
    my_df_report

    kittipat@gmail.com
    Dec 30, 2014
    """
    # Left-to-right layout of the finished report.
    report_columns = ['var_name', 'var_type', 'length',
                      'not_missing', 'missing', 'cardinality',
                      'unique_values', 'sample']

    # Report column -> key of the per-column summary dict it is copied from.
    copied_fields = (('var_type', 'var_type'),
                     ('length', 'total_length'),
                     ('not_missing', 'cnt_not_NULL'),
                     ('missing', 'cnt_NULL'),
                     ('cardinality', 'cardinality'),
                     ('unique_values', 'unique_sample'))

    collected = {key: [] for key in report_columns}

    for col_name in col_name_list:
        summary = get_sample_data(acol=df[col_name],
                                  num_example=num_example,
                                  cardinality_threshold=cardinality_threshold,
                                  rndnum=rndnum)

        collected['var_name'].append(col_name)
        for report_key, summary_key in copied_fields:
            collected[report_key].append(summary[summary_key])

        # Show the sampled values as plain text, without the list's outer brackets.
        sample_text = str(summary['notnull_sample'].tolist()).strip('[]')
        collected['sample'].append(sample_text)

    return pd.DataFrame(collected, columns=report_columns)


In [2]:
# Load the Titanic training set into a single dataframe.
# The CSV can be downloaded from https://www.kaggle.com/c/titanic/data
# NOTE: datadir is a machine-specific absolute path; point it at your own copy of the data.
datadir = '/Users/kittipat/Dropbox/research/python_dev/datasets/'
infilename = 'titanic_train'

# Read the file in chunks of 1000 rows ('.' and '' are treated as missing values),
# then stitch the chunks back together under a fresh 0..n-1 index.
tp = pd.read_csv(datadir + infilename + '.csv',
                 sep=',',
                 na_values=['.', ''],
                 header=0,
                 iterator=True,
                 chunksize=1000)
df = pd.concat([chunk for chunk in tp], ignore_index=True)

print("data shape:", df.shape)

data shape: (891, 12)


In [4]:
from IPython.display import HTML

# Summarise every column of df with the default sampling settings.
df_report = make_variable_report(df, col_name_list=df.columns)

# Render the report as an HTML table in the cell output.
HTML(df_report.to_html())

Unnamed: 0,var_name,var_type,length,not_missing,missing,cardinality,unique_values,sample
0,PassengerId,int64,891,891,0,891,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","728, 669, 211, 655, 229, 365, 64, 653, 230, 66..."
1,Survived,int64,891,891,0,2,"[0, 1]","1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0"
2,Pclass,int64,891,891,0,3,"[3, 1, 2]","3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2"
3,Name,object,891,891,0,891,"[Braund, Mr. Owen Harris, Cumings, Mrs. John B...","'Mannion, Miss. Margareth', 'Cook, Mr. Jacob',..."
4,Sex,object,891,891,0,2,"[male, female]","'female', 'male', 'male', 'female', 'male', 'm..."
5,Age,float64,891,714,177,88,"[22.0, 38.0, 26.0, 35.0, 54.0, 2.0, 27.0, 14.0...","42.0, 52.0, 28.0, 66.0, 19.0, 18.0, 21.0, 40.0..."
6,SibSp,int64,891,891,0,7,"[1, 0, 3, 4, 2, 5, 8]","0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 0, 0, 0"
7,Parch,int64,891,891,0,7,"[0, 1, 2, 5, 3, 4, 6]","0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0"
8,Ticket,object,891,891,0,681,"[A/5 21171, PC 17599, STON/O2. 3101282, 113803...","'36866', 'A/5 3536', 'SOTON/O.Q. 3101311', '36..."
9,Fare,float64,891,891,0,248,"[7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51....","7.7374999999999998, 8.0500000000000007, 7.0499..."
