# Downcasting Seaborn Test Datasets

In [6]:
# Import Libraries
import pandas as pd
import seaborn as sns

In [7]:
# define the function (you can copy and paste it for your DataFrame!)

def downcast(df: pd.DataFrame) -> pd.DataFrame:
    
    ''' Compression of the common dtypes "float64", "int64", "object" or "string" '''

    # memory before downcasting
    mem_before = df.memory_usage(deep=True).sum()
    mem_before_mb = round(mem_before / (1024**2), 2)

    # convert the dataframe columns to appropriate dtypes (e.g. object to string, or 1.0 float to 1 integer, etc.)
    df = df.convert_dtypes()

    # string categorization (only the ones with low cardinality)
    for column in df.select_dtypes(['string', 'object']):
        if (len(df[column].unique()) / len(df[column])) < 0.5:
            df[column] = df[column].astype('category')

    # float64 downcasting
    for column in df.select_dtypes(['float']):
        df[column] = pd.to_numeric(df[column], downcast='float')

    # int64 downcasting (depending if negative values are apparent (='signed') or only >=0 (='unsigned'))
    for column in df.select_dtypes(['integer']):
        if df[column].min() >= 0:
            df[column] = pd.to_numeric(df[column], downcast='unsigned')
        else:
            df[column] = pd.to_numeric(df[column], downcast='signed')

    # memory after downcasting & compression
    mem_after = df.memory_usage(deep=True).sum()
    mem_after_mb = round(mem_after / (1024**2), 2)
    compression = round(((mem_before - mem_after) / mem_before) * 100)

    # downcasting summary
    print(f'DataFrame compressed by {compression}% from {mem_before_mb} MB down to {mem_after_mb} MB.')

    return df

In [8]:
# test the seaborn datasets

df_dict = {df: sns.load_dataset(df) for df in sns.get_dataset_names()}

results = []

for name, df_before in df_dict.items():
    size_before = round(df_before.memory_usage(deep=True).sum() / (1024**2), 3)
    df_after = downcast(df_before)
    size_after = round(df_after.memory_usage(deep=True).sum() / (1024**2), 3)
    downsized = round(((size_before - size_after) / size_before) * 100)
    results.append(
        {'dataset': name, 'size_before': size_before, 'size_after': size_after, 'downsized_by': downsized}
    )

results_df = pd.DataFrame(results).sort_values('downsized_by', ascending=False) \
                                  .reset_index(drop=True) \
                                  .style.format({'size_before': '{:,.2f} MB',
                                                 'size_after': '{:,.2f} MB',
                                                 'downsized_by': '{:,.0f} %'}) \
                                        .bar(subset='downsized_by', color='darkgreen')
                                        
results_df

DataFrame compressed by 71% from 0.0 MB down to 0.0 MB.
DataFrame compressed by 74% from 0.0 MB down to 0.0 MB.
DataFrame compressed by 82% from 0.01 MB down to 0.0 MB.
DataFrame compressed by 0% from 4.13 MB down to 4.13 MB.
DataFrame compressed by 18% from 0.01 MB down to 0.0 MB.
DataFrame compressed by 39% from 3.04 MB down to 1.85 MB.
DataFrame compressed by 89% from 0.12 MB down to 0.01 MB.
DataFrame compressed by 48% from 0.0 MB down to 0.0 MB.
DataFrame compressed by 40% from 0.0 MB down to 0.0 MB.
DataFrame compressed by 94% from 0.2 MB down to 0.01 MB.
DataFrame compressed by 84% from 0.48 MB down to 0.07 MB.
DataFrame compressed by 88% from 0.02 MB down to 0.0 MB.
DataFrame compressed by 76% from 0.01 MB down to 0.0 MB.
DataFrame compressed by 48% from 0.07 MB down to 0.04 MB.
DataFrame compressed by 91% from 0.07 MB down to 0.01 MB.
DataFrame compressed by 80% from 0.11 MB down to 0.02 MB.
DataFrame compressed by 36% from 0.01 MB down to 0.0 MB.
DataFrame compressed by 91% f

Unnamed: 0,dataset,size_before,size_after,downsized_by
0,fmri,0.20 MB,0.01 MB,94 %
1,titanic,0.31 MB,0.03 MB,91 %
2,geyser,0.02 MB,0.00 MB,90 %
3,penguins,0.07 MB,0.01 MB,90 %
4,dots,0.12 MB,0.01 MB,89 %
5,gammas,0.48 MB,0.07 MB,84 %
6,attention,0.01 MB,0.00 MB,83 %
7,iris,0.01 MB,0.00 MB,79 %
8,planets,0.11 MB,0.02 MB,79 %
9,anscombe,0.00 MB,0.00 MB,67 %
