In [1]:
# default_exp pandas

# Pandas related functions
> Some useful functions to effectively work with pandas

> Inspired by: https://www.dataquest.io/blog/pandas-big-data/

In [2]:
import pandas as pd
import numpy as  np
from nbdev.showdoc import *

Create a DataFrame from any CSV for demo.

In [3]:
df = pd.read_csv('/home/condor/datasets/mlb-games/dataquest-mlb-game-logs/game_logs.csv',low_memory=False)

In [4]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 861.6 MB


In [5]:
#export
def df_meminfo(df):
    """Prints an Average and a Total memory usage for each dtype in `df`."""
    dtypes = df.dtypes.unique().tolist()
    ttl = 0
    print("Memory usage for DataFrames types of columns:\n\t\t\t  Average\t    Total")
    for dtype in dtypes:
        selected_dtype = df.select_dtypes(include=[dtype])
        mean_usage_mb = selected_dtype.memory_usage(deep=True).mean()/ 1024 ** 2
        sum_usage_mb  = selected_dtype.memory_usage(deep=True).sum() / 1024 ** 2
        ttl += sum_usage_mb
        print("{}:  \t\t{:6.2f} MB\t{:6.2f} MB".format(dtype,mean_usage_mb,sum_usage_mb))
    print("Total for DataFrame:\t\t\t{:6.2f} MB".format(ttl))

In [6]:
df_meminfo(df)

Memory usage for DataFrames types of columns:
			  Average	    Total
int64:  		  1.12 MB	  7.87 MB
object:  		  9.53 MB	752.74 MB
float64:  		  1.29 MB	100.99 MB
Total for DataFrame:			861.60 MB


In [7]:
show_doc(df_meminfo)

<h4 id="df_meminfo" class="doc_header"><code>df_meminfo</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>df_meminfo</code>(**`df`**)

Prints an Average and a Total memory usage for each dtype in `df`.

In [8]:
#export
def mem_usage(pandas_obj):
    """Returns memory in MB, used by `pandas_obj` (DataFrame or Series)."""
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [9]:
show_doc(mem_usage)

<h4 id="mem_usage" class="doc_header"><code>mem_usage</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>mem_usage</code>(**`pandas_obj`**)

Returns memory in MB, used by `pandas_obj` (DataFrame or Series).

In [10]:
print(mem_usage(df['day_of_week']))

9.84 MB


In [11]:
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')
print(mem_usage(df_int))
print(mem_usage(converted_int))
compare_ints = pd.concat([df_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

7.87 MB
1.48 MB


Unnamed: 0,before,after
uint8,,5.0
uint32,,1.0
int64,6.0,


In [12]:
df_fl = df.select_dtypes(include=['float'])
converted_fl = df_fl.apply(pd.to_numeric,downcast='float')
print(mem_usage(df_fl))
print(mem_usage(converted_fl))
compare_fls = pd.concat([df_fl.dtypes,converted_fl.dtypes],axis=1)
compare_fls.columns = ['before','after']
compare_fls.apply(pd.Series.value_counts)

100.99 MB
50.49 MB


Unnamed: 0,before,after
float32,,77.0
float64,77.0,


In [13]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_mnist.ipynb.
Converted 02_pandas.ipynb.
Converted 99_index.ipynb.
Converted Image_Classification_CIFAR_10.ipynb.
Converted Image_Classification_CIFAR_10_Resnet101.ipynb.
Converted Image_Classification_CIFAR_10_Resnet34.ipynb.
Converted Image_Classification_Imagenette.ipynb.
Converted Image_Classification_Imagewoof.ipynb.
Converted MyMNIST_Pytorch.ipynb.
Converted Transforms.ipynb.
Converted Untitled.ipynb.
Converted Untitled1.ipynb.
Converted Untitled2.ipynb.
Converted Untitled3.ipynb.
