# Try CoW in Pandas 2.1

* https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html
* https://towardsdatascience.com/deep-dive-into-pandas-copy-on-write-mode-part-i-26982e7408c6
* https://towardsdatascience.com/deep-dive-into-pandas-copy-on-write-mode-part-ii-b023432a5334


Making `.to_numpy()` return a read-only array seems like a sensible default.

Operations to try: rename columns, assign a new column, drop a column, reset the index, set the index.

In [1]:
import pandas as pd
#%load_ext autoreload
#%autoreload 2

import datetime, os
# Record the pandas version, the run timestamp and the conda environment name
# next to the results, so the memory figures below can be tied to one setup.
display(f"Pandas {pd.__version__}")
display(f'Running: {datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")}, env {os.getenv("CONDA_DEFAULT_ENV")}')


# Load and start ipython_memory_usage; once started it prints a RAM/time
# summary line after every executed cell (see the "In [n] used ..." outputs).
%load_ext ipython_memory_usage
%imu_start

'Pandas 2.1.3'

'Running: 20231203 10:54:34, env pydataglobal2023'

Enabling IPython Memory Usage, use %imu_start to begin, %imu_stop to end


'IPython Memory Usage started'

In [1] used 0.2 MiB RAM in 0.11s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 134.8 MiB


In [2]:
pd.options.mode.copy_on_write = True
print(f"Copy on Write enabled? {pd.options.mode.copy_on_write}")
import pickle

# Single definition of the source and cache locations used by this cell.
PARQUET_PATH = "../test_result_2021on.parquet"
PICKLE_PATH = '/tmp/test_result.pickle'

# first we need to write the pickled file, before we try to use it
# else we get odd memory cleanups!
if not os.path.exists(PICKLE_PATH):
    print("MAKING TMP FILE")
    dfpdn = pd.read_parquet(PARQUET_PATH, dtype_backend="numpy_nullable")
    # cow nocow 23s +19.8GB
    with open(PICKLE_PATH, 'wb') as f:
        pickle.dump(dfpdn, f)
    print("Wrote to /tmp")
    # Deliberately stop execution here: the measurements below are only
    # meaningful when the frame was loaded from the pickle in a fresh kernel.
    raise RuntimeError(
        f"Pickle cache written to {PICKLE_PATH}; "
        "restart the kernel and re-run the notebook to take measurements"
    )
else:
    print("Reading from /tmp")
    # if file missing, create it with the branch above
    # and then restart the kernel
    # NOTE: pickle.load can execute arbitrary code; this is only acceptable
    # because the file is a cache written by this same notebook.
    with open(PICKLE_PATH, 'rb') as f:
        dfpdn = pickle.load(f)


Copy on Write enabled? True
Reading from /tmp
In [2] used 9156.6 MiB RAM in 11.82s (system mean cpu 13%, single max cpu 100%), peaked 4254.2 MiB above final usage, current RAM usage now 9291.5 MiB


In [None]:
#import gc
#gc.collect() # important when using parquet file, not if using pickled

In [3]:
# Build df2 in a single chain: rename 'make', derive is_petrol from fuel_type,
# then drop fuel_type.
df2 = (
    dfpdn
    .rename(columns={'make': 'car_make'})
    .assign(is_petrol=dfpdn['fuel_type'] == 'PE')
    .drop(columns=['fuel_type'])
)
# Recorded measurements for this cell:
# (nocow -120MB, peaked 9.7GB, 20s if using parquet)
# nocow 19s, +8.6GB, peak +9.6GB
# cow +70MB 4s no peak

In [3] used 79.6 MiB RAM in 3.36s (system mean cpu 11%, single max cpu 67%), peaked 0.0 MiB above final usage, current RAM usage now 9371.1 MiB


In [4]:
# Sanity check on the derived column: row 0 is expected to be False,
# i.e. row 0's fuel_type was not 'PE'.
assert df2['is_petrol'][0] == False

In [4] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 9371.1 MiB


In [None]:
# Chained assignment: df2['is_petrol'] is evaluated first and the write goes
# into that intermediate object, not into df2.
# NOTE(review): in pandas 2.1 ChainedAssignmentError is a Warning subclass, so
# with CoW on this is expected to warn and leave df2 unchanged — confirm.
df2['is_petrol'][0] = False # results in a ChainedAssignmentError

In [5]:
# Write one value through .loc. The cell output reports ~1.3 GiB of extra RAM,
# presumably because the column data was still shared with dfpdn and had to be
# copied before the write.
df2.loc[0, 'test_mileage'] = 99  # this did result in a copy occurring!
assert df2.loc[0, 'test_mileage'] == 99
# The source frame must not see the write made to df2.
assert dfpdn.loc[0, 'test_mileage'] == 227219

In [5] used 1329.6 MiB RAM in 0.40s (system mean cpu 15%, single max cpu 100%), peaked 0.0 MiB above final usage, current RAM usage now 10700.7 MiB


In [None]:
# Show the first three rows of the source frame for visual inspection.
dfpdn.head(3)

In [6]:
# is_petrol was created by .assign() on df2, so it is presumably not shared
# with dfpdn; the cell output reports 0.0 MiB of extra RAM for this write.
df2.loc[0, 'is_petrol'] = True # cow works, doesn't need to copy

In [6] used 0.0 MiB RAM in 0.10s (system mean cpu 0%, single max cpu 0%), peaked 0.0 MiB above final usage, current RAM usage now 10700.7 MiB


In [None]:
# Convert a one-column frame to a NumPy array and report whether the array's
# underlying buffer is read-only, together with its shape.
arr = df2[['is_petrol']].to_numpy()
arr.data.readonly, arr.shape
# cow readonly true
# nocow readonly false

In [None]:
# Attempt to overwrite the first row of the array obtained from to_numpy().
arr[0,:] = 0 # with cow this raises ValueError: assignment destination is read-only


In [None]:
#arr.data.readonly = False # can't change
# Flip NumPy's writeable flag on the array and retry the write.
# NOTE(review): if arr still shares memory with df2, this write would bypass
# Copy-on-Write and could change df2 as well — confirm before relying on it.
arr.flags.writeable = True
arr[0,:] = 0

In [None]:
# Same chain as before, without the final drop: rename 'make' and derive
# is_petrol from fuel_type.
df2 = (
    dfpdn
    .rename(columns={'make': 'car_make'})
    .assign(is_petrol=dfpdn['fuel_type'] == 'PE')
)
# Recorded measurements for this cell:
# nocow 0.4GB +15s
# cow -8.7GB +4s, i don't know how this can save ram!
# NOTE(review): rebinding df2 releases whatever the previous df2 held, which
# may account for some of the negative figure — unconfirmed.

In [None]:
# Chain three operations: index by test_date, rename 'make', and derive
# is_petrol from the source frame's fuel_type.
df2 = (
    dfpdn
    .set_index('test_date')
    .rename(columns={'make': 'car_make'})
    .assign(is_petrol=dfpdn['fuel_type'] == 'PE')
)
# Recorded measurements for this cell:
# nocow 31s +1GB
# cow 5s -9GB

# Try each item broken out

In [None]:
#df2 = dfpdn.set_index('test_date')
# nocow 5s -1.2GB
# cow 0s -0.7GB

In [None]:
# Measure the rename step on its own.
df2 = dfpdn.rename(columns={'make': 'car_make'})
# nocow 5s +9GB
# cow 0s -0.8GB

In [None]:
#df4 = df3.assign(is_petrol = df3['fuel_type'] == 'PE')
# 10s +8.6GB
# 4s -8GB

In [None]:
#df2 = dfpda.reset_index(drop=True)
# nocow drop=False 5s +172MB to 700MB
# nocow drop=True 5s -1.8GB
# cow drop False 0s -900MB

In [None]:
# Second experiment (synthetic data). This repeats the setup from the first
# cell, presumably so this half can be run on its own after a kernel restart.
import pandas as pd
#%load_ext autoreload
#%autoreload 2
display(f"Pandas {pd.__version__}")
# Start per-cell RAM/time reporting.
%load_ext ipython_memory_usage
%imu_start

In [None]:
# Toggle Copy-on-Write for the synthetic experiment; set to False to reproduce
# the "nocow" figures quoted in the comments below.
pd.options.mode.copy_on_write = True
print(pd.options.mode.copy_on_write)

In [None]:
import numpy as np
NBR_ONES = 100_000_000
# c1 c2 8bytes so 1.6GB overall
# The commented-out dict form is the alternative construction that was tried;
# the active line builds the frame from a single 2D array of ones instead.
#df = pd.DataFrame({'c1': np.ones(NBR_ONES), 'c2': np.ones(NBR_ONES)})
df = pd.DataFrame(np.ones((NBR_ONES, 2)))
df.columns = ['c1', 'c2']
# Display the frame's internal block manager to see how the data is stored.
# NOTE(review): DataFrame._data is a private attribute and is reported as
# deprecated in pandas 2.1 — confirm and consider df._mgr for inspection.
df._data
# +1.5GB

In [None]:
# Measure reset_index(drop=True) on the synthetic frame.
df2 = df.reset_index(drop=True)
# nocow cow uses 1.5GB
# sometimes nocow uses 3GB
df2.head(2)

In [None]:
# Write into one cell of each column. The frame was built from np.ones, so the
# stored values do not change; the point is the memory cost of the write.
df.iloc[0, 0] = 1
df.iloc[0, 1] = 1
# cow seems to add +760MB per column modified, which feels weird, regardless of 2d block or 2*1d arrays

In [None]:
# Measure dropping one column; df3 keeps only 'c1'.
df3 = df2.drop(columns=['c2'])
# cow uses 0.6mb
# nocow uses 760MB (1.5GB if index wasn't dropped)
df3.head(2)

In [None]:
# Convert the single-column frame to NumPy and report both views of
# writability: the buffer's readonly attribute and the array's writeable flag.
arr = df3.to_numpy()
# cow nocow False readonly, True writeable (varies by 1d or 2d)
# 1d cow True ro, nocow False ro
print(arr.data.readonly, arr.flags.writeable)
arr

In [None]:
#
#df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
#display(df._data) # generates 2 blocks
#arr = df.to_numpy()
#print(arr.data.readonly, arr.flags.writeable)
#arr[0, 0] = 100
# nocow cow succeeds, df not modified, arr modified
#df, arr

In [None]:

#df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
#display(df._data) # generates 1 block
#arr = df.to_numpy()
#print(arr.data.readonly, arr.flags.writeable)
#arr[0, 0] = 100
## cow fails to write (True ro)
## nocow succeeds (False ro)
#df, arr

In [None]:
# Deliberate ZeroDivisionError, apparently used as a barrier so that "Run All"
# stops here and the cells below only run when executed by hand — TODO confirm.
1/0

In [None]:
# Show which columns remain in df3.
df3.columns

In [None]:
# Load the same parquet file again, this time with pyarrow-backed dtypes.
dfpda = pd.read_parquet("../test_result_2021on.parquet", 
                        dtype_backend="pyarrow")
# Copy-on-Write is switched OFF from here on; the print confirms the setting.
pd.options.mode.copy_on_write = False
print(pd.options.mode.copy_on_write)

In [None]:
# Derive dfx2 from the pyarrow-backed frame, then assign the constant 1 to
# every row of test_class_id on the derived frame.
dfx2 = dfpda.reset_index(drop=True)
dfx2['test_class_id'] = 1

In [None]:
# Add a boolean column flagging rows whose test_class_id equals 4.
# If the previous cell ran, every test_class_id is 1, so this column is
# expected to be all False.
dfx2 = dfx2.assign(test_type_4 = dfx2['test_class_id'] == 4)
dfx2.head(2)