# Review: Pandas DF vs Numpy Arrays / Structured Arrays

In [1]:
import numpy as np
import pandas as pd
import sys
from numpy.lib.recfunctions import unstructured_to_structured

### functions

In [44]:
def get_size(variable):
    """Print the memory footprint and shape of an array or DataFrame.

    Parameters
    ----------
    variable : pandas.DataFrame or array-like
        A DataFrame is measured with ``sys.getsizeof``; anything else must
        expose ``nbytes`` (e.g. a NumPy ndarray or structured array). The
        object must also expose ``shape``.

    Notes
    -----
    The size is reported in decimal units (1 Kilobyte = 1_000 bytes,
    1 Megabyte = 1_000_000 bytes). Nothing is returned; the summary is
    printed so that it shows up as cell output.
    """
    if isinstance(variable, pd.DataFrame):
        size = sys.getsizeof(variable)
    else:
        size = variable.nbytes

    # Inclusive thresholds: exactly 1_000_000 bytes is "1.0 Megabytes",
    # not "1000.0 Kilobytes" (and likewise for the Kilobyte boundary).
    if size >= 1_000_000:
        size = size / 1_000_000
        units = 'Megabytes'
    elif size >= 1_000:
        size = size / 1_000
        units = 'Kilobytes'
    else:
        units = 'Bytes'
    print(f"{size} {units} and shape {variable.shape}.")


def to_structured(arr, dtype):
    """Convert an unstructured array into a structured array.

    Thin wrapper around
    ``numpy.lib.recfunctions.unstructured_to_structured``: ``arr`` is the
    plain array to convert and ``dtype`` is the structured dtype describing
    the fields of the result.
    """
    structured = unstructured_to_structured(arr, dtype=dtype)
    return structured
    

### review

In [45]:
example_size = (10_000_000, 3)

In [46]:
# Numpy Array large data example
a_data = np.ones(example_size)
get_size(a_data)

240.0 Megabytes and shape (10000000, 3).


In [47]:
# Structured Array
sa_data = to_structured(a_data, np.dtype([('c1', np.float64), ('c2', np.float64), ('c3', np.float64)]))
get_size(sa_data)

240.0 Megabytes and shape (10000000,).


In [48]:
# Pandas
df = pd.DataFrame(a_data, columns = ["c1", "c2", "c3"])
get_size(df)

240.000152 Megabytes and shape (10000000, 3).


### save

In [7]:
np.save("file", a_data)

In [10]:
np.save("file2", sa_data)

In [8]:
np.savetxt("file.csv", a_data, delimiter=",")

In [9]:
# NOTE(review): to_csv writes the row index as an extra, unnamed first column
# by default, so this file is not column-for-column comparable with the
# np.savetxt output above and read_csv will load it back as an additional
# column. Pass index=False if only the data columns are wanted.
df.to_csv("file2.csv")

### load

In [20]:
a_data2 = np.load("file.npy")
get_size(a_data2)

240.00012 Megabytes and shape (10000000, 3).


In [21]:
sa_data2 = np.load("file2.npy")
get_size(sa_data2)

240.000104 Megabytes and shape (10000000,).


In [25]:
np.genfromtxt("file.csv", delimiter=',')

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       ...,
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [26]:
df2 = pd.read_csv("file2.csv")

### operations / sum

In [29]:
a_data[:,0].sum()

10000000.0

In [30]:
sa_data["c1"].sum()

10000000.0

In [31]:
df["c1"].sum()

10000000.0

### downgrade dtypes

In [59]:
# Downcast every column to half precision with one frame-level astype call
# instead of a per-column Python loop.
# NOTE: float16 keeps only ~3 significant decimal digits and overflows above
# 65504; it is lossless here only because every value is 1.0.
df = df.astype(np.float16)
get_size(df)

60.000152 Megabytes and shape (10000000, 3).


In [60]:
df.dtypes

c1    float16
c2    float16
c3    float16
dtype: object

In [62]:
df.to_csv("file3.csv")

### save Pandas DF as Structured Array npy

In [75]:
# Turn the DataFrame into a record array; index=False leaves the row index
# out so only the named columns become fields.
records = df.to_records(index=False)
# Rebuild it as a plain ndarray using the same field description, so the
# column names travel with the data as structured-dtype field names.
data = np.array(records, dtype = records.dtype.descr)
# np.save appends the ".npy" extension, producing file3.npy.
np.save("file3", data)

```
-rw-r--r--   1 juan  staff   715M Oct 15 00:19 file.csv
-rw-r--r--   1 juan  staff   229M Oct 15 00:18 file.npy
-rw-r--r--   1 juan  staff   190M Oct 15 00:19 file2.csv
-rw-r--r--   1 juan  staff   229M Oct 15 00:19 file2.npy
-rw-r--r--   1 juan  staff   190M Oct 15 00:19 file3.csv
-rw-r--r--   1 juan  staff    57M Oct 15 01:23 file3.npy
```

In [79]:
df3 = pd.DataFrame(np.load("file3.npy"))
get_size(df3)

60.000152 Megabytes and shape (10000000, 3).


# Conclusions

- Para el uso normal, Pandas ofrece más funcionalidades útiles que Numpy, entre las que se destaca el GroupBy.
- Para datasets pequeños, el mejor rendimiento es el de Numpy, aunque no por mucho; para datasets grandes ocurre lo contrario.
- Para guardar grandes cantidades de información, lo mejor es usar npy/npz (formato de Numpy). Para no perder el header (los nombres de las columnas), se pueden usar Structured Arrays de Numpy.