# High performance pandas

```py
mask = (x > .5) & (y < .5)
df[mask]
```

Under the hood, every intermediate step is materialised as its own temporary array in memory:

```py
tmp1 = (x > .5)
tmp2 = (y < .5)
mask = tmp1 & tmp2
```

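Use `pd.eval("...")` or `df.query("...")` instead: pandas hands the whole expression to numexpr (when it is installed), which evaluates it element-wise without first allocating the full-size temporaries. Note that Python's built-in `eval()` is dangerous with user input because it executes arbitrary Python code (a user could, for example, import `os` and run system commands).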

In [3]:
import numpy as np
import pandas as pd

# Benchmark fixture: four independent 1,000,000 x 100 frames of standard-normal
# float64 values (about 763 MiB each, so roughly 3 GiB in total).
nrows, ncols = 1_000_000, 100

# Seeded generator so that "Restart Kernel -> Run All" reproduces the same numbers.
rng = np.random.default_rng(42)

df1, df2, df3, df4 = [pd.DataFrame(rng.standard_normal((nrows, ncols))) for _ in range(4)]
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.240002,0.221348,-0.191209,-1.277265,0.802772,1.196568,0.842733,1.226635,-1.273496,-0.039136,...,-0.478376,-0.118552,-0.80118,0.372092,-1.228955,1.644077,1.246179,1.293098,0.442528,-0.490947
1,1.257334,0.006287,-0.527977,1.438875,2.764666,0.813872,-0.618175,1.855524,-0.074325,0.21936,...,-0.304925,0.523838,0.263831,-0.760431,0.537971,-1.331695,0.145356,2.809021,0.794022,-1.54384
2,0.327551,0.170491,0.74271,1.17176,0.620171,0.008158,0.417658,0.477438,-0.656783,-1.016712,...,-0.654209,1.103377,-0.729743,0.503948,1.071096,-1.410962,0.520003,2.101902,-0.066119,0.277942
3,-0.626331,0.534974,1.16515,-0.032794,-0.874294,-0.510081,0.296605,-1.069105,-0.436282,0.118224,...,0.365723,-1.196706,0.311539,0.079013,0.208807,-0.605344,0.090398,-1.402895,-0.684906,0.054351
4,0.420122,0.425217,-1.324327,1.594111,0.297435,-0.301618,-0.048823,0.250451,-0.215314,0.004802,...,-0.427964,0.71293,-0.007842,-1.329134,0.695227,1.015195,-0.11461,-0.769926,0.883264,0.683139


In [5]:
# Sanity check of the frame dimensions as (rows, columns).
df1.shape

(1000000, 100)

In [6]:
# 1,000,000 rows x 100 float64 columns x 8 bytes = 800,000,000 bytes, i.e. about 763 MiB.
df1.info() # memory usage of a single frame: ~763 MiB!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 100 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   0       1000000 non-null  float64
 1   1       1000000 non-null  float64
 2   2       1000000 non-null  float64
 3   3       1000000 non-null  float64
 4   4       1000000 non-null  float64
 5   5       1000000 non-null  float64
 6   6       1000000 non-null  float64
 7   7       1000000 non-null  float64
 8   8       1000000 non-null  float64
 9   9       1000000 non-null  float64
 10  10      1000000 non-null  float64
 11  11      1000000 non-null  float64
 12  12      1000000 non-null  float64
 13  13      1000000 non-null  float64
 14  14      1000000 non-null  float64
 15  15      1000000 non-null  float64
 16  16      1000000 non-null  float64
 17  17      1000000 non-null  float64
 18  18      1000000 non-null  float64
 19  19      1000000 non-null  float64
 20  20      1000000 non-null

In [8]:
# The same four-frame sum timed twice: plain operator arithmetic vs. the
# expression handed to pd.eval as a string.
%timeit df1+df2+df3+df4
%timeit pd.eval("df1+df2+df3+df4") # eval() and query() save a lot of time on big data sets

1.05 s ± 32.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
414 ms ± 2.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Cross-check: the pd.eval result must be identical to ordinary operator arithmetic.
standard = df1 + df2 + df3 + df4
sum_eval = pd.eval("df1+df2+df3+df4")

standard.equals(sum_eval)

True

In [11]:
# Six throws of three six-sided dice.
# randint's upper bound is exclusive, so it must be 7 for a face value of 6 to be possible.
rolls = pd.DataFrame(np.random.randint(1, 7, (6, 3)), columns=["Die1", "Die2", "Die3"])
rolls

Unnamed: 0,Die1,Die2,Die3
0,3,3,5
1,2,3,1
2,1,5,5
3,1,1,2
4,3,4,5
5,5,2,4


In [13]:
# Add a "Sum" column holding the row-wise total of the three dice.
# inplace=True writes the new column into `rolls` itself instead of returning a new frame.
rolls.eval("Sum = Die1 + Die2 + Die3", inplace = True)
rolls

Unnamed: 0,Die1,Die2,Die3,Sum
0,3,3,5,11
1,2,3,1,6
2,1,5,5,11
3,1,1,2,4
4,3,4,5,12
5,5,2,4,11


In [20]:
# Three ways to use the threshold. Only the value of the last expression is
# rendered; the results of the first two are computed and discarded.
high = 9
rolls[rolls["Sum"] > high] # plain boolean indexing: returns the matching rows as a DataFrame
rolls.eval("Sum > @high")  # @name refers to a Python variable, a bare name to a column; returns a boolean Series
rolls.eval("Winner = Sum > @high")  # returns a DataFrame with the new column; without inplace=True `rolls` itself is unchanged

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,3,3,5,11,True
1,2,3,1,6,False
2,1,5,5,11,True
3,1,1,2,4,False
4,3,4,5,12,True
5,5,2,4,11,True


## Query

- filter using query

In [21]:
rolls.query("Sum > @high") # returns only the matching rows as a DataFrame, without having to create a helper column

Unnamed: 0,Die1,Die2,Die3,Sum
0,3,3,5,11
2,1,5,5,11
4,3,4,5,12
5,5,2,4,11


In [22]:
# Load the athlete events CSV; the path is relative to the current working directory.
df_os = pd.read_csv("../Data/athlete_events.csv")

In [26]:
# Two spellings of the same row filter; only the last expression is rendered.
df_os[df_os["NOC"] == "SWE"]
df_os.query("NOC == 'SWE'").head() # equivalent filter as a query string; .head() only limits the display to the first 5 rows

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
725,414,Arvid berg,M,26.0,,,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Hammer Throw,
726,415,Bjrn Olof Conny berg,M,23.0,181.0,76.0,Sweden,SWE,1992 Winter,1992,Winter,Albertville,Freestyle Skiing,Freestyle Skiing Men's Moguls,
727,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Long Jump,Bronze
728,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Triple Jump,Silver
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,


In [27]:
%timeit df_os[df_os["NOC"] == "SWE"]
%timeit df_os.query("NOC == 'SWE'").head() # faster

16.8 ms ± 516 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
9.5 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
# query is mostly faster when you have several conditions;
# with a single numeric comparison, plain boolean indexing was faster in this run.
%timeit df_os.query("Height > 180") # slower
%timeit df_os[df_os["Height"] > 180]

14.3 ms ± 576 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
11.2 ms ± 469 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [35]:
# Three conditions combined. In the first form each parenthesised comparison is
# evaluated to its own boolean Series before the results are and-ed together.
# Inside a query string `&` has the precedence of `and`, so the comparisons need no parentheses.
%timeit df_os[(df_os["Sex"] == "F") & (df_os["Height"] > 180) & (df_os["NOC"] == "SWE")]
%timeit df_os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'") # much faster

31.9 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
14.5 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Further reading: pandas user guide, "Enhancing performance":
# https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html