# High performance pandas/Python

## pd.eval()

A useful tool for high-performance computation: `pd.eval()` evaluates an expression that is passed to it as a string.

In [1]:
import numpy as np
import pandas as pd


In [5]:
# Benchmark dimensions: one million rows by one hundred columns.
nrows = 10**6
ncols = 10**2

# Build four separate frames of standard-normal noise, all with the same
# shape, by unpacking a generator (evaluated in order, one frame at a time).
df1, df2, df3, df4 = (
    pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)
)

# Print a summary (index range, columns, dtypes) of one of the frames.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 100 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   0       1000000 non-null  float64
 1   1       1000000 non-null  float64
 2   2       1000000 non-null  float64
 3   3       1000000 non-null  float64
 4   4       1000000 non-null  float64
 5   5       1000000 non-null  float64
 6   6       1000000 non-null  float64
 7   7       1000000 non-null  float64
 8   8       1000000 non-null  float64
 9   9       1000000 non-null  float64
 10  10      1000000 non-null  float64
 11  11      1000000 non-null  float64
 12  12      1000000 non-null  float64
 13  13      1000000 non-null  float64
 14  14      1000000 non-null  float64
 15  15      1000000 non-null  float64
 16  16      1000000 non-null  float64
 17  17      1000000 non-null  float64
 18  18      1000000 non-null  float64
 19  19      1000000 non-null  float64
 20  20      1000000 non-null

In [6]:
%timeit df1 + df2 + df3 + df4
# Plain pandas arithmetic: about 1.4 s per loop (roughly 11 s for the whole %timeit run).

1.4 s ± 28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%timeit pd.eval("df1 + df2 + df3 + df4")
# pd.eval() on the same expression: about 0.7 s per loop (roughly 5 s in total), i.e. about half the time.

679 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Sanity check: the string expression evaluated by pd.eval() must give
# exactly the same frame as the ordinary chained addition.
sum_eval = pd.eval("df1 + df2 + df3 + df4")
plain = df1 + df2 + df3 + df4

plain.equals(sum_eval)

True

In [10]:
# df.eval(): evaluate an expression string against the columns of one frame.
# Six throws of three six-sided dice. np.random.randint excludes the upper
# bound, so it has to be 7 for the face value 6 to be possible.
rolls = pd.DataFrame(
    np.random.randint(1, 7, (6, 3)), columns=["Die1", "Die2", "Die3"]
)
# An assignment inside the expression adds the result as a new column;
# inplace=True writes it onto rolls instead of returning a new frame.
rolls.eval("Sum = Die1 + Die2 + Die3", inplace=True)
rolls


Unnamed: 0,Die1,Die2,Die3,Sum
0,4,3,4,11
1,2,1,1,4
2,5,1,1,7
3,5,1,2,8
4,2,2,2,6
5,5,3,2,10


In [12]:
# Python variables from the surrounding scope can be used inside the
# expression string by prefixing them with "@"; Sum here is a column.
high = 9
rolls.eval(
    "Winner = Sum > @high",
    inplace=True,
)
rolls

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,4,3,4,11,True
1,2,1,1,4,False
2,5,1,1,7,False
3,5,1,2,8,False
4,2,2,2,6,False
5,5,3,2,10,True


In [14]:
# Extract the losers the classic way, with a boolean mask.
# A roll wins when Sum > high, so the losers are the rows with Sum <= high
# (a strict "<" would silently drop a roll whose Sum equals high).
rolls[rolls["Sum"] <= high]

# Boolean masks are handy for small data sets and a few conditions,
# but the expression gets long when more conditions are added.

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
1,2,1,1,4,False
2,5,1,1,7,False
3,5,1,2,8,False
4,2,2,2,6,False


## Query()

Useful when filtering on several conditions: just put `&` between the individual expressions.

In [15]:
# The query method: filter rows with a condition written as a string.
# Sum is a column of rolls; @high refers to the Python variable high.
rolls.query("Sum <= @high")



Unnamed: 0,Die1,Die2,Die3,Sum,Winner
1,2,1,1,4,False
2,5,1,1,7,False
3,5,1,2,8,False
4,2,2,2,6,False


In [4]:
# Load the athlete events CSV into a DataFrame and preview the first rows.
# NOTE(review): the relative path climbs two directories and enters another
# project folder, so this cell presumably only runs when that folder is
# present at that location; confirm the expected directory layout.
df_os = pd.read_csv("../../Databehandling-OS-Yuna-Joachim/data/athlete_events.csv")
df_os.head()


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [6]:
# Compare a boolean mask with query() for a single string condition.
%timeit df_os[df_os["NOC"] ==  "SWE"]
%timeit df_os.query("NOC == 'SWE'") # SWE needs its own inner quotes because it is a string literal inside the query string

# Much faster with query: about 12 ms instead of 21 ms per loop.


21 ms ± 357 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
12 ms ± 148 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
%timeit df_os[df_os["Height"] > 180]
%timeit df_os.query("Height > 180") # no inner quotes here: Height is a column name and 180 is a number, not a string

# Here query is slower (about 21 ms versus 15 ms per loop). The classic mask suits a single simple condition; query pays off with many conditions.

14.6 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.1 ms ± 811 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%timeit df_os[(df_os["Sex"] > "F") & (df_os["Height"] > 180) & (df_os["NOC"] == "SWE")]
%timeit df_os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'")

# Now  query is faster! Both faster and nicer to write.

36.1 ms ± 600 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
19.5 ms ± 292 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
