In [5]:
# High-performance pandas: eval() and query()

# Motivation for query() and eval(): compound expressions
import numpy as np
rng = np.random.RandomState(42)  # fixed seed so the sample arrays are reproducible
x = rng.rand(1000000)
y = rng.rand(1000000)
# Baseline: vectorized element-wise addition of the two arrays.
%timeit x + y

# The same addition performed element by element through a Python generator;
# the timing recorded below is far slower than the vectorized version.
%timeit np.fromiter((xi + yi for xi, yi in zip(x,y)), dtype=x.dtype, count=len(x))

3.08 ms ± 94.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
191 ms ± 5.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Compound expression. Written out step by step it is equivalent to
#   tmp1 = (x > 0.5)
#   tmp2 = (y < 0.5)
#   mask = tmp1 & tmp2
# i.e. every intermediate step is explicitly allocated in memory.
mask = (x > 0.5) & (y < 0.5)

# Numexpr: evaluates the same compound expression, given as a string, more efficiently
import numexpr
mask_numexpr = numexpr.evaluate("(x > 0.5) & (y < 0.5)")
# Both results are boolean masks, so compare them exactly (same shape, same
# elements) instead of using a floating-point tolerance check.
np.array_equal(mask, mask_numexpr)

True

In [16]:
# Efficient operations with pandas.eval()
import pandas as pd
nrows, ncols = 1000000, 100  # each frame holds nrows * ncols = 100 million values
rng = np.random.RandomState(42)  # re-seeded here, so the frames do not depend on earlier draws
# Four equally shaped frames filled from consecutive draws of the same generator.
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for i in range(4))

# The typical pandas approach: plain operator arithmetic
%timeit df1 + df2 + df3 + df4

1.16 s ± 17.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# pd.eval(expr): the same sum, with the expression passed as a string
%timeit pd.eval("df1 + df2 + df3 + df4") # same result, about 2.5x faster in the timings recorded in this notebook

465 ms ± 17.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Confirm that pd.eval() yields the same values as plain operator arithmetic.
np.allclose(df1 + df2 + df3 + df4, pd.eval("df1 + df2 + df3 + df4"))

True

In [24]:
# Operations supported by pd.eval(): build five small integer frames to try them on.
# Each frame is 100 rows x 3 columns of integers drawn from [0, 1000).
# Note that df1..df4 are rebound here, replacing the large frames used above.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.randint(0, 1000, (100, 3)))
    for _ in range(5)
]

In [32]:
def report_match(expected, actual):
    """Print whether ``expected`` and ``actual`` agree element-wise, then a blank line."""
    print(np.allclose(expected, actual), "\n")


# Arithmetic operators
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval("-df1 * df2 / (df3 + df4) - df5")
report_match(result1, result2)

# Comparison operators, including chained comparisons
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval("df1 < df2 <= df3 != df4")
report_match(result1, result2)

# Bitwise operators (& and |)
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval("(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)")
report_match(result1, result2)

# The same boolean expression spelled with the literal and / or keywords
result3 = pd.eval("(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)")
report_match(result1, result3)

True 

True 

True 

True 



In [47]:
# pd.eval() also supports attribute access (obj.attr) and indexing (obj[index])
result1 = df2.T[0] + df3.iloc[1] # transpose then select label 0; second row by position -> Series addition
result2 = pd.eval("df2.T[0] + df3.iloc[1]")
np.allclose(result1, result2)

True

In [62]:
# DataFrame.eval() for column-wise operations.
# Working frame: 1000 rows of random floats in three columns labelled A, B and C.
df = pd.DataFrame(rng.rand(1000, 3), columns=list("ABC"))
df.head()

Unnamed: 0,A,B,C
0,0.364333,0.677048,0.245342
1,0.151569,0.304225,0.214416
2,0.978796,0.642858,0.723475
3,0.128616,0.610127,0.941799
4,0.516463,0.980759,0.306461


In [63]:
# The same column arithmetic written twice: plain pandas, then pd.eval()
# with the columns reached through attribute access on df.
result1 = (df["A"] + df["B"]) / (df["C"] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [64]:
# DataFrame.eval(): columns are referred to directly by name inside the expression
result3 = df.eval("(A+B)/(C-1)")
# result1 is the plain-pandas value computed in the previous cell.
np.allclose(result1, result3)

True

In [65]:
# Column assignment with df.eval()
print(df.head(), "\n")

# Adds a new column D to df itself (inplace=True).
df.eval("D = (A + B) / C", inplace=True)
# Alternative: df["D"] = df.eval("(A + B) / C")

df.head()

          A         B         C
0  0.364333  0.677048  0.245342
1  0.151569  0.304225  0.214416
2  0.978796  0.642858  0.723475
3  0.128616  0.610127  0.941799
4  0.516463  0.980759  0.306461 



Unnamed: 0,A,B,C,D
0,0.364333,0.677048,0.245342,4.244606
1,0.151569,0.304225,0.214416,2.125743
2,0.978796,0.642858,0.723475,2.241478
3,0.128616,0.610127,0.941799,0.784395
4,0.516463,0.980759,0.306461,4.88553


In [66]:
# df.eval() can also overwrite an existing column
df.eval("D = (A - B) / C", inplace=True)
# Alternative: df["D"] = df.eval("(A - B) / C")

df.head()

Unnamed: 0,A,B,C,D
0,0.364333,0.677048,0.245342,-1.274606
1,0.151569,0.304225,0.214416,-0.711958
2,0.978796,0.642858,0.723475,0.46434
3,0.128616,0.610127,0.941799,-0.511268
4,0.516463,0.980759,0.306461,-1.515028


In [73]:
# Local variables inside df.eval()
# One mean per row, taken across all of that row's columns.
column_mean = df.mean(axis=1)
result1 = df.A + column_mean
# "@" makes the expression read a Python variable instead of a column name.
result2 = df.eval("A + @column_mean")
np.allclose(result1, result2)

True

In [83]:
# Leading up to DataFrame.query(): one row filter, written in plain pandas and through pd.eval()
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval("df[(df.A < 0.5) & (df.B < 0.5)]")
np.allclose(result1, result2)

True

In [84]:
# df.query(expr): the same row filter, with columns referred to by name
result2 = df.query("A < 0.5 and B < 0.5")
# result1 is still the plain-pandas filtered frame from the previous cell.
print(np.allclose(result1, result2))

# Local variables in .query() are referenced with "@"
Cmean = df["C"].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query("A < @Cmean and B < @Cmean")
print(np.allclose(result1, result2))

True
True


In [87]:
# df.query(): performance considerations

# NOTE(review): this rebinds x, which held a NumPy array in the first cell;
# re-running the earlier mask cell after this one would operate on a DataFrame.
x = df[(df.A < 0.5) & (df.B < 0.5)]

# is equal to
# tmp1 = df.A < 0.5
# tmp2 = df.B < 0.5
# tmp3 = tmp1 & tmp2
# x = df[tmp3]
# If the temporary objects are large compared with system memory -> use eval() / query()

# Size in bytes of the values held by df.
df.values.nbytes
# eval(), query() -> lower memory use and cleaner syntax

32000