<h1>3장 Pandas로 데이터 가공하기</h1>

<h2>고성능 Pandas: eval()과 query()</h2>

### query()와 eval()의 등장 배경: 복합 표현식

In [1]:
import numpy as np
# Seeded legacy generator so the sample arrays are reproducible across runs.
rng = np.random.RandomState(seed=42)
# Two draws of 100000 uniform samples each; x is drawn first, then y.
x, y = rng.rand(100000), rng.rand(100000)
# Time the vectorized element-wise addition of the two sample arrays.
%timeit x + y

66.4 µs ± 821 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [2]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x,y)), dtype = x.dtype, count=len(x))

18.9 ms ± 341 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Compound boolean expression: element-wise True where x > 0.5 and y < 0.5.
mask = (x > 0.5) & (y < 0.5)

In [15]:
# The same mask written out step by step, with each intermediate
# boolean array bound to its own name before the final combination.
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2

In [16]:
import numexpr
# Evaluate the same compound expression, given as a string, with numexpr,
# then confirm the result matches the mask computed with NumPy operators.
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)

True

### 효율적인 연산을 위한 pandas.eval()

In [17]:
import pandas as pd
# Dimensions shared by the four benchmark frames.
nrows = 100000
ncols = 100
# Re-seed so the frames do not depend on how much of the stream earlier cells consumed.
rng = np.random.RandomState(seed=42)
# Four frames drawn one after another from the same generator.
df1, df2, df3, df4 = [pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)]

In [18]:
# Baseline: time the sum of the four DataFrames using ordinary pandas operators.
%timeit df1 + df2 + df3 + df4

80.8 ms ± 4.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# Time the same sum when the expression is handed to pd.eval() as a string.
%timeit pd.eval('df1 + df2 + df3 + df4')

38.7 ms ± 176 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Both ways of computing the sum should agree.
np.allclose(
    df1 + df2 + df3 + df4,
    pd.eval('df1 + df2 + df3 + df4'),
)

True

#### pd.eval()이 지원하는 연산

In [31]:
# Five small integer frames (100 rows x 3 columns, values drawn from [0, 1000))
# used by the operator examples that follow.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.randint(0, 1000, (100, 3))) for _ in range(5)
]

In [35]:
# Arithmetic operators: compute the expression with plain pandas and with
# pd.eval() on the equivalent string, then check that the two agree.
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

In [36]:
# Comparison operators: pd.eval() accepts a chained comparison in the string,
# which is checked here against the explicit pairwise comparisons joined with &.
result3 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result4 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result3, result4)

True

In [37]:
# Bitwise operators: the same & / | expression evaluated by plain pandas
# and by pd.eval() from a string, then compared.
result5 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result6 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result5, result6)

True

In [41]:
# Same logic as the &/| expression, written with the literal `and` / `or`
# keywords that pd.eval() also accepts inside its expression string.
result7 = pd.eval('df1 < 0.5 and df2 < 0.5 or df3 < df4')
np.allclose(result5, result7)

True

In [42]:
# Attribute access (df2.T) and indexing ([0], .iloc[1]) used inside a
# pd.eval() expression string, compared with the plain pandas result.
result8 = df2.T[0] + df3.iloc[1]
result9 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result8, result9)

True

### 열 단위의 연산을 위한 DataFrame.eval()

In [43]:
# 1000-row frame of uniform samples with labeled columns A, B and C.
df = pd.DataFrame(rng.rand(1000, 3), columns=list('ABC'))
df.head()

Unnamed: 0,A,B,C
0,0.061761,0.925463,0.99742
1,0.209863,0.280456,0.042148
2,0.738991,0.019046,0.715501
3,0.062857,0.516241,0.604588
4,0.204537,0.813392,0.244804


In [44]:
# Column arithmetic two ways: plain pandas indexing versus pd.eval(), where
# the columns are reached through the frame name (df.A, df.B, df.C).
result1 = (df['A'] + df['B']) / (df['C']-1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [45]:
# DataFrame.eval() lets the expression refer to columns by bare name.
result3 = df.eval('(A+B)/(C-1)')
np.allclose(result1, result3)

True

#### DataFrame.eval()에서의 할당

In [46]:
# Show the frame before any column is added through eval().
df.head()

Unnamed: 0,A,B,C
0,0.061761,0.925463,0.99742
1,0.209863,0.280456,0.042148
2,0.738991,0.019046,0.715501
3,0.062857,0.516241,0.604588
4,0.204537,0.813392,0.244804


In [47]:
# Assignment inside eval(): creates column D from A, B and C.
# inplace=True modifies df itself instead of returning a new frame.
df.eval('D = (A + B) / C', inplace = True)
df.head()

Unnamed: 0,A,B,C,D
0,0.061761,0.925463,0.99742,0.989777
1,0.209863,0.280456,0.042148,11.633339
2,0.738991,0.019046,0.715501,1.05945
3,0.062857,0.516241,0.604588,0.95784
4,0.204537,0.813392,0.244804,4.158143


In [48]:
# Assigning to a name that already exists overwrites column D in place.
df.eval('D = (A - B) / C', inplace = True)
df.head()

Unnamed: 0,A,B,C,D
0,0.061761,0.925463,0.99742,-0.865935
1,0.209863,0.280456,0.042148,-1.674903
2,0.738991,0.019046,0.715501,1.00621
3,0.062857,0.516241,0.604588,-0.749906
4,0.204537,0.813392,0.244804,-2.487117


#### DataFrame.eval()의 지역 변수

In [50]:
# Mean along axis 1, i.e. one value per row taken across the frame's columns.
column_mean = df.mean(axis=1)
result1 = df['A'] + column_mean
# Inside eval(), the @ prefix refers to a Python variable rather than a column.
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

#### DataFrame.query() 메서드

In [53]:
# Row filter with a boolean mask, and the same filtering expression
# passed to pd.eval() as a string; the two results are then compared.
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [54]:
# The same filter through DataFrame.query(), using bare column names
# and the `and` keyword in the query string.
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)

True

In [55]:
# query() can reference a Python variable with the @ prefix; here the
# threshold is the mean of column C.
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True

### 성능: 이 함수를 사용해야 하는 경우

In [56]:
# Compound filter written as a single expression.
# NOTE(review): this rebinds `x`, which held a NumPy array in the first cell.
x = df[(df.A < 0.5) & (df.B < 0.5)]

In [57]:
# The same filter spelled out with each intermediate boolean mask bound
# to an explicit name before it is used to index the frame.
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2 
x = df[tmp3]

In [58]:
# Total number of bytes taken by the elements of the frame's underlying array.
df.values.nbytes

32000