## 데이터 연산

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Series of five random integers drawn from 0 through 9.
s = pd.Series(data=np.random.randint(low=0, high=10, size=5))
s

0    8
1    8
2    0
3    7
4    2
dtype: int64

In [None]:
# 3x3 frame of random integers in [0, 10) with columns A, B and C.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(3, 3)),
    columns=list('ABC'),
)
df

Unnamed: 0,A,B,C
0,6,6,7
1,2,0,7
2,6,7,8


In [None]:
# NumPy ufuncs apply element-wise to a Series and keep its index.
np.exp(s)

0    2980.957987
1    2980.957987
2       1.000000
3    1096.633158
4       7.389056
dtype: float64

In [None]:
# Element-wise cosine of a DataFrame; row and column labels are preserved.
np.cos(df * np.pi / 4)

Unnamed: 0,A,B,C
0,-1.83697e-16,-1.83697e-16,0.707107
1,6.123234000000001e-17,1.0,0.707107
2,-1.83697e-16,0.7071068,1.0


In [None]:
# Series arithmetic aligns on index labels rather than positions;
# a label found in only one operand produces NaN in the result.
s1 = pd.Series(data=[1, 3, 5, 7, 9], index=list(range(0, 5)))
s2 = pd.Series(data=[2, 4, 6, 8, 10], index=list(range(1, 6)))
s1 + s2

0     NaN
1     5.0
2     9.0
3    13.0
4    17.0
5     NaN
dtype: float64

In [None]:
# Treat a value missing from one operand as 0 instead of producing NaN.
s1.add(s2, fill_value=0)

0     1.0
1     5.0
2     9.0
3    13.0
4    17.0
5    10.0
dtype: float64

In [None]:
# 3x3 frame of random integers in [0, 20) with columns A, C and D.
df1 = pd.DataFrame(
    data=np.random.randint(low=0, high=20, size=(3, 3)),
    columns=['A', 'C', 'D'],
)
df1

Unnamed: 0,A,C,D
0,15,9,1
1,19,2,16
2,0,17,9


In [None]:
# 5x5 frame of random integers in [0, 20) with columns B, A, E, C and D.
df2 = pd.DataFrame(
    data=np.random.randint(low=0, high=20, size=(5, 5)),
    columns=['B', 'A', 'E', 'C', 'D'],
)
df2

Unnamed: 0,B,A,E,C,D
0,17,9,13,15,0
1,3,11,3,10,7
2,2,2,14,11,16
3,11,2,10,4,11
4,5,13,16,15,14


In [None]:
# Rows and columns are aligned by label; a cell missing from either frame becomes NaN.
df1 + df2

Unnamed: 0,A,B,C,D,E
0,24.0,,24.0,1.0,
1,30.0,,12.0,23.0,
2,2.0,,28.0,25.0,
3,,,,,
4,,,,,


In [None]:
# stack() collapses df1 into a single Series, so this is the mean of all its values.
fvalue = df1.stack().mean()
# Use that mean in place of a value that is missing from one of the two frames.
df1.add(df2, fill_value=fvalue)

Unnamed: 0,A,B,C,D,E
0,24.0,26.777778,24.0,1.0,22.777778
1,30.0,12.777778,12.0,23.0,12.777778
2,2.0,11.777778,28.0,25.0,23.777778
3,11.777778,20.777778,13.777778,20.777778,19.777778
4,22.777778,14.777778,24.777778,23.777778,25.777778


### 연산자 범용 함수

#### add()

In [None]:
# 3x3 array of random integers from 1 through 9.
a = np.random.randint(low=1, high=10, size=(3, 3))
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a + a[0]    # broadcasting: a[0] is added to every row of a

array([[18, 14, 10],
       [12, 15, 14],
       [18,  9,  9]])

In [None]:
df = pd.DataFrame(a, columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df + df.iloc[0]

Unnamed: 0,A,B,C
0,18,14,10
1,12,15,14
2,18,9,9


In [None]:
df.add(df.iloc[0])    # pandas builds on NumPy, so operators and the equivalent methods are interchangeable

Unnamed: 0,A,B,C
0,18,14,10
1,12,15,14
2,18,9,9


#### sub() / subtract()

In [None]:
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a - a[0]

array([[ 0,  0,  0],
       [-6,  1,  4],
       [ 0, -5, -1]])

In [None]:
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df - df.iloc[0]

Unnamed: 0,A,B,C
0,0,0,0
1,-6,1,4
2,0,-5,-1


In [None]:
df.sub(df.iloc[0])

Unnamed: 0,A,B,C
0,0,0,0
1,-6,1,4
2,0,-5,-1


In [None]:
df.subtract(df['B'], axis=0)    # the frame is 2-D, so name the axis: axis=0 matches the Series on the row index

Unnamed: 0,A,B,C
0,2,0,-2
1,-5,0,1
2,7,0,2


#### mul() / multiply()

In [None]:
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a * a[1]

array([[27, 56, 45],
       [ 9, 64, 81],
       [27, 16, 36]])

In [None]:
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df * df.iloc[1]

Unnamed: 0,A,B,C
0,27,56,45
1,9,64,81
2,27,16,36


In [None]:
df.mul(df.iloc[1])

Unnamed: 0,A,B,C
0,27,56,45
1,9,64,81
2,27,16,36


In [None]:
df.multiply(df.iloc[1])

Unnamed: 0,A,B,C
0,27,56,45
1,9,64,81
2,27,16,36


#### truediv() / div() / divide() / floordiv()

In [None]:
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a / a[0]

array([[1.        , 1.        , 1.        ],
       [0.33333333, 1.14285714, 1.8       ],
       [1.        , 0.28571429, 0.8       ]])

In [None]:
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df / df.iloc[0]

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.333333,1.142857,1.8
2,1.0,0.285714,0.8


In [None]:
df.truediv(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.333333,1.142857,1.8
2,1.0,0.285714,0.8


In [None]:
df.div(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.333333,1.142857,1.8
2,1.0,0.285714,0.8


In [None]:
df.divide(df.iloc[0])

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.333333,1.142857,1.8
2,1.0,0.285714,0.8


In [None]:
a // a[0]

array([[1, 1, 1],
       [0, 1, 1],
       [1, 0, 0]])

In [None]:
df.floordiv(df.iloc[0])

Unnamed: 0,A,B,C
0,1,1,1
1,0,1,1
2,1,0,0


#### mod()

In [None]:
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a % a[0]

array([[0, 0, 0],
       [3, 1, 4],
       [0, 2, 4]])

In [None]:
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df.mod(df.iloc[0])

Unnamed: 0,A,B,C
0,0,0,0
1,3,1,4
2,0,2,4


#### pow()

In [None]:
a

array([[9, 7, 5],
       [3, 8, 9],
       [9, 2, 4]])

In [None]:
a ** a[0]

array([[387420489,    823543,      3125],
       [    19683,   2097152,     59049],
       [387420489,       128,      1024]])

In [None]:
df

Unnamed: 0,A,B,C
0,9,7,5
1,3,8,9
2,9,2,4


In [None]:
df.pow(df.iloc[0])

Unnamed: 0,A,B,C
0,387420489,823543,3125
1,19683,2097152,59049
2,387420489,128,1024


In [None]:
# First row, taking every second column (A and C).
row = df.iloc[0, ::2]
row

A    9
C    5
Name: 0, dtype: int64

In [None]:
df - row    # subtracts only the selected columns (A, C); B has no match in row and becomes NaN

Unnamed: 0,A,B,C
0,0.0,,0.0
1,-6.0,,4.0
2,0.0,,-1.0


### 정렬(Sort)

In [None]:
# Values 0 through 4 under the unordered labels A, D, B, C, E.
s = pd.Series(data=range(5), index=list('ADBCE'))
s

A    0
D    1
B    2
C    3
E    4
dtype: int64

In [None]:
s.sort_index()

A    0
B    2
C    3
D    1
E    4
dtype: int64

In [None]:
s.sort_values()

A    0
D    1
B    2
C    3
E    4
dtype: int64

In [None]:
# 4x4 frame of random integers in [0, 10) whose row and column labels are out of order.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(4, 4)),
    index=[2, 4, 1, 3],
    columns=['B', 'D', 'A', 'C'],
)
df

Unnamed: 0,B,D,A,C
2,7,5,0,2
4,2,5,7,9
1,0,1,1,5
3,1,5,4,0


In [None]:
df.sort_index()

Unnamed: 0,B,D,A,C
1,0,1,1,5
2,7,5,0,2
3,1,5,4,0
4,2,5,7,9


In [None]:
df.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2,0,7,2,5
4,7,2,9,5
1,1,0,5,1
3,4,1,0,5


In [None]:
df.sort_values(by='A')    # a DataFrame is 2-D, so a sort key is required; rows are ordered by the values in column A

Unnamed: 0,B,D,A,C
2,7,5,0,2
1,0,1,1,5
3,1,5,4,0
4,2,5,7,9


In [None]:
# Sort by column A, using column C to order rows whose A values tie.
df.sort_values(by=['A','C'])

Unnamed: 0,B,D,A,C
2,7,5,0,2
1,0,1,1,5
3,1,5,4,0
4,2,5,7,9


### 순위(Ranking)

In [None]:
# Ten integers; the value 7 occurs twice, which creates a tie when ranking.
s = pd.Series(data=[-2, 4, 7, 3, 0, 7, 5, -4, 2, 6])
s

0   -2
1    4
2    7
3    3
4    0
5    7
6    5
7   -4
8    2
9    6
dtype: int64

In [None]:
s.rank()    # default method='average': tied values share the mean of their ranks (the two 7s both get 9.5)

0    2.0
1    6.0
2    9.5
3    5.0
4    3.0
5    9.5
6    7.0
7    1.0
8    4.0
9    8.0
dtype: float64

In [None]:
s.rank(method='first')    # ties are ranked in order of appearance: the earlier value gets the smaller rank

0     2.0
1     6.0
2     9.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

In [None]:
s.rank(method='max')    # tied values all receive the largest rank of their group

0     2.0
1     6.0
2    10.0
3     5.0
4     3.0
5    10.0
6     7.0
7     1.0
8     4.0
9     8.0
dtype: float64

### 고성능 연산

In [None]:
# Four separately generated 100000 x 100 frames of random floats, used by the timing cells.
nrows, ncols = 100000, 100
df1, df2, df3, df4 = (pd.DataFrame(np.random.rand(nrows, ncols)) for i in range(4))

In [None]:
%timeit df1 + df2 + df3 + df4

10 loops, best of 5: 59.6 ms per loop


In [None]:
%timeit pd.eval('df1 + df2 + df3 + df4')    # pd.eval evaluates the expression from a string; faster here than the plain operators

10 loops, best of 5: 41.6 ms per loop


In [None]:
%timeit df1 * -df2 / (-df3 * df4)

10 loops, best of 5: 107 ms per loop


In [None]:
%timeit pd.eval('df1 * -df2 / (-df3 * df4)')

10 loops, best of 5: 51.2 ms per loop


In [None]:
%timeit (df1 < df2) & (df2 <= df3) & (df3 != df4)

10 loops, best of 5: 60.4 ms per loop


In [None]:
%timeit pd.eval('(df1 < df2) & (df2 <= df3) & (df3 != df4)')

10 loops, best of 5: 92.4 ms per loop


In [None]:
# One million rows of random floats in five columns, A through E.
df = pd.DataFrame(data=np.random.rand(1000000, 5), columns=list('ABCDE'))
df.head()

Unnamed: 0,A,B,C,D,E
0,0.377602,0.795581,0.128022,0.927095,0.952023
1,0.829837,0.904401,0.765395,0.716236,0.641818
2,0.064382,0.780903,0.074593,0.849271,0.432093
3,0.71774,0.20664,0.919457,0.098118,0.211042
4,0.361087,0.454134,0.092139,0.536303,0.547374


In [None]:
%timeit df['A'] + df['B'] / df['C'] -df['D'] * df['E']

10 loops, best of 5: 20.7 ms per loop


In [None]:
%timeit pd.eval('df.A + df.B / df.C -df.D * df.E')

100 loops, best of 5: 8.33 ms per loop


In [None]:
%timeit(df.eval('A + B / C - D * E'))

100 loops, best of 5: 13 ms per loop


In [None]:
# An assignment inside eval() with inplace=True adds the result to df as column R.
df.eval('R = A + B / C - D * E', inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,R
0,0.377602,0.795581,0.128022,0.927095,0.952023,5.709385
1,0.829837,0.904401,0.765395,0.716236,0.641818,1.551758
2,0.064382,0.780903,0.074593,0.849271,0.432093,10.166288
3,0.71774,0.20664,0.919457,0.098118,0.211042,0.921775
4,0.361087,0.454134,0.092139,0.536303,0.547374,4.996297


In [None]:
# Assigning to a column name that already exists overwrites that column in place.
df.eval('R = A - B / C + D * E', inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,R
0,0.377602,0.795581,0.128022,0.927095,0.952023,-4.954182
1,0.829837,0.904401,0.765395,0.716236,0.641818,0.107916
2,0.064382,0.780903,0.074593,0.849271,0.432093,-10.037523
3,0.71774,0.20664,0.919457,0.098118,0.211042,0.513705
4,0.361087,0.454134,0.092139,0.536303,0.547374,-4.274124


In [None]:
# mean(axis=1) averages across the columns, producing one value per row.
col_mean = df.mean(axis=1)
df.A + col_mean

0         0.081958
1         1.490771
2        -1.241664
3         1.162190
4        -0.019428
            ...   
999995    0.670544
999996    1.258830
999997    1.343470
999998    1.095294
999999    1.514238
Length: 1000000, dtype: float64

In [None]:
df.eval('A + @col_mean')    # the @ prefix refers to a Python variable defined outside the frame

0         0.081958
1         1.490771
2        -1.241664
3         1.162190
4        -0.019428
            ...   
999995    0.670544
999996    1.258830
999997    1.343470
999998    1.095294
999999    1.514238
Length: 1000000, dtype: float64

In [None]:
df[(df.A < 0.5) & (df.B < 0.5) & (df.C > 0.5)]

Unnamed: 0,A,B,C,D,E,R
5,0.224065,0.122682,0.654769,0.763448,0.493707,0.413618
12,0.040334,0.413161,0.975840,0.418697,0.053922,-0.360480
14,0.391268,0.228132,0.523002,0.938175,0.188566,0.131978
16,0.179296,0.448222,0.893680,0.288426,0.678406,-0.126580
17,0.401679,0.286511,0.661606,0.637057,0.976375,0.590632
...,...,...,...,...,...,...
999987,0.280954,0.405498,0.824981,0.776644,0.563515,0.227081
999991,0.487447,0.137172,0.724615,0.050704,0.550480,0.326056
999992,0.209815,0.411734,0.544573,0.441876,0.267326,-0.428129
999995,0.362032,0.013831,0.769276,0.144247,0.190199,0.371488


In [None]:
pd.eval('df[(df.A < 0.5) & (df.B < 0.5) & (df.C > 0.5)]')

Unnamed: 0,A,B,C,D,E,R
5,0.224065,0.122682,0.654769,0.763448,0.493707,0.413618
12,0.040334,0.413161,0.975840,0.418697,0.053922,-0.360480
14,0.391268,0.228132,0.523002,0.938175,0.188566,0.131978
16,0.179296,0.448222,0.893680,0.288426,0.678406,-0.126580
17,0.401679,0.286511,0.661606,0.637057,0.976375,0.590632
...,...,...,...,...,...,...
999987,0.280954,0.405498,0.824981,0.776644,0.563515,0.227081
999991,0.487447,0.137172,0.724615,0.050704,0.550480,0.326056
999992,0.209815,0.411734,0.544573,0.441876,0.267326,-0.428129
999995,0.362032,0.013831,0.769276,0.144247,0.190199,0.371488


In [None]:
# query() takes the filter as a string in which column names are used directly.
df.query('A < 0.5 and B < 0.5 and C > 0.5')

Unnamed: 0,A,B,C,D,E,R
5,0.224065,0.122682,0.654769,0.763448,0.493707,0.413618
12,0.040334,0.413161,0.975840,0.418697,0.053922,-0.360480
14,0.391268,0.228132,0.523002,0.938175,0.188566,0.131978
16,0.179296,0.448222,0.893680,0.288426,0.678406,-0.126580
17,0.401679,0.286511,0.661606,0.637057,0.976375,0.590632
...,...,...,...,...,...,...
999987,0.280954,0.405498,0.824981,0.776644,0.563515,0.227081
999991,0.487447,0.137172,0.724615,0.050704,0.550480,0.326056
999992,0.209815,0.411734,0.544573,0.441876,0.267326,-0.428129
999995,0.362032,0.013831,0.769276,0.144247,0.190199,0.371488


In [None]:
# Keep the rows whose A and B values are both below the mean of column D.
col_mean = df.D.mean()
df.loc[(df['A'] < col_mean) & (df['B'] < col_mean)]

Unnamed: 0,A,B,C,D,E,R
4,0.361087,0.454134,0.092139,0.536303,0.547374,-4.274124
5,0.224065,0.122682,0.654769,0.763448,0.493707,0.413618
12,0.040334,0.413161,0.975840,0.418697,0.053922,-0.360480
13,0.422618,0.009301,0.062506,0.615288,0.672384,0.687520
14,0.391268,0.228132,0.523002,0.938175,0.188566,0.131978
...,...,...,...,...,...,...
999990,0.151054,0.247723,0.161361,0.243156,0.316683,-1.307156
999991,0.487447,0.137172,0.724615,0.050704,0.550480,0.326056
999992,0.209815,0.411734,0.544573,0.441876,0.267326,-0.428129
999995,0.362032,0.013831,0.769276,0.144247,0.190199,0.371488


In [None]:
# Inside query(), @col_mean reads the Python variable col_mean rather than a column.
df.query('A < @col_mean and B < @col_mean')

Unnamed: 0,A,B,C,D,E,R
4,0.361087,0.454134,0.092139,0.536303,0.547374,-4.274124
5,0.224065,0.122682,0.654769,0.763448,0.493707,0.413618
12,0.040334,0.413161,0.975840,0.418697,0.053922,-0.360480
13,0.422618,0.009301,0.062506,0.615288,0.672384,0.687520
14,0.391268,0.228132,0.523002,0.938175,0.188566,0.131978
...,...,...,...,...,...,...
999990,0.151054,0.247723,0.161361,0.243156,0.316683,-1.307156
999991,0.487447,0.137172,0.724615,0.050704,0.550480,0.326056
999992,0.209815,0.411734,0.544573,0.441876,0.267326,-0.428129
999995,0.362032,0.013831,0.769276,0.144247,0.190199,0.371488
