# Ch3.2 Operating on Data in Pandas

## Ufuncs: Index Preservation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Record the NumPy and pandas versions this notebook was run with.
np.__version__, pd.__version__

('1.17.4', '0.25.3')

In [3]:
# Seeded legacy generator so the random integers in this notebook are reproducible.
rng = np.random.RandomState(seed=42)
# Four integers drawn from [0, 10), wrapped in a Series with the default integer index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [4]:
# A NumPy ufunc is applied element-wise to the Series; the result is again a
# Series carrying the same index labels as `ser`.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [5]:
# 3x4 frame of integers drawn from [0, 10) using the seeded generator `rng`.
df = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [6]:
# Applied to a DataFrame, the ufunc returns a DataFrame with the same row
# index and column labels.
np.exp(df)

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158
2,1096.633158,7.389056,148.413159,54.59815


In [7]:
df['A']

0    6
1    7
2    7
Name: A, dtype: int32

In [8]:
# A single column is a Series; the ufunc result keeps its index and its name ('A').
np.exp(df['A'])

0     403.428793
1    1096.633158
2    1096.633158
Name: A, dtype: float64

In [9]:
df.iloc[1]

A    7
B    4
C    3
D    7
Name: 1, dtype: int32

In [10]:
df.iloc[1].index  # a row selected with iloc is a Series indexed by df's column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# The ufunc result for a row keeps the column labels as its index and the row label (1) as its name.
np.exp(df.iloc[1])

A    1096.633158
B      54.598150
C      20.085537
D    1096.633158
Name: 1, dtype: float64

In [12]:
df.iloc[1].index  # same check repeated: the row's index is df's column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
# Arithmetic and ufuncs compose: every entry is scaled by pi/4 and then passed
# through sine; the row and column labels are preserved in the result.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [14]:
# Unary minus on a DataFrame negates every entry (equivalent to np.negative(df)).
-df

Unnamed: 0,A,B,C,D
0,-6,-9,-2,-6
1,-7,-4,-3,-7
2,-7,-2,-5,-4


## Ufuncs: Index Alignment

### Index alignment in Series

In [15]:
# Two Series keyed by state name; the key sets overlap only partially
# (Alaska appears only in `area`, New York only in `population`).
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

In [16]:
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [17]:
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [18]:
# Division aligns the two Series on their index labels: the result's index is
# the union of both indices, and labels present in only one operand
# (Alaska, New York) are filled with NaN by default.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [19]:
# Check whether population / area contains any NaN.
(population / area).isna().any()

True

In [20]:
# Two Series whose integer indices overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [21]:
A

0    2
1    4
2    6
dtype: int64

In [22]:
B

1    1
2    3
3    5
dtype: int64

In [23]:
# Labels 0 and 3 exist in only one operand, so those entries become NaN and
# the result is upcast to float64.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [24]:
A.shape

(3,)

In [25]:
B.shape

(3,)

In [26]:
# Compare entry counts: the aligned sum has one entry per label in the union
# of both indices, so it is longer than either input.
(A + B).shape

(4,)

In [27]:
# Check whether the aligned sum contains any NaN.
(A + B).isna().any()

True

Using ``A.add(B)`` in place of ``A + B`` allows explicit specification of the fill value for any elements in ``A`` or ``B`` that might be missing:

In [28]:
# Avoid NaN in the result: an entry missing from one operand is treated as 0
# before the addition (fill_value=0).
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [29]:
# 2x2 frame of integers drawn from [0, 20) with columns A and B.
X = pd.DataFrame(
    rng.randint(low=0, high=20, size=(2, 2)),
    columns=['A', 'B'],
)
X

Unnamed: 0,A,B
0,1,11
1,5,1


In [30]:
# 3x3 frame of integers drawn from [0, 10); note the column order B, A, C
# differs from X's and there is an extra column C and an extra row.
Y = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 3)),
    columns=['B', 'A', 'C'],
)
Y

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [31]:
# Both the row index and the columns are aligned by label, regardless of their
# order in X and Y; positions present in only one frame (row 2, column C) are NaN.
X + Y

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [32]:
# Same aligned addition, but an entry missing from one frame counts as 0.
X.add(Y, fill_value=0)

Unnamed: 0,A,B,C
0,1.0,15.0,9.0
1,13.0,6.0,0.0
2,2.0,9.0,6.0


Alternatively, fill with the mean of all values in ``X`` (computed by first stacking the rows of ``X``):

In [33]:
X

Unnamed: 0,A,B
0,1,11
1,5,1


In [34]:
# stack() moves the column labels into an inner index level, producing one
# long Series that holds every value of X.
X.stack()

0  A     1
   B    11
1  A     5
   B     1
dtype: int32

In [35]:
# Note: DataFrame.mean() averages each column separately and returns a Series
# (one mean per column), not a single mean over all values.
fill = X.mean()
fill

A    3.0
B    6.0
dtype: float64

In [36]:
# Stacking first yields a single Series, so mean() gives one scalar: the mean
# over every value in X.
fill = X.stack().mean()
fill

4.5

In [37]:
# Scalar mean over every value in X.
fill = X.stack().mean()
# An entry missing from one frame is replaced by `fill` before the addition.
X.add(Y, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


## Ufuncs: Operations Between DataFrame and Series

In [52]:
# 1-D NumPy binary operation: addition is element-wise.
arr0 = np.array([1, 5, 7])
arr1 = np.array([11, 15, 17])
arr0 + arr1

array([12, 20, 24])

In [48]:
# 2-D NumPy binary operation: addition is element-wise.
arr2 = np.array([[1, 5, 7], [2, 6, 8]])
arr3 = np.array([[11, 15, 17], [12, 16, 18]])
arr2 + arr3

array([[12, 20, 24],
       [14, 22, 26]])

In [53]:
# Broadcasting: the 1-D array arr1 is added to each row of the 2-D array arr3.
arr1 + arr3

array([[22, 30, 34],
       [23, 31, 35]])

In [50]:
# Small 2-D array used below to show subtraction of one of its rows.
ar = np.array([[1, 2, 3], [10, 20, 30]])
ar

array([[ 1,  2,  3],
       [10, 20, 30]])

In [51]:
ar[0]

array([1, 2, 3])

According to NumPy's broadcasting rules, subtraction between a two-dimensional array and one of its rows is applied row-wise:

In [54]:
# The first row is subtracted from every row of the array.
ar - ar[0]

array([[ 0,  0,  0],
       [ 9, 18, 27]])

In Pandas, the convention similarly operates row-wise by default:

In [55]:
# Re-create the seeded generator so this section is reproducible on its own.
# Note: `A` is rebound here to a 3x4 integer array (it was a Series earlier).
rng = np.random.RandomState(seed=42)
A = rng.randint(0, 10, size=(3, 4))
A

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [56]:
# Wrap the array in a DataFrame with column labels Q, R, S, T.
df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [57]:
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [58]:
# Row 0 (Q=6, R=3, S=7, T=4) is matched to df's columns by label and
# subtracted from every row, like NumPy's row-wise broadcasting.
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [None]:
# Method form of `df - df.iloc[0]`: axis=1 matches the Series' index against
# df's column labels. (This cell has no recorded output in the saved notebook.)
df.subtract(df.iloc[0], axis=1)

To operate column-wise instead, use the object methods (e.g. ``subtract``) and specify the ``axis`` keyword:

In [64]:
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [62]:
df['R']

0    3
1    9
2    4
Name: R, dtype: int32

In [65]:
# The index labels of df['R'] (0, 1, 2) have nothing in common with df's
# column labels (Q, R, S, T), so the columns are expanded to the union of both
# and every entry is NaN.
df-df['R']

Unnamed: 0,Q,R,S,T,0,1,2
0,,,,,,,
1,,,,,,,
2,,,,,,,


In [66]:
# axis=0 aligns the Series with df's row index, so column R is subtracted
# from every column, row by row.
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


In [68]:
# axis='index' is the string alias for axis=0 and gives the same result.
df.subtract(df['R'], axis='index')

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


In [67]:
# DataFrame-Series arithmetic defaults to axis=1 (align the Series' index with
# the DataFrame's columns), so this reproduces the all-NaN result of df - df['R'].
df.subtract(df['R'], axis=1)
# df.subtract(df['R']) gives the same result, because axis=1 is the default.

Unnamed: 0,Q,R,S,T,0,1,2
0,,,,,,,
1,,,,,,,
2,,,,,,,


In [69]:
# axis='columns' is the string alias for axis=1 and gives the same result.
df.subtract(df['R'], axis='columns')

Unnamed: 0,Q,R,S,T,0,1,2
0,,,,,,,
1,,,,,,,
2,,,,,,,
