In [2]:
import numpy as np
import pandas as pd

In [6]:
# Build a 5x4 frame of standard-normal samples and summarise each column.
# NOTE(review): no random seed is set, so these values -- and the outputs of
# the cells that threshold them -- differ on every fresh run.
data = pd.DataFrame(np.random.standard_normal(size=(5, 4)))

data.describe()

Unnamed: 0,0,1,2,3
count,5.0,5.0,5.0,5.0
mean,-0.088309,0.982055,0.202703,0.536388
std,0.540356,1.637978,1.079232,1.166867
min,-0.683635,0.043921,-1.027131,-0.813697
25%,-0.363754,0.188702,-0.687264,-0.411622
50%,-0.159898,0.321759,0.208985,0.529626
75%,0.003064,0.456599,1.01358,1.503559
max,0.76268,3.899295,1.505343,1.874072


In [12]:
# Element-wise mask: True wherever the absolute value exceeds 3.
data.abs() > 3

Unnamed: 0,0,1,2,3
0,False,True,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [14]:
# Reduce the mask down the rows (axis=0, the default): one flag per column,
# True when that column holds at least one |value| > 3.
(data.abs() > 3).any(axis=0)

0    False
1     True
2    False
3    False
dtype: bool

In [15]:
# Reduce the mask across the columns: one flag per row, True when that row
# holds at least one |value| > 3.
# The axis is passed by keyword; the positional form `.any(1)` is rejected by
# pandas 2.x, where every argument of DataFrame.any is keyword-only.
(np.abs(data) > 3).any(axis=1)

0     True
1    False
2    False
3    False
4    False
dtype: bool

### ndarray와 DataFrame에서 불리언 인덱싱의 차이

ndarray: 조건에 해당하는 원소만 포함하는 1차원 배열을 반환한다.

In [24]:
# 3x4 grid holding 0..11.
arr2d = np.arange(12).reshape((3, 4))
# Boolean-mask selection on an ndarray returns the matching elements
# flattened into a 1-D array.
arr2d[(arr2d > 5)]

array([ 6,  7,  8,  9, 10, 11])

DataFrame: 원본과 같은 모양을 유지하며, 조건을 만족하지 않는 위치는 NaN으로 채운 DataFrame을 반환한다.

In [53]:
# Same 3x4 grid as a DataFrame.
df = pd.DataFrame(np.arange(12).reshape((3, 4)))
# Boolean-frame selection keeps the original shape; cells that fail the
# condition become NaN (spelled here with the equivalent .where call).
df.where(df > 5)

Unnamed: 0,0,1,2,3
0,,,,
1,,,6.0,7.0
2,8.0,9.0,10.0,11.0


In [71]:
# Pitfall demo: assigning a Series through a boolean-frame mask raises
# "ValueError: Must specify axis=0 or 1" (the message suggests pandas needs
# an alignment axis for the Series, which this syntax cannot supply).
# The error is the point of the cell, so it is caught and printed; an
# uncaught exception would stop Restart & Run All at this cell.
# The former trailing `df` display was never reached and is dropped.
try:
    df[df > 5] = pd.Series([10, 20, 30, 40])
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Must specify axis=0 or 1

In [74]:
# Indexing with a list ([0]) instead of a bare integer keeps the row axis:
# the result is still 2-D (a single row), not a 1-D row vector.
arr2d[[0]]

array([[0, 1, 2, 3]])

In [81]:
# Name -> e-mail address; 'Wes' has no address, so his entry is NaN.
# Built in one step so `data` is only ever bound to the Series.
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [82]:
# Pitfall demo: the missing entry is NaN, which is a float, so the
# membership test `'gmail' in x` raises TypeError for it.
# The error is the point of the cell, so it is caught and printed; an
# uncaught exception would stop Restart & Run All at this cell.
try:
    data.map(lambda x: 'gmail' in x)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: argument of type 'float' is not iterable

In [88]:
# A Series with a plain (single-level) index labelled 'a'..'d'.
se = pd.Series(np.arange(4), index=list('abcd'))
# Pitfall demo: unstack() needs a MultiIndex, so on a flat index it raises
# ValueError. The error is the point of the cell, so it is caught and
# printed; an uncaught exception would stop Restart & Run All here.
try:
    se.unstack()
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: index must be a MultiIndex to unstack, <class 'pandas.core.indexes.base.Index'> was passed

### 8.1.3 DataFrame의 칼럼 사용하기

In [14]:
# Example frame: 'a' counts up 0..6, 'b' counts down 7..1, and 'c'/'d' act
# as an outer group label with a counter that restarts inside each group.
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [15]:
# keys as a single column label: column 'c' becomes the row index.
frame.set_index(keys='c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,7,0
one,1,6,1
one,2,5,2
two,3,4,0
two,4,3,1
two,5,2,2
two,6,1,3


In [16]:
# keys as a list of column labels: 'd' and 'c' become a two-level row index.
frame.set_index(keys=['d', 'c'])

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
d,c,Unnamed: 2_level_1,Unnamed: 3_level_1
0,one,0,7
1,one,1,6
2,one,2,5
0,two,3,4
1,two,4,3
2,two,5,2
3,two,6,1


In [19]:
# keys may mix column labels with Index objects: level 'c' is taken from the
# frame, level 'e' is supplied directly as a new 0..6 Index.
frame.set_index(
    keys=['c', pd.Index(range(7), name='e')],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,d
c,e,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,0,0,7,0
one,1,1,6,1
one,2,2,5,2
two,3,3,4,0
two,4,4,3,1
two,5,5,2,2
two,6,6,1,3


### 8.2.1 데이터베이스 스타일로 DataFrame 합치기

In [8]:
# Two small frames that share a 'key' column.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
# A single display call renders both frames, in order.
display(df1, df2)

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [29]:
# Inner join on the shared 'key' column (the defaults, spelled out).
# indicator=True appends a '_merge' column describing where each row matched.
pd.merge(left=df1, right=df2, how='inner', on='key', indicator=True)

Unnamed: 0,key,data1,data2,_merge
0,b,0,1,both
1,b,1,1,both
2,b,6,1,both
3,a,2,0,both
4,a,4,0,both
5,a,5,0,both


### 8.3.1 계층적 색인으로 재형성하기

In [38]:
# 2x3 frame whose row axis is named 'state' and column axis 'number';
# the axis names are what stack/unstack refer to later.
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [39]:
# Pivot the column axis ('number', the only column level) into the rows,
# producing a Series indexed by (state, number).
result = data.stack(level='number')
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [44]:
# Pair the stacked Series with a copy shifted by 5, under a column axis
# named 'side'.
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
# Pivot the 'state' row level into the columns, giving (side, state) columns.
# Only a cell's last expression is displayed, so the former bare `df` line in
# the middle of this cell showed nothing and has been removed.
df2 = df.unstack('state')
df2

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [49]:
# Chained (double) indexing on the columns: first select the 'left' group,
# then the 'Ohio' column inside it.
df2['left']['Ohio']

number
one      0
two      1
three    2
Name: Ohio, dtype: int32

In [51]:
df2['left', 'Ohio']  # a comma-separated key can replace the chained (double) indexing

number
one      0
two      1
three    2
Name: (left, Ohio), dtype: int32

In [54]:
# .loc takes the row label first, then the column key: row 'one', column
# group 'left'. The result is a Series indexed by state.
df2.loc['one', 'left']

state
Ohio        0
Colorado    3
Name: one, dtype: int32

In [None]:
# Expected to raise (per the original note; this cell was left unexecuted):
# plain [] looks the key up in the columns, so starting from a row label
# requires .loc.
# NOTE(review): as written, the uncaught exception stops Restart & Run All here.
df2['one', 'left']

In [58]:
df2.loc['one', 'left']['Ohio']  # chained (double) indexing

0

In [None]:
# Expected to raise (per the original note; this cell was left unexecuted):
# the levels of one axis cannot simply be appended as extra comma-separated
# keys to .loc.
# NOTE(review): as written, the uncaught exception stops Restart & Run All here.
df2.loc['one', 'left', 'Ohio']

In [64]:
df2.loc['one', ('left', 'Ohio')]  # pass the hierarchical key for a single axis as a tuple

0

In [70]:
# NOTE(review): the original note on this cell read "error: passing a list
# raises an error (a list is for selecting several keys on the same level)",
# but the code passes the same tuple as the previous cell and returns 0.
# Presumably the list form, df2.loc['one', ['left', 'Ohio']], was meant -- confirm.
df2.loc['one', ('left', 'Ohio')]

0

> It is important to note that tuples and lists are not treated identically in pandas when it comes to indexing. Whereas a tuple is interpreted as one multi-level key, a list is used to specify several keys. Or in other words, tuples go horizontally (traversing levels), lists go vertically (scanning levels).

In [71]:
# Show df2 again as the starting point for the stack/unstack calls below:
# rows are 'number', columns are the two levels (side, state).
df2

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [72]:
# df2's row index has a single level, so unstack() moves it into the columns
# and leaves nothing on the rows: the result is a Series indexed by
# (side, state, number).
df2.unstack()

side   state     number
left   Ohio      one        0
                 two        1
                 three      2
       Colorado  one        3
                 two        4
                 three      5
right  Ohio      one        5
                 two        6
                 three      7
       Colorado  one        8
                 two        9
                 three     10
dtype: int32

In [73]:
# stack() moves the innermost column level ('state') into the rows: the
# result is indexed by (number, state) with 'side' left as the columns.
df2.stack()

Unnamed: 0_level_0,side,left,right
number,state,Unnamed: 2_level_1,Unnamed: 3_level_1
one,Ohio,0,5
one,Colorado,3,8
two,Ohio,1,6
two,Colorado,4,9
three,Ohio,2,7
three,Colorado,5,10


### 8.3.2 Long 형식에서 Wide 형식으로 피벗하기

In [104]:
# Load the macro data set and preview its first rows.
# NOTE(review): the path is relative to the notebook's working directory.
data = pd.read_csv('pydata-book/examples/macrodata.csv')
display(data.head())

# Column subset to keep, with the column axis named 'item'.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
# Build the period index from the year/quarter columns before they are
# dropped by the reindex below.
periods = pd.PeriodIndex(year=data['year'], quarter=data['quarter'],
                         name='date')

data = data.reindex(columns=columns)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [105]:
# Convert every period to the timestamp at the very end of its last day,
# use that as the row index of `data`, and show the resulting index.
end_of_period = periods.to_timestamp('D', 'end')
data.index = end_of_period

end_of_period

DatetimeIndex(['1959-03-31 23:59:59.999999999',
               '1959-06-30 23:59:59.999999999',
               '1959-09-30 23:59:59.999999999',
               '1959-12-31 23:59:59.999999999',
               '1960-03-31 23:59:59.999999999',
               '1960-06-30 23:59:59.999999999',
               '1960-09-30 23:59:59.999999999',
               '1960-12-31 23:59:59.999999999',
               '1961-03-31 23:59:59.999999999',
               '1961-06-30 23:59:59.999999999',
               ...
               '2007-06-30 23:59:59.999999999',
               '2007-09-30 23:59:59.999999999',
               '2007-12-31 23:59:59.999999999',
               '2008-03-31 23:59:59.999999999',
               '2008-06-30 23:59:59.999999999',
               '2008-09-30 23:59:59.999999999',
               '2008-12-31 23:59:59.999999999',
               '2009-03-31 23:59:59.999999999',
               '2009-06-30 23:59:59.999999999',
               '2009-09-30 23:59:59.999999999'],
              dtype=

In [107]:
display(data)

# Wide -> long: stack the 'item' columns into the rows, turn the
# (date, item) index back into ordinary columns, and name the values column.
ldata = (
    data.stack()
        .reset_index()
        .rename(columns={0: 'value'})
)
ldata

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,2710.349,0.00,5.8
1959-06-30 23:59:59.999999999,2778.801,2.34,5.1
1959-09-30 23:59:59.999999999,2775.488,2.74,5.3
1959-12-31 23:59:59.999999999,2785.204,0.27,5.6
1960-03-31 23:59:59.999999999,2847.699,2.31,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,13324.600,-3.16,6.0
2008-12-31 23:59:59.999999999,13141.920,-8.79,6.9
2009-03-31 23:59:59.999999999,12925.410,0.94,8.1
2009-06-30 23:59:59.999999999,12901.504,3.37,9.2


Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560
