In [31]:
import pandas as pd
import numpy as np

## 5.1 pandas 소개

### 5.1.1 Series
 - 1차원 배열의 구조(어떤 numpy 자료형도 담을 수 있음)

In [2]:
obj = pd.Series([4,7,-5,3])

In [3]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

배열과 색인 객체는 values와 index로 얻을 수 있음

In [4]:
obj.values

array([ 4,  7, -5,  3])

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

색인 지정한 Series

In [7]:
obj2 = pd.Series([4,7,-5,3], index=['d','b','c','d'])
obj2

d    4
b    7
c   -5
d    3
dtype: int64

In [8]:
obj2['d']

d    4
d    3
dtype: int64

In [9]:
obj2[obj2 >0]

d    4
b    7
d    3
dtype: int64

python의 사전형과 비슷

In [11]:
'd' in obj2

True

사전형을 Series로 저장(사전의 key값이 index값으로 들어감)

In [13]:
sdata = {'Ohio' : 35000, 'Texas': 71000, 'Oregon' : 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [14]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

California에 대한 값을 찾을수 없기 때문에 NA값으로 취급됨.

In [15]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

name 속성

In [17]:
obj4.name = 'population'
obj4.index.name = 'state'

obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [18]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### 5.1.2 DataFrame
 - 색인의 모양이 같은 Series객체를 담고 있는 사전

In [19]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year' : [2000, 2001, 2002, 2001, 2002], 
       'pop' : [1.5, 1.7, 3.6, 2.4, 2.9]}

frame = pd.DataFrame(data)

In [20]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [21]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [23]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index=['one', 'two','three', 'four', 'five'])
frame2 # 없는 값은 NaN 처리

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [24]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [25]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [26]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [28]:
frame2.loc['three'] # ix는 deprecated

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [29]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [32]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


리스트나 배열을 컬럼에 대입할 때는 DataFrame의 크기와 대입하려는 값의 길이가 같아야 함. Series를 대입하면 DataFrame의 색인에 따라 값이 대입되며 없는 색인에는 값이 대입되지 않음.

In [34]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


중첩된 사전을 통한 DataFrame생성(바깥에 있는 사전의 키 값이 컬럼이 되고, 안에있는 키 값이 로우가 됨)

In [36]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [38]:
frame3 = pd.DataFrame(pop)

In [41]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [42]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [43]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [44]:
pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [45]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [46]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

### 5.1.3 색인 객체
 - 표 형식의 데이터에서 각 로우의 컬럼에 대한 이름과 다른 메타데이터를 저장하는 객체

In [47]:
obj = pd.Series(range(3), index=['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [49]:
index[1] = 'd' # 변경 불가(TypeError: Index does not support mutable operations)

TypeError: Index does not support mutable operations

In [52]:
index = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=index)
obj2.index is index

True

In [57]:
from collections import OrderedDict # 입력한 순서대로 dataframe 만들기 위함

In [56]:
# index 객체 설명
pd.DataFrame(OrderedDict({'클래스': ['Index', 'Int64Index', 'MultiIndex', 'DatetimeIndex', 'PeriodIndex'], '설명': ['가장 일반적인 Index객체이며, 파이썬 객체의 NumPy 배열 형식으로 축이름을 표현', '정수 값을 위한 특수한 Index', '단일 축에 여러 단계의 색인을 표현하는 계층적 색인 객체, 튜플의 배열과 유사', '나노초 타임스탬프를 지정한다(Numpy의 datetime64 dtype으로 표현)', '기간 데이터를 위한 특수 Index'] }))

Unnamed: 0,클래스,설명
0,Index,"가장 일반적인 Index객체이며, 파이썬 객체의 NumPy 배열 형식으로 축이름을 표현"
1,Int64Index,정수 값을 위한 특수한 Index
2,MultiIndex,"단일 축에 여러 단계의 색인을 표현하는 계층적 색인 객체, 튜플의 배열과 유사"
3,DatetimeIndex,나노초 타임스탬프를 지정한다(Numpy의 datetime64 dtype으로 표현)
4,PeriodIndex,기간 데이터를 위한 특수 Index


In [58]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [59]:
'Ohio' in frame3.columns

True

In [60]:
2003 in frame3.index

False

## 5.2 핵심 기능

### 5.2.1 재색인

In [61]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [62]:
obj2 = obj.reindex(['a','b','c','d','e']) # 색인 재배열(없는 색인 값은 NaN)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [63]:
obj.reindex(['a','b','c','d','e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [66]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0,2,4])
obj3 = obj3.reindex(range(6), method='ffill') # 없는 값의 경우 앞의 값으로 채움(1,3,5 값 확인)
obj3

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

reindex는 색인과 컬럼 둘다 변경 가능

In [67]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [68]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [69]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states) # columns예약어 사용
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


pandas reindex method 인자 참고 : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html

### 5.2.2 하나의 로우 또는 컬럼 삭제

In [70]:
obj = pd.Series(np.arange(5.), index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [73]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])

In [75]:
data = data.drop(['Colorado', 'Ohio'])
data

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [76]:
data.drop('two', axis=1) # 1: column 방향 , 0: row 방향

Unnamed: 0,one,three,four
Utah,8,10,11
New York,12,14,15


In [77]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Utah,8,10
New York,12,14


### 5.2.3 색인하기, 선택하기, 거르기

In [78]:
obj = pd.Series(np.arange(4), index=['a','b','c','d'])
obj

a    0
b    1
c    2
d    3
dtype: int64

In [79]:
obj['b']

1

In [81]:
obj['b':'c'] # 시작점과 끝점을 포함하여 slice(일반 파이썬과 다름)

b    1
c    2
dtype: int64

In [83]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [84]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

row 슬라이스

In [85]:
data[:2] 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [86]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [87]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


ix, loc 활용

In [95]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [109]:
data.ix[['Colorado', 'Utah'], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [97]:
data.ix[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [100]:
data.loc[:'Utah', 'two'] # data.ix[:'Utah', 'two]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [108]:
data.ix[data.three > 5, :3] # three가 5보다 큰 값을 3번째 열까지 출력

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


###### iloc vs loc vs ix 차이
- .iloc : integer positon를 통해 값을 찾을 수 있다. label로는 찾을 수 없다
- .loc : label 을 통해 값을 찾을 수 있다. integer position로는 찾을 수 없다.
- .ix : integer position과 label모두 사용 할 수 있다. 만약 label이 숫자라면 label-based index만 된다.(deprecated 예정 ver. 0.20 above)

### 5.2.4 산술연산 데이터 정렬

In [110]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a','c','d','e'])
s2 = pd.Series([-2.3, 3.6, -1.5, 4, 3.1], index=['a','c','e','f','g'])

In [111]:
s1+s2

a    5.0
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [112]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [113]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


산술연산 메서드에 채워 넣을 값 정하기

In [114]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)), columns=list('abcde'))

In [115]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [116]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


DataFrame과 Series 간의 연산

In [117]:
arr = np.arange(12.).reshape((3,4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [118]:
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [121]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]

In [122]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [123]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [125]:
frame - series # Series의 색인(b,d,e)를 컬럼에 맞추고 아래로우로 전파

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [126]:
series2 = pd.Series(range(3), index=['b','e','f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


각 로우에 대해 연산 : 산술연산 메서드

In [127]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [130]:
frame.sub(series3, axis=0) # pandas.Index(row) 방향

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### 5.2.5 함수 적용과 매핑

In [132]:
frame = pd.DataFrame(np.random.randn(4,3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [133]:
frame

Unnamed: 0,b,d,e
Utah,-1.976611,-0.743654,-0.983979
Ohio,1.077099,0.447901,1.038791
Texas,0.335125,0.197798,0.34637
Oregon,0.453287,-0.418911,0.182783


1차원 배열 함수 적용

In [137]:
f = lambda x: x.max() - x.min()

frame.apply(f, axis=0)

b    3.053710
d    1.191555
e    2.022770
dtype: float64

In [140]:
def f(x):
    return pd.Series([x.min() , x.max()], index=['min', 'max'])

In [141]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.976611,-0.743654,-0.983979
max,1.077099,0.447901,1.038791


In [142]:
format = lambda x: '%.2f' % x

frame.applymap(format) # Series가 각 원소에 적용할 함수 지정을 위한 map메서드를 가지고있음

Unnamed: 0,b,d,e
Utah,-1.98,-0.74,-0.98
Ohio,1.08,0.45,1.04
Texas,0.34,0.2,0.35
Oregon,0.45,-0.42,0.18


In [143]:
frame['e'].map(format)

Utah      -0.98
Ohio       1.04
Texas      0.35
Oregon     0.18
Name: e, dtype: object

### 5.2.6 정렬과 순위

In [144]:
obj = pd.Series(range(4), index=['d','a','b','c'])
obj.sort_index() # index 정렬

a    1
b    2
c    3
d    0
dtype: int64

In [145]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=['three', 'one'], columns=['d','a','b','c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [148]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [150]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [152]:
# 하나 이상의 컬럼 정렬
frame = pd.DataFrame({'b': [4,7,-3,2], 'a': [0,1,0,1]})
frame.sort_values(by=['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [155]:
# rank : 동점항목에 대해서 평균 순위를 매김(Series, DataFrame 둘다)
obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [156]:
obj.rank(method='first') # 데이터 상에 나타나는 순위로 표시

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [157]:
obj.rank(ascending=False, method='max') # 내림차순

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [159]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0,1,0,1], 'c': [-2,5,8,-2.5]})
frame.rank(axis=0)

Unnamed: 0,a,b,c
0,1.5,3.0,2.0
1,3.5,4.0,3.0
2,1.5,1.0,4.0
3,3.5,2.0,1.0


In [161]:
pd.DataFrame(OrderedDict({'메서드': ['average', 'min', 'max', 'first'], 
                          '설명': ['기본값: 같은 값을 가지는 항목의 평균 값을 순위로 삼음', 
                                 '같은 값을 가지는 그룹을 낮은 순위로 매김',
                                 '같은 값을 가지는 그룹을 높은 순으로 매김',
                                 '데이터 내에서 위치에 따라 순위를 매김']}))

Unnamed: 0,메서드,설명
0,average,기본값: 같은 값을 가지는 항목의 평균 값을 순위로 삼음
1,min,같은 값을 가지는 그룹을 낮은 순위로 매김
2,max,같은 값을 가지는 그룹을 높은 순으로 매김
3,first,데이터 내에서 위치에 따라 순위를 매김


### 5.2.7 중복 색인

In [162]:
obj = pd.Series(range(5), index=['a','b','c','d','e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [163]:
obj.index.is_unique

True

In [164]:
df = pd.DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,-0.250612,1.373209,1.434974
a,-1.269067,1.364246,-0.160144
b,-0.213553,0.67791,0.788171
b,0.413303,-0.178418,0.401372


In [166]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.213553,0.67791,0.788171
b,0.413303,-0.178418,0.401372


## 5.3 기술통계 계산과 요약

In [167]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a','b','c','d'], columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [172]:
df.sum(axis=0) # 열당 행 합계

one    9.25
two   -5.80
dtype: float64

In [171]:
df.mean(axis=1) # 행당 열평균

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [175]:
df.idxmax() # 열당 최대값을 가진 row index

one    b
two    d
dtype: object

In [177]:
df.cumsum() # 열에 해당하는 row의 누산값

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [183]:
df.cumsum(axis=1) # row에 해당하는 colum의 누산값

Unnamed: 0,one,two
a,1.4,
b,7.1,2.6
c,,
d,0.75,-0.55


참고 : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html#pandas.DataFrame.describe

### 5.3.1 상관관계와 공분산

In [200]:
# import pandas.io.data as web
# from pandas_datareader import data, wb
import pandas_datareader as pdr
import fix_yahoo_finance as yf
import datetime
yf.pdr_override()

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = pdr.get_data_yahoo(ticker,start=datetime.datetime(2006, 10, 1),end=datetime.datetime(2012, 1, 1))
    
# price = pd.DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
# volumn = pd.DataFrame({tic: data['Volumn'] for tic, data in all_data.items()})

ImmediateDeprecationError: 
Yahoo Actions has been immediately deprecated due to large breaks in the API without the
introduction of a stable replacement. Pull Requests to re-enable these data
connectors are welcome.

See https://github.com/pydata/pandas-datareader/issues


### 5.3.2 유일 값, 값 세기, 멤버십

In [201]:
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])

In [202]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [205]:
obj.value_counts() # 도수분포표 계산

a    3
c    3
b    2
d    1
dtype: int64

In [206]:
pd.value_counts(obj.values, sort=False)

c    3
b    2
a    3
d    1
dtype: int64

참고 : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.unique.html

## 5.4 누락된 데이터 처리

In [207]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [208]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 5.4.1 누락된 데이터 골라내기

In [210]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [211]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [212]:
data = pd.DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
cleaned = data.dropna()


In [213]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [214]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [215]:
data.dropna(how='all') # 모든 값이 NA인 로우만 제외

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [218]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [219]:
df = pd.DataFrame(np.random.randn(7,3))
df.ix[:4, 1] = NA; df.loc[:2, 2] = NA
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,0,1,2
0,-1.035812,,
1,-0.302342,,
2,1.407668,,
3,-0.294273,,-0.706658
4,0.864715,,0.118654
5,-0.842773,0.465103,1.130324
6,0.394661,-0.479773,0.927573


In [220]:
df.dropna(thresh=3) # 몇개 이상의 값이 들어있는 row만

Unnamed: 0,0,1,2
5,-0.842773,0.465103,1.130324
6,0.394661,-0.479773,0.927573


### 5.4.2 누락된 값 채우기

In [221]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.035812,0.0,0.0
1,-0.302342,0.0,0.0
2,1.407668,0.0,0.0
3,-0.294273,0.0,-0.706658
4,0.864715,0.0,0.118654
5,-0.842773,0.465103,1.130324
6,0.394661,-0.479773,0.927573


In [222]:
df.fillna({1: 0.5, 3: -1}) # 사전형태로 각 컬럼마다 다른 값을 채울 수 있음

Unnamed: 0,0,1,2
0,-1.035812,0.5,
1,-0.302342,0.5,
2,1.407668,0.5,
3,-0.294273,0.5,-0.706658
4,0.864715,0.5,0.118654
5,-0.842773,0.465103,1.130324
6,0.394661,-0.479773,0.927573


In [223]:
_ = df.fillna(0, inplace=True)

In [224]:
df

Unnamed: 0,0,1,2
0,-1.035812,0.0,0.0
1,-0.302342,0.0,0.0
2,1.407668,0.0,0.0
3,-0.294273,0.0,-0.706658
4,0.864715,0.0,0.118654
5,-0.842773,0.465103,1.130324
6,0.394661,-0.479773,0.927573


In [225]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean()) # NA에 평균값을 넣기

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 5.5 계층적 색인

In [226]:
data = pd.Series(np.random.randn(10), index=[['a','a','a','b','b','b','c','c','d','d'], [1,2,3,1,2,3,1,2,2,3]])
data

a  1   -0.627816
   2   -0.602053
   3   -0.823667
b  1   -1.055761
   2   -0.204971
   3    1.777031
c  1   -0.044069
   2   -0.348337
d  2   -0.516116
   3    1.689519
dtype: float64

In [227]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [229]:
data['b':'c'] # 부분적 색인 접근

b  1   -1.055761
   2   -0.204971
   3    1.777031
c  1   -0.044069
   2   -0.348337
dtype: float64

In [232]:
data[:,2] # 하위 계층 선택(1레벨(a,b,c,d) 중 하위레벨이 2인 값들만)

a   -0.602053
b   -0.204971
c   -0.348337
d   -0.516116
dtype: float64

In [234]:
data.unstack().stack()

a  1   -0.627816
   2   -0.602053
   3   -0.823667
b  1   -1.055761
   2   -0.204971
   3    1.777031
c  1   -0.044069
   2   -0.348337
d  2   -0.516116
   3    1.689519
dtype: float64

In [235]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'], [1,2,1,2]], columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [239]:
frame.index.names = ['key1', 'key2']

In [241]:
frame.columns.names = ['state', 'color']

In [242]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [243]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### 5.5.1 계층 순서 바꾸고 정렬

In [245]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [248]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [250]:
frame.sort_index(1) # column sorting

Unnamed: 0_level_0,state,Colorado,Ohio,Ohio
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


### 5.5.2 단계별 요약 통계

In [254]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [256]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 5.5.3 DataFrame의 칼럼 사용하기

In [257]:
frame = pd.DataFrame({'a' : range(7), 'b': range(7,0,-1), 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'], 'd': [0,1,2,0,1,2,3]})

In [258]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [259]:
frame2 = frame.set_index(['c','d'])

In [260]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [263]:
frame2.reset_index() # 단계 색인 제거

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## 5.6 Pandas와 관련된 기타주제

### 5.6.1 정수 색인

In [271]:
# 위치 기반의 색인이 필요한 경우
ser3 = pd.Series(range(3), index=[-5, 1,3])
ser3

-5    0
 1    1
 3    2
dtype: int64

In [272]:
ser3.iloc(2) # deprecated : iget_value

<pandas.core.indexing._iLocIndexer at 0x1125bd390>

In [273]:
frame = pd.DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [275]:
frame.iloc(0) # deprecated : irow

<pandas.core.indexing._iLocIndexer at 0x1121a3978>