In [186]:
import numpy as np
import pandas as pd
import seaborn as sns
rng = np.random.RandomState(42)
# Series에서 name 의 역할
# DataFrame에서 columns의 역할
# DataFrame 에서 columns 대신 name 사용이 가능한가?

In [2]:
class display(object):
    """Show several expressions side by side in a notebook.

    Each positional argument is a string containing a Python expression, for
    example ``'df1'`` or ``'pd.concat([df1, df2])'``.  The expressions are
    stored as given and evaluated only when the object is rendered.

    Evaluation happens against the global namespace of the module that
    defines this class, and against nothing else.  Names local to the
    rendering code therefore cannot shadow a user variable of the same name.

    Note: the expressions are passed to ``eval``.  Only pass trusted,
    hard-coded strings, never text that comes from outside the notebook.
    """
    # HTML fragment for one expression: {0} is the expression text and
    # {1} is the HTML representation of its value.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Store the expression strings; nothing is evaluated yet."""
        self.args = args

    def _evaluate(self, expr):
        """Return the value of ``expr`` looked up in module globals only."""
        # A bare eval(expr) would search the caller's local names first, so an
        # expression such as 'a' could resolve to a loop variable instead of
        # the user's variable.  Passing globals() explicitly avoids that.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one HTML fragment per expression, joined by newlines.

        Every evaluated value must provide its own ``_repr_html_`` method.
        """
        return '\n'.join(self.template.format(expr, self._evaluate(expr)._repr_html_())
                         for expr in self.args)

    def __repr__(self):
        """Return the plain-text fallback: each expression followed by its repr."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)

In [3]:
# 함수를 그룹단위로 적용시키기 -> apply() method 사용

# Pandas

## Object 종류 ( Series, DataFrame, Index )

### Series
* 1차원 array; list나 array를 통해 생성 가능

#### 선언

In [4]:
no_use_index_data = pd.Series([0.25, 0.5, 0.75, 1.0])
print("< no_use_index_data >\n",no_use_index_data)
print()
use_index_data = pd.Series([1.25, 1.5, 1.75, 2.0],index= ['a', 'b', 'c', 'd'])
print("< use_index_data >\n", use_index_data)

< no_use_index_data >
 0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

< use_index_data >
 a    1.25
b    1.50
c    1.75
d    2.00
dtype: float64


value 값은 반복 채움 형태로 사용 가능

In [5]:
pd.Series(0, index=[100,200,300,400,500])

100    0
200    0
300    0
400    0
500    0
dtype: int64

#### values
* Numpy array 이며, Series 생성 시 작성한 array

In [6]:
no_use_index_data.values

array([0.25, 0.5 , 0.75, 1.  ])

#### index
* 각 value에 대응하는 label(key 값)이며, 다른 값으로 지정할 수 있다. label이나 slicing으로 value에 접근할 수 있다.

In [7]:
print("no_use_index_data.index : ",no_use_index_data.index)
print()
print("data[0] : ",no_use_index_data[0])
print()
print("< data[1:3] >\n",no_use_index_data[1:3])
print()
print("use_index_data['b'] : ",use_index_data['b'])
print()
print("<use_index_data['a':'c']>\n",use_index_data['a':'c'])

no_use_index_data.index :  RangeIndex(start=0, stop=4, step=1)

data[0] :  0.25

< data[1:3] >
 1    0.50
2    0.75
dtype: float64

use_index_data['b'] :  1.5

<use_index_data['a':'c']>
 a    1.25
b    1.50
c    1.75
dtype: float64


#### Value 값이 Dictionary 인 경우

In [8]:
use_dic = pd.Series({2:'a', 1:'b', 3:'c'})
print(use_dic)
print()
explicit_index = pd.Series({2:'a', 1:'b', 3:'c'}, index=[3,2])
print("Explicit Index )\n", explicit_index)

2    a
1    b
3    c
dtype: object

Explicit Index )
 3    c
2    a
dtype: object


### DataFrame
* index를 공유하는 Series의 모임. Numpy의 2차원 배열과 유사

#### 선언
* dictionary, Series, Numpy 등을 이용하여 DataFrame 생성

In [9]:
area_dict = {'California' : 423967, 'Texas' : 695662, 'New York' : 141297, 'Florida' : 170312, 'Illinois' : 149995}
area = pd.Series(area_dict)
population_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135}
population = pd.Series(population_dict)

In [10]:
states = pd.DataFrame({'population':population, 'area':area}) # 바로 위에서 선언한 area와 population은 index값이 같다.
states # 공유하는 index가 같기 때문에 아래와 같은 결과를 얻을 수 있음
# 만약 index가 모두 일치하는 것이 아니라면, NaN 값이 들어감. 이후 자세히 다룸

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [11]:
# 참고 ) print를 이용하면 DataFrame 형태가 위의 코드처럼 보이지는 않음

In [12]:
Use_one_Series = pd.DataFrame(population, columns=['population'])
print("Use one Series )\n")
Use_one_Series

Use one Series )



Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [13]:
dict_data = [{'a':i, 'b':2*i} for i in range(3)]
Use_dict_data = pd.DataFrame(dict_data)
print("Use dict Data )\n")
Use_dict_data

Use dict Data )



Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [14]:
Use_dict_series = pd.DataFrame({'population':population, 'area':area})
print("Use dictionary of Series )\n")
Use_dict_series

Use dictionary of Series )



Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [15]:
# Build a DataFrame from a 2-D NumPy array with explicit row and column labels.
# The values come from a locally seeded generator so the table is identical on
# every run; the unseeded global np.random state gave different numbers each
# time.  A dedicated generator is used instead of the shared `rng` so that the
# random draws made by later cells are not shifted.
Use_numpy = pd.DataFrame(np.random.RandomState(0).rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])
print("Use Two-Dimensional Numpy array )\n")
Use_numpy

Use Two-Dimensional Numpy array )



Unnamed: 0,foo,bar
a,0.779697,0.342999
b,0.088514,0.771575
c,0.902987,0.328147


#### values, index, columns
* columns = DataFrame 상단에 있는 열 이름(column label)들을 출력

In [16]:
print("Index : ",states.index)
print()
print("Values )\n",states.values)
print()
print("Colums : ",states.columns)
print()
print("states['area'] )\n",states['area']) # area의 값만 출력 가능

Index :  Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

Values )
 [[38332521   423967]
 [26448193   695662]
 [19651127   141297]
 [19552860   170312]
 [12882135   149995]]

Colums :  Index(['population', 'area'], dtype='object')

states['area'] )
 California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


### Index

*   수정 불가
*   집합 연산 가능(하지만, 중복값 기입도 가능)



In [17]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [18]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


## Data에 접근하기

### In Series

1.   Indexing 이용
2.   keys() 와 index method 이용
3.   items() 이용



In [19]:
se_data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a','b','c','d'])
se_data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

#### Data에서 특정 정보 보기

In [20]:
print("se_data['b'] : ", se_data['b']) # look up a single value by its index label
print("Boolean 'a' in se_data : ", 'a' in se_data) # membership test against the index labels
print("se_data.keys() : ", se_data.keys()) # keys() returns the index labels (same labels as .index below)
print("se_data.index : ", se_data.index) # the index labels of the Series
print(list(se_data.items())) # (index label, value) pairs collected into a list

se_data['b'] :  0.5
Boolean 'a' in se_data :  True
se_data.keys() :  Index(['a', 'b', 'c', 'd'], dtype='object')
se_data.index :  Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


##### Use loc & iloc -> index

*   loc : 명시적 index 표현임.(= 직접 지정한 index 이름(label)을 사용하겠다)
*   iloc : 암묵적(위치 기반) index 표현임.(= label과 무관하게 0 부터 n-1 까지의 정수 위치를 사용하겠다)



In [21]:
prove_data = pd.Series(['a','b','c'], index=[1,3,5])
prove_data

1    a
3    b
5    c
dtype: object

In [22]:
print("prove_data[1] : ",prove_data[1])
print("prove_data.loc[1] : ",prove_data.loc[1])
print("prove_data.iloc[1] : ",prove_data.iloc[1])

prove_data[1] :  a
prove_data.loc[1] :  a
prove_data.iloc[1] :  b


In [23]:
se_data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [24]:
se_data.loc['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [25]:
se_data[0:2]

a    0.25
b    0.50
dtype: float64

In [26]:
se_data.iloc[0:2]

a    0.25
b    0.50
dtype: float64

##### 조건을 만족하는 값

In [27]:
se_data[(se_data> 0.3) & (se_data<0.8)]

b    0.50
c    0.75
dtype: float64

##### Fancy indexing 활용

In [28]:
se_data[['a', 'd']]

a    0.25
d    1.00
dtype: float64

#### Data 수정하기

In [29]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same Series, so the edits in the next cell would silently
# change se_data as well.
se_data_md = se_data.copy()

In [30]:
se_data_md['e'] = 1.25
se_data_md['a'] = 0
se_data_md

a    0.00
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### In DataFrame

In [31]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


##### Data 정보 보기

In [32]:
states['area'] # = states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [33]:
states.values[0]

array([38332521,   423967])

###### iloc 과 loc 사용

In [34]:
states.iloc[:3, :2]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [35]:
states.loc[:'Illinois', :'population']

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [36]:
states['density'] = states['population'] / states['area']
states.loc[states.density > 100, ['population','density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [37]:
states.iloc[0,2] = 90
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


##### Data 값 추가하기

In [38]:
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


##### DataFrame 의 row <--> column

In [39]:
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


##### 조건 만족 값

In [40]:
states[states.density > 100]

Unnamed: 0,population,area,density
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121


## 연산

1.   범용 함수 사용 ( add, sub, mul, div, floordiv, mod, pow)
2.   Aggregation( sum, mean, min, max, count, first, last, std, var, prod, mad)
3.   eval 과 query



### 1) 범용 함수 사용

아래 표는 Python operator와 해당 Pandas object method 리스트를 보여준다. 

| Python Operator | Pandas Method(s)                      |
|-----------------|---------------------------------------|
| ``+``           | ``add()``                             |
| ``-``           | ``sub()``, ``subtract()``             |
| ``*``           | ``mul()``, ``multiply()``             |
| ``/``           | ``truediv()``, ``div()``, ``divide()``|
| ``//``          | ``floordiv()``                        |
| ``%``           | ``mod()``                             |
| ``**``          | ``pow()``                             |


#### In Series

In [41]:
# 연산할 Series 1쌍 생성
u_area = pd.Series({'Alask' : 1723337, 'Texas' : 695662, 'California' : 423967}, name='u_area')
u_population = pd.Series({'California' : 38332521, 'Texas':26448193, 'New York':19651127}, name='u_population')

In [42]:
# Arithmetic on the pair of Series: labels present in only one of them give NaN
print("u_population / u _area\n")
print(u_population/u_area)
print()
print("u_area.index | u_population.index\n")
# Use the explicit set operation.  The `|` operator between Index objects was
# deprecated as a set operation in pandas 1.x and is an element-wise logical
# OR from pandas 2.0, which fails for string labels.
print(u_area.index.union(u_population.index))

u_population / u _area

Alask               NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

u_area.index | u_population.index

Index(['Alask', 'California', 'New York', 'Texas'], dtype='object')


  


In [43]:
# 연산할 Series 2쌍 생성 및 연산
U_A = pd.Series([2, 4, 6], index=[0,1,2])
U_B = pd.Series([1, 3, 5], index=[1,2,3])
U_A + U_B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

##### 행이나 열의 개수가 다른 배열 연산
* Broadcasting을 사용한다.

In [44]:
# Broadcasting에 따른 계산 결과
B_a = rng.randint(10, size=(3,4))
B_a

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [45]:
B_a - B_a[0]

array([[ 0,  0,  0,  0],
       [ 0,  6, -5,  2],
       [ 1,  1, -4,  3]])

##### 연산 결과 NaN 처리

In [46]:
# 연산 결과 NaN 처리 방법 중 1
U_A.add(U_B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

#### In DataFrame

In [47]:
# 연산할 DataFrame 1 생성
u_dx = pd.DataFrame(rng.randint(0, 20, (2,2)), columns=list('AB'))
u_dx

Unnamed: 0,A,B
0,2,1
1,11,5


In [48]:
# 연산할 DataFrame 2 생성
u_dy = pd.DataFrame(rng.randint(0,10, (3,3)), columns=list('BAC'))
u_dy

Unnamed: 0,B,A,C
0,1,4,0
1,9,5,8
2,0,9,2


In [49]:
# 1, 2 연산
u_dx + u_dy

Unnamed: 0,A,B,C
0,6.0,2.0,
1,16.0,14.0,
2,,,


##### 행 또는 열단위 연산
* axis 활용

In [50]:
d_a = pd.DataFrame(B_a,columns=list('QRST'))
d_a

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [51]:
d_a - d_a.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [52]:
d_a.subtract(d_a.iloc[0], axis=1) # d_a.iloc[0] is a Series labelled by the four columns Q, R, S, T
# axis=1 matches those labels against the column labels, so row 0 is subtracted from every row.
# With axis=0 the labels would be matched against the row index (0, 1, 2) instead; none of them
# line up, so the result would presumably be all NaN rather than a meaningful difference (not run here).
# Rule of thumb: to subtract a row, align along the columns (axis=1); to subtract a column, align along the index (axis=0).

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [53]:
d_a.subtract(d_a['R'], axis=0) 

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


##### 연산 결과 NaN 자리 평균 넣기

In [54]:
# NaN 자리에 평균값 넣기
Dchange_NaN = u_dx.stack().mean()
u_dx.add(u_dy, fill_value=Dchange_NaN)

Unnamed: 0,A,B,C
0,6.0,2.0,4.75
1,16.0,14.0,12.75
2,13.75,4.75,6.75


In [55]:
# Break the previous cell down step by step
print("u_dx")
print(u_dx,"\n") # the original frame
print("u_dx.stack()") 
print(u_dx.stack(),"\n") # stack() moves the column labels into an inner index level, giving a single Series
print("u_dx.mean()")
print(u_dx.mean(),"\n") # mean() on the frame gives one mean per column (A and B), not one per row
print("u_dx.stack().mean()")
print(u_dx.stack().mean()) # after stack() there is a single Series, so mean() is one number: the mean of every value in u_dx

u_dx
    A  B
0   2  1
1  11  5 

u_dx.stack()
0  A     2
   B     1
1  A    11
   B     5
dtype: int64 

u_dx.mean()
A    6.5
B    3.0
dtype: float64 

u_dx.stack().mean()
4.75


### 2) Aggregation

아래 표는 built-in Pandas aggregation 함수를 요약한 것이다. 

| Aggregation              | Description                     |
|--------------------------|---------------------------------|
| ``count()``              | Total number of items           |
| ``first()``, ``last()``  | First and last item             |
| ``mean()``, ``median()`` | Mean and median                 |
| ``min()``, ``max()``     | Minimum and maximum             |
| ``std()``, ``var()``     | Standard deviation and variance |
| ``mad()``                | Mean absolute deviation         |
| ``prod()``               | Product of all items            |
| ``sum()``                | Sum of all items                |


#### Series

In [56]:
simple_agg = pd.Series(rng.rand(5))
simple_agg

0    0.382462
1    0.983231
2    0.466763
3    0.859940
4    0.680308
dtype: float64

In [57]:
# Series 합과 평균 구하기
print("Simple Aggregation.sum() : ", simple_agg.sum())
print("Simple Aggregation.mean() : ", simple_agg.mean())

Simple Aggregation.sum() :  3.372703715646031
Simple Aggregation.mean() :  0.6745407431292062


#### DataFrame

In [58]:
df_agg = pd.DataFrame({'A':rng.rand(5), 'B':rng.rand(5)})
df_agg

Unnamed: 0,A,B
0,0.450499,0.015966
1,0.013265,0.230894
2,0.942202,0.241025
3,0.563288,0.683264
4,0.385417,0.609997


In [59]:
# DataFrame 평균 구하기
df_agg.mean()

A    0.470934
B    0.356229
dtype: float64

In [60]:
df_agg.mean(axis=1) # 연산 방향 열(인덱스는 그대로)
#df_agg.mean(axis='columns') 와 동일

0    0.233233
1    0.122079
2    0.591614
3    0.623276
4    0.497707
dtype: float64

In [61]:
# 주요 aggregation 결과 출력
df_agg.describe()

Unnamed: 0,A,B
count,5.0,5.0
mean,0.470934,0.356229
std,0.334595,0.281118
min,0.013265,0.015966
25%,0.385417,0.230894
50%,0.450499,0.241025
75%,0.563288,0.609997
max,0.942202,0.683264


In [62]:
# NaN 버림
test = pd.DataFrame({'A':rng.rand(4), 'B':rng.rand(4)})
test2 = pd.DataFrame({'A':rng.rand(6), 'B':rng.rand(6)})
res = test - test2
res

Unnamed: 0,A,B
0,0.801882,0.428821
1,-0.66892,-0.145288
2,-0.058694,-0.312893
3,-0.212914,-0.393472
4,,
5,,


In [63]:
res.dropna()

Unnamed: 0,A,B
0,0.801882,0.428821
1,-0.66892,-0.145288
2,-0.058694,-0.312893
3,-0.212914,-0.393472


#### 조건에 따른 연산(Groupby)
* groupby 사용

In [64]:
gpb = pd.DataFrame({'key':['A','B','C','A','B','C'], 'data_1':range(6), 'data_2':rng.randint(0,10, 6)}, columns=['key','data_1','data_2'])
gpb

Unnamed: 0,key,data_1,data_2
0,A,0,4
1,B,1,7
2,C,2,9
3,A,3,8
4,B,4,8
5,C,5,0


In [65]:
# groupby method 결과
print("gpb.groupby('key')")
print(gpb.groupby('key'),"\n")
print("gpb.groupby('key').sum()")

gpb.groupby('key')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f29f0133390> 

gpb.groupby('key').sum()


In [66]:
gpb.groupby('key').sum()

Unnamed: 0_level_0,data_1,data_2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,12
B,5,15
C,7,9


In [67]:
gpb.groupby('key')['data_1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f29f008c850>

In [68]:
gpb.groupby('key')['data_1'].median()

key
A    1.5
B    2.5
C    3.5
Name: data_1, dtype: float64

In [69]:
gpb.groupby('key')['data_2'].median()

key
A    6.0
B    7.5
C    4.5
Name: data_2, dtype: float64

##### groupby().aggregate()

In [70]:
gpb

Unnamed: 0,key,data_1,data_2
0,A,0,4
1,B,1,7
2,C,2,9
3,A,3,8
4,B,4,8
5,C,5,0


In [71]:
# Name every aggregation by its string alias.  Passing callables such as
# np.median or the builtin max for these reductions is deprecated in recent
# pandas (FutureWarning); the string aliases produce the same column labels.
gpb.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data_1,data_1,data_1,data_2,data_2,data_2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,4,6.0,8
B,1,2.5,4,7,7.5,8
C,2,3.5,5,0,4.5,9


In [72]:
gpb.groupby('key').aggregate({'data_1':'min','data_2':'max'})

Unnamed: 0_level_0,data_1,data_2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,8
B,1,8
C,2,9


In [73]:
gpb.groupby('key').transform(lambda x: x - x.mean())

Unnamed: 0,data_1,data_2
0,-1.5,-2.0
1,-1.5,-0.5
2,-1.5,4.5
3,1.5,2.0
4,1.5,0.5
5,1.5,-4.5


In [74]:
L = [0, 1, 0, 1, 2, 1] # group by this list of labels instead of the 'key' column
# numeric_only=True keeps the result to data_1/data_2 on every pandas version;
# from pandas 2.0 a plain sum() would also concatenate the strings in 'key'.
display('gpb', 'gpb.groupby(L).sum(numeric_only=True)')

Unnamed: 0,key,data_1,data_2
0,A,0,4
1,B,1,7
2,C,2,9
3,A,3,8
4,B,4,8
5,C,5,0

Unnamed: 0,data_1,data_2
0,2,13
1,9,15
2,4,8


In [75]:
# index를 group key로 매핑한 dictionary 사용 가능
gpb_2 = gpb.set_index('key')
mapping = {'A': 'vowel' , 'B':'consonant', 'C':'consonant'}
display('gpb_2', 'gpb_2.groupby(mapping).sum()')

Unnamed: 0_level_0,data_1,data_2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,7
C,2,9
A,3,8
B,4,8
C,5,0

Unnamed: 0_level_0,data_1,data_2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,24
vowel,3,12


### 3) eval 과 query

In [76]:
import numexpr
x = rng.rand(1000000)
y = rng.rand(1000000)
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)

True

In [77]:
nrows, ncols = 100000, 100
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))

In [78]:
# df1 + df2 + df3 + df4 를 빠르고 적은 메모리로 출력
pd.eval('df1 + df2 + df3 + df4')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,2.561369,3.044401,1.382681,1.231064,2.191970,2.743256,1.526697,1.329062,1.173678,1.360818,...,3.342814,3.247086,3.231267,1.823053,1.593704,2.123236,2.046907,2.901226,2.554672,2.021535
1,2.191476,2.056440,0.662390,2.774288,2.056219,2.539064,1.157651,1.756716,2.746227,2.096448,...,1.899765,2.150954,2.253358,1.774324,1.847826,3.009875,0.609044,0.928466,1.133396,1.032594
2,1.951580,2.061862,1.788145,2.113700,1.803748,3.095901,1.673772,1.485460,2.833710,2.307153,...,2.201745,1.380495,1.723956,3.072724,2.244600,0.952810,1.135276,2.127585,1.419563,1.725079
3,2.329409,1.710299,1.542559,2.347206,2.404835,2.119762,2.140090,1.675952,2.534931,1.493770,...,3.034900,1.402817,1.297959,1.887413,2.291277,2.811693,1.415584,1.733286,1.254888,1.423648
4,2.483899,1.567752,0.531401,1.692672,2.439861,3.058180,1.833190,1.970158,1.733438,2.170081,...,2.024021,1.811651,1.743026,2.761991,2.168994,1.188238,2.389701,1.662503,1.631819,1.110168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2.479969,1.670973,2.439533,3.102899,1.190393,3.195402,2.044621,1.812835,2.012336,1.433852,...,3.242257,1.528849,1.176621,1.481724,2.239144,2.026703,1.330812,1.421843,2.053927,2.090685
99996,2.608036,1.502818,0.817296,2.132779,1.636059,2.355447,1.716861,2.117861,2.320082,1.773073,...,1.450417,1.837748,1.047594,2.756591,2.338204,0.934797,1.997811,1.771455,2.016109,1.960579
99997,2.329723,1.795498,1.963579,1.896286,1.408729,1.026612,2.076714,1.558112,2.131139,2.609403,...,1.518204,1.637167,1.240723,1.459412,1.699759,3.260794,1.288179,2.027039,1.188966,2.328907
99998,2.046876,3.118000,1.776146,2.549077,1.848167,2.292836,1.985955,2.489993,3.553437,2.664197,...,1.503495,2.721996,2.674156,2.960464,2.644212,1.623106,2.925302,1.419036,2.001067,1.537407


In [79]:
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100,3))) for i in range(5))
pd.eval('df1 + df2 + df3 + df4 + df5')

Unnamed: 0,0,1,2
0,2703,1451,1941
1,2960,1619,3326
2,1790,2895,3566
3,2491,2554,2817
4,3104,2413,2699
...,...,...,...
95,2675,2520,3317
96,2550,2110,2446
97,2656,1348,2187
98,2322,2451,3522


In [80]:
pd.eval('-df1 * df2 / (df3 + df4) - df5')

Unnamed: 0,0,1,2
0,-725.812408,-142.120000,-470.471230
1,-934.670103,-540.735910,-1006.072072
2,-851.846512,-997.339816,-698.046411
3,-722.171198,-946.316129,-556.823908
4,-1569.450723,-643.647307,-671.138329
...,...,...,...
95,-781.273224,-805.668874,-1455.687566
96,-663.626938,-417.032550,-410.260870
97,-774.244290,-775.049689,-1262.280624
98,-617.209903,-828.374795,-1044.567426


In [81]:
pd.eval('df1 < df2 <= df3 != df4')

Unnamed: 0,0,1,2
0,True,False,False
1,False,False,True
2,False,False,True
3,False,False,False
4,False,False,True
...,...,...,...
95,False,False,False
96,False,False,True
97,False,True,False
98,False,False,False


In [82]:
# DataFrame 에서 name을 사용한 연산
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
result = pd.eval("(df.A + df.B) / (df.C - 1)")
result

0     -0.364728
1     -1.063594
2     -1.346499
3     -4.913855
4     -6.227490
         ...   
995   -1.439662
996   -2.991181
997   -1.435064
998   -3.828055
999   -0.645653
Length: 1000, dtype: float64

In [83]:
df.eval('D = (A+B)/C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.121867,0.173962,0.188904,1.566031
1,0.426752,0.587114,0.046755,21.684501
2,0.419834,0.727171,0.148158,7.741767
3,0.96438,0.761907,0.64869,2.661189
4,0.211779,0.287628,0.919806,0.542948


In [84]:
result2 = df.query('A < 0.5 and B < 0.5')
result2

Unnamed: 0,A,B,C,D
0,0.121867,0.173962,0.188904,1.566031
4,0.211779,0.287628,0.919806,0.542948
21,0.302805,0.454704,0.360669,2.100286
22,0.229581,0.326766,0.271940,2.045842
25,0.189131,0.108732,0.841121,0.354126
...,...,...,...,...
981,0.265245,0.341811,0.194425,3.122314
982,0.292709,0.337834,0.504142,1.250725
983,0.137429,0.389371,0.176876,2.978351
989,0.004854,0.021402,0.537343,0.048862


## Datatype

*   str
*   times



### 1) str

In [85]:
str_data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in str_data]

['Peter', 'Paul', 'Mary', 'Guido']

In [86]:
names = pd.Series(str_data)
names

0    peter
1     Paul
2     MARY
3    gUIDO
dtype: object

In [87]:
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
dtype: object

### 2) times

In [88]:
date = np.array('2015-07-04', dtype=np.datetime64)
date

array('2015-07-04', dtype='datetime64[D]')

In [89]:
date + np.arange(12)

array(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
       '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
       '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
      dtype='datetime64[D]')

In [90]:
# Pandas에서 Index를 Time으로 사용하기
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [91]:
pd.date_range('2015-07-03', '2015-07-10')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [92]:
pd.date_range('2015-07-01', periods=10)

DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-04',
               '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08',
               '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [93]:
pd.date_range('2015-07-01', periods=10, freq='H')

DatetimeIndex(['2015-07-01 00:00:00', '2015-07-01 01:00:00',
               '2015-07-01 02:00:00', '2015-07-01 03:00:00',
               '2015-07-01 04:00:00', '2015-07-01 05:00:00',
               '2015-07-01 06:00:00', '2015-07-01 07:00:00',
               '2015-07-01 08:00:00', '2015-07-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

## 데이터 추가

1.   concat(이어 붙이기)
2.   merge(합치기)

* concat은 단순하게 이어 붙이는 반면, merge는 중복되는 것을 자동으로 key로 사용할 수 있음.



In [94]:
def make_df(cols, ind):  # helper that keeps the examples short
    """Build a small labelled DataFrame for demonstrations.

    Each cell holds the column label followed by the row label, e.g. the
    cell in column ``'A'`` and row ``0`` is the string ``'A0'``.

    Parameters
    ----------
    cols : iterable of column labels (a string yields one column per character)
    ind : iterable of row labels, also used as the index of the result

    Returns
    -------
    pandas.DataFrame with one column per entry of ``cols``.
    """
    cells = {}
    for col in cols:
        cells[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

# example DataFrame
make_df('ABC', range(3))
class display(object):  # convenience wrapper for showing several results at once
    """Show several expressions side by side in a notebook.

    Each positional argument is a string containing a Python expression, for
    example ``'df1'`` or ``'pd.concat([df1, df2])'``.  The expressions are
    stored as given and evaluated only when the object is rendered.

    Evaluation happens against the global namespace of the module that
    defines this class, and against nothing else.  Names local to the
    rendering code therefore cannot shadow a user variable of the same name.

    Note: the expressions are passed to ``eval``.  Only pass trusted,
    hard-coded strings, never text that comes from outside the notebook.
    """
    # HTML fragment for one expression: {0} is the expression text and
    # {1} is the HTML representation of its value.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Store the expression strings; nothing is evaluated yet."""
        self.args = args

    def _evaluate(self, expr):
        """Return the value of ``expr`` looked up in module globals only."""
        # A bare eval(expr) would search the caller's local names first, so an
        # expression such as 'a' could resolve to a loop variable instead of
        # the user's variable.  Passing globals() explicitly avoids that.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one HTML fragment per expression, joined by newlines.

        Every evaluated value must provide its own ``_repr_html_`` method.
        """
        return '\n'.join(self.template.format(expr, self._evaluate(expr)._repr_html_())
                         for expr in self.args)

    def __repr__(self):
        """Return the plain-text fallback: each expression followed by its repr."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)
    

### 1) concat

In [95]:
# NumPy joins arrays with np.concatenate([x, y], axis=...)
# pandas joins Series/DataFrames with pd.concat([x, y])
ser1 = pd.Series(['A', 'B', 'C'], index=[1,2,3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4,5,6])  # last value was ' F' with a stray leading space
pd.concat([ser1, ser2])

1     A
2     B
3     C
4     D
5     E
6     F
dtype: object

In [96]:
df1 = pd.DataFrame({'A':['A1','A2'], 'B':['B1','B2']}, index=[1, 2])
df2 = pd.DataFrame({'A':['A3','A4'], 'B':['B3','B4']}, index=[3,4])
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [97]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
display('df3', 'df4', "pd.concat([df3, df4], axis=1)")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


#### Index 중복 관련

In [98]:
# index 중복 가능
x = make_df('AB', [0,1])
y = make_df('AB', [2,3])
y.index = x.index
display('x', 'y', 'pd.concat([x,y])')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [99]:
# 중복 없애기 가능
display('x', 'y','pd.concat([x,y], ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


#### 새로운 key 값으로 묶기

In [100]:
# keys 값 설정 통해 MultiIndex add 가능
display('x', 'y', "pd.concat([x,y], keys=['x','y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


#### 결합 열 불일치

In [101]:
df5 = make_df('ABC', [1,2])
df6 = make_df('BCD', [3,4])
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [102]:
# NaN 해결
# outer : 합집합(default)
# inner : 교집합
display('df5', 'df6', "pd.concat([df5,df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


### 2) merge
* Default로 key 값들의 교집합이 결과로 나옴. => how keyword가 inner 라서 그럼

#### One - to - One joins

In [104]:
mf1 = pd.DataFrame({'employee' : ['Bob', 'Jake', 'Lisa', 'Sue'], 'group' : ['Accounting', 'Engineering', 'Engineering','HR']})
mf2 = pd.DataFrame({'employee':['Lisa','Bob', 'Jake', 'Sue'], 'hire_data': [2004, 2008, 2012, 2014]})
display('mf1','mf2')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,employee,hire_data
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [105]:
# pd.merge() 는 각 DataFrame object가 'employee' 열을 가지고 있음을 파악하고 자동으로 이를 key로 사용.
# 이때 index 정보가 소실됨에 주의
mf3 = pd.merge(mf1, mf2)
mf3

Unnamed: 0,employee,group,hire_data
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


#### Many - to - One joins

In [106]:
mf4 = pd.DataFrame({'group':['Accounting', 'Engineering', 'HR'], 'supervisor':['Carly','Guido','Steve']})
display('mf3', 'mf4', 'pd.merge(mf3, mf4)')

Unnamed: 0,employee,group,hire_data
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve

Unnamed: 0,employee,group,hire_data,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


#### Many - to - Many joins

In [107]:
mf5 = pd.DataFrame({'group' : ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'], 'skills':['math','spreadsheets','coding','linux','spreadsheets','organization']})
display('mf1', 'mf5', 'pd.merge(mf1,mf5)')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


#### 원하는 key의 데이터를 출력하기

##### key가 columns 인 경우

In [109]:
display('mf1','mf2', "pd.merge(mf1, mf2, on='employee')")
# on 사용 시 사용되는 두 DataFrame에 key가 모두 존재해야 함.

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,employee,hire_data
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014

Unnamed: 0,employee,group,hire_data
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [110]:
# When the key column has a different name in each frame, use left_on and right_on
mf3_1 = pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'], 'salary':[70000, 80000, 120000, 90000]})
display('mf1','mf3_1', 'pd.merge(mf1, mf3_1, left_on="employee", right_on="name")')

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Sue,HR,Sue,90000


##### key가 index인 경우

In [112]:
# set_index() = 열 -> 행
# reset_index() = 행 -> 열
idf1 = mf1.set_index('employee')
idf2 = mf2.set_index('employee')
display('idf1', 'idf2')

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0_level_0,hire_data
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


In [113]:
display('idf1', 'idf2', "pd.merge(idf1, idf2, left_index=True, right_index=True)")

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0_level_0,hire_data
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014

Unnamed: 0_level_0,group,hire_data
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [114]:
display('idf1', 'idf2', 'idf1.join(idf2)')

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0_level_0,hire_data
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014

Unnamed: 0_level_0,group,hire_data
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


#### index 와 columns 혼합 사용



In [116]:
display('idf1', 'mf3_1', "pd.merge(idf1, mf3_1, left_index=True, right_on='name')")

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000


#### 중복 columns의 등장

In [111]:
# drop() 이용하여 중복 제거
pd.merge(mf1, mf3_1, left_on="employee", right_on="name").drop('name', axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


#### how keyword

In [117]:
# Two small frames whose 'name' keys only partly overlap ('Mary' is the only shared key).
hf6 = pd.DataFrame([['Peter', 'fish'],
                    ['Paul', 'beans'],
                    ['Mary', 'bread']],
                   columns=['name', 'food'])
hf7 = pd.DataFrame([['Mary', 'wine'],
                    ['Joseph', 'beer']],
                   columns=['name', 'drink'])
# A plain merge keeps only the rows whose key appears in both frames.
display('hf6', 'hf7', 'pd.merge(hf6, hf7)')

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [118]:
# By default the result contains only the intersection of the key values.
# The how keyword changes this; how='inner' spells out the default explicitly.
pd.merge(hf6, hf7, how='inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [120]:
# how='outer': keep the union of the keys; entries without a match are left missing.
display('hf6', 'hf7', "pd.merge(hf6, hf7, how='outer')")

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [121]:
# how='left': keep every key of the left frame (hf6).
display('hf6', 'hf7', "pd.merge(hf6, hf7, how='left')")

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


In [122]:
# how='right': keep every key of the right frame (hf7).
display('hf6', 'hf7', "pd.merge(hf6, hf7, how='right')")

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer

Unnamed: 0,name,food,drink
0,Mary,bread,wine
1,Joseph,,beer


### join 이후 column에 중복이 있는 경우 접미어 사용

In [123]:
# Both frames carry a non-key column called 'rank', so merging on 'name'
# produces two clashing columns, shown as rank_x / rank_y in the result.
mf8 = pd.DataFrame([['Bob', 1], ['Jake', 2], ['Lisa', 3], ['Sue', 4]],
                   columns=['name', 'rank'])
mf9 = pd.DataFrame([['Bob', 3], ['Jake', 1], ['Lisa', 4], ['Sue', 2]],
                   columns=['name', 'rank'])
display('mf8', 'mf9', 'pd.merge(mf8, mf9, on="name")')

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2

Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [125]:
# The suffixes keyword sets the suffixes appended to the clashing column names.
display('mf8', 'mf9', 'pd.merge(mf8, mf9, on="name", suffixes=["_L", "_R"])')

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


## 계층적 인덱싱(다차원 pandas)

1.   Series
2.   DataFrame
3.   명시적 MultiIndex



### 1) Series

In [127]:
# Naive approach: plain (state, year) tuples used as the labels of a flat index.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
population = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(data=population, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [128]:
pop.reindex(index) # reindex with the same tuple list; the rows are still labelled by plain tuples

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [129]:
# Build a real MultiIndex from the same list of (state, year) tuples.
index_pro = pd.MultiIndex.from_tuples(index)
index_pro

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [141]:
# Reindexing the Series with the MultiIndex gives a hierarchically indexed Series.
pop_pr = pop.reindex(index_pro)
pop_pr

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [142]:
# The data can now be accessed conveniently, e.g. all entries whose second level is 2010.
pop_pr[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

#### Indexing and Slicing

In [160]:
# NOTE(review): the output shows the level names 'state' and 'year', which are only
# assigned by `pop_pr.index.names = [...]` further down in this notebook, so this
# cell was executed out of order relative to the file layout.
pop_pr

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [161]:
# One key per level selects a single element.
pop_pr['California', 2000]

33871648

In [163]:
# Partial indexing: giving only the first level returns a Series keyed by the remaining level.
pop_pr['California']

year
2000    33871648
2010    37253956
dtype: int64

In [164]:
# Label slice on the first level; both endpoints are included in the result.
pop_pr.loc['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [165]:
# An empty slice on the first level selects by the second level only.
pop_pr[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [166]:
pop_pr[['California', 'Texas']] # fancy indexing with a list of first-level labels

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

### Series와 DataFrame 상호 변환

1.   unstack() 과 stack()



In [167]:
# NOTE(review): pop_df is only created further down in this notebook, so this cell
# fails on a fresh top-to-bottom run. The displayed result has 'total' and 'under18'
# entries, i.e. it was produced from the two-column pop_df of the "2) DataFrame" section.
pop_df.stack()

state       year         
California  2000  total      33871648
                  under18     9267089
            2010  total      37253956
                  under18     9284094
New York    2000  total      18976457
                  under18     4687374
            2010  total      19378102
                  under18     4318033
Texas       2000  total      20851820
                  under18     5906301
            2010  total      25145561
                  under18     6879014
dtype: int64

In [144]:
# unstack() turns the MultiIndex Series into a DataFrame: states as rows, years as columns.
pop_df = pop_pr.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


### 2) DataFrame

In [146]:
# Two-column frame on the (state, year) MultiIndex of pop_pr.
# Note that this rebinds pop_df, replacing the unstacked frame assigned to it earlier.
pop_df = pd.DataFrame({'total':pop_pr, 'under18':[9267089, 9284094, 4687374, 4318033, 5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [149]:
# Fraction of the population under 18 per (state, year); unstack() shows it as a 2-D table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


##### 생성

In [151]:
# Using lists: a list of two index arrays becomes a two-level MultiIndex.
hf = pd.DataFrame(
    data=np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
hf

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.079406,0.092059
a,2,0.529204,0.828088
b,1,0.943717,0.203604
b,2,0.838694,0.39801


In [152]:
# A Series built from a dict with tuple keys gets a MultiIndex, as the output shows.
hdata = {
    ('California', 2000): 33871648, ('California', 2010): 37253956,
    ('Texas', 2000): 20851820, ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457, ('New York', 2010): 19378102,
}
pd.Series(data=hdata)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

##### MultiIndex 이름 붙이기

In [153]:
# Name the two levels of the MultiIndex by assigning to index.names (modifies pop_pr's index).
pop_pr.index.names = ['state', 'year']
pop_pr

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### Indexing and Slicing

In [168]:
# NOTE(review): health_data is only constructed further down in this notebook
# (in the columns-MultiIndex section), so this cell depends on out-of-order execution.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,17.0,39.5,28.0,38.7,45.0,37.3
2013,2,48.0,36.9,37.0,36.9,41.0,37.7
2014,1,59.0,35.8,34.0,37.1,36.0,36.4
2014,2,28.0,38.4,21.0,39.2,26.0,37.8


In [170]:
# In a DataFrame, [] indexing addresses the columns first:
# the (subject, type) pair selects a single column as a Series.
health_data['Guido','HR']

year  visit
2013  1        28.0
      2        37.0
2014  1        34.0
      2        21.0
Name: (Guido, HR), dtype: float64

In [171]:
# Positional selection: first two rows and first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,17.0,39.5
2013,2,48.0,36.9


In [172]:
# Use a tuple when indexing into a hierarchical row or column index.
health_data.loc[:, ('Bob','HR')]

year  visit
2013  1        17.0
      2        48.0
2014  1        59.0
      2        28.0
Name: (Bob, HR), dtype: float64

### 3) 명시적 MultiIndex

In [154]:
# From a list of arrays, one array per level.
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [155]:
# From a list of tuples, one tuple per entry.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [156]:
# From the Cartesian product of the per-level values.
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [157]:
# Directly from levels (the distinct values per level) and codes (positions into those levels).
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

#### Columns MultiIndex

In [158]:
# Hierarchical row index (year, visit) and hierarchical column index (subject, type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock measurements: standard-normal noise rounded to one decimal place.
mdata = np.round(np.random.randn(4, 6), decimals=1)
# Every other column (the 'HR' columns) is scaled by 10 ...
mdata[:, ::2] = mdata[:, ::2] * 10
# ... and then every value is shifted by 37.
mdata = mdata + 37

health_data = pd.DataFrame(data=mdata, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,17.0,39.5,28.0,38.7,45.0,37.3
2013,2,48.0,36.9,37.0,36.9,41.0,37.7
2014,1,59.0,35.8,34.0,37.1,36.0,36.4
2014,2,28.0,38.4,21.0,39.2,26.0,37.8


In [159]:
# Selecting a top-level column label returns the sub-frame for that subject.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,28.0,38.7
2013,2,37.0,36.9
2014,1,34.0,37.1
2014,2,21.0,39.2


### 데이터 재배치 in MultiIndex

In [173]:
# Series on a MultiIndex whose first level is deliberately out of order ('a', 'c', 'b').
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(data=np.random.rand(6), index=index)
# Name the two levels in place on the Series' index.
data.index.set_names(['char', 'int'], inplace=True)
data

char  int
a     1      0.665860
      2      0.015344
c     1      0.815334
      2      0.453711
b     1      0.774731
      2      0.902339
dtype: float64

In [174]:
# If the index is not sorted, sort it with sort_index() before slicing.
# (The original note also mentioned sortlevel(); Series/DataFrame.sortlevel() has been
# deprecated since pandas 0.20 in favour of sort_index() and is gone from later releases.)
# NOTE(review): this rebinds `data`, so the unsorted Series is no longer available afterwards.
data = data.sort_index()
data

char  int
a     1      0.665860
      2      0.015344
b     1      0.774731
      2      0.902339
c     1      0.815334
      2      0.453711
dtype: float64

In [175]:
# With the index sorted, a label slice over the first level returns the 'a' and 'b' groups.
data['a':'b']

char  int
a     1      0.665860
      2      0.015344
b     1      0.774731
      2      0.902339
dtype: float64

In [177]:
# Show the (state, year)-indexed population Series again before reshaping it.
pop_pr

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [178]:
# level=0 moves the 'state' level into the columns.
pop_pr.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [180]:
# level=1 moves the 'year' level into the columns.
pop_pr.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [181]:
# stack() reverses unstack(), giving back the original Series.
pop_pr.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### Index setting and resetting
* reset_index를 통해 index들을 column으로 변환 가능
* set_index를 통해 column을 index로 변환 가능



In [182]:
# reset_index() turns the index levels into ordinary columns; name= labels the value column.
pop_flat = pop_pr.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [183]:
# set_index() rebuilds the MultiIndex from the 'state' and 'year' columns.
pop_flat.set_index(['state','year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


#### Aggregation

In [184]:
# Show the health data again; it is aggregated by index level in the next step.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,17.0,39.5,28.0,38.7,45.0,37.3
2013,2,48.0,36.9,37.0,36.9,41.0,37.7
2014,1,59.0,35.8,34.0,37.1,36.0,36.4
2014,2,28.0,38.4,21.0,39.2,26.0,37.8


In [185]:
# Average the measurements within each year.
# DataFrame.mean(level=...) is deprecated (pandas 1.3) and was removed in pandas 2.0;
# grouping on the index level and taking the mean gives the same table.
data_mean = health_data.groupby(level='year').mean()
data_mean

  """Entry point for launching an IPython kernel.


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,32.5,38.2,32.5,37.8,43.0,37.5
2014,43.5,37.1,27.5,38.15,31.0,37.1


## Pivot Table

In [187]:
# Load the 'titanic' example dataset through seaborn.
titanic = sns.load_dataset('titanic')

In [188]:
# Preview the first rows to see which columns are available.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [191]:
# Survival rate by sex.
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [193]:
# Survival rate by sex and passenger class.
titanic.groupby(['sex','class'])['survived'].aggregate('mean')

sex     class 
female  First     0.968085
        Second    0.921053
        Third     0.500000
male    First     0.368852
        Second    0.157407
        Third     0.135447
Name: survived, dtype: float64

In [194]:
# Same aggregation, with the 'class' level unstacked into columns for a 2-D table.
titanic.groupby(['sex','class'])['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [195]:
# The two examples above used groupby.
# From here on the examples use pivot_table; this call displays the same table
# as the groupby + unstack version.
titanic.pivot_table('survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### Multi-level Pivot_tables

In [196]:
# Bin the ages into (0, 18] and (18, 80] and use the bins as a second row level.
age = pd.cut(titanic['age'], [0,18,80])
titanic.pivot_table('survived', ['sex', age], 'class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [199]:
# The columns can be given multiple levels as well:
# split the fares into two quantile bins and put them above 'class'.
fare = pd.qcut(titanic['fare'],2)
titanic.pivot_table('survived', ['sex',age], [fare,'class'])

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


### 대표 예시

In [200]:
# Per-column aggregation through a dict: count the survivors and average the fare.
# The aggregation is named with the string 'sum' rather than the Python builtin sum:
# newer pandas versions emit a FutureWarning when builtin callables are passed to
# agg-style calls, and the string form keeps the current result.
titanic.pivot_table(index='sex', columns='class', aggfunc={'survived': 'sum', 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [201]:
# Compute the totals along each grouping: margins=True adds an 'All' row and an 'All' column.
titanic.pivot_table('survived', index='sex', columns='class', margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


## Missing Data

1.   mask 방식 - boolean array | bit 통해 표시
2.   sentinel value 방식 - 특수한 값으로 표시



### NaN value 찾기


In [202]:
# A Series mixing a number, NaN, a string and None; the displayed dtype is object.
ndt = pd.Series([1, np.nan, 'hello', None])
ndt

0        1
1      NaN
2    hello
3     None
dtype: object

In [204]:
# isnull() flags both NaN and None as missing.
ndt.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [205]:
# Use the boolean mask from notnull() to keep only the non-missing entries.
ndt[ndt.notnull()]

0        1
2    hello
dtype: object

### NaN 처리하기

In [206]:
# dropna() removes the missing entries.
ndt.dropna()

0        1
2    hello
dtype: object

In [207]:
# fillna() replaces the missing entries with the given value
# (here the string 'False', not the boolean False).
ndt.fillna('False')

0        1
1    False
2    hello
3    False
dtype: object

In [208]:
# For a DataFrame, dropna() removes entire rows or entire columns.
ndf = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
ndf

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [209]:
# Default: drop every row that contains a missing value.
ndf.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [210]:
# axis='columns': drop every column that contains a missing value.
ndf.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [211]:
# Replace every missing entry with a fixed placeholder value.
ndf.fillna('trash')

Unnamed: 0,0,1,2
0,1.0,trash,2
1,2.0,3.0,5
2,trash,4.0,6


In [212]:
# Conditions for dropping: add a column 3 that is entirely NaN to try out how= and thresh=.
# NOTE: this modifies ndf in place, so re-running the earlier ndf cells gives different output.
ndf[3] = np.nan
ndf

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [213]:
# how='all': drop only the columns in which every value is missing.
ndf.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [215]:
ndf.dropna(axis='rows', thresh=3) # drop every row that has fewer than 3 non-missing values

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [216]:
# Forward fill: each NaN takes the value that precedes it in its column.
# A leading NaN has nothing before it and stays missing.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
ndf.ffill()

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [217]:
# Backward fill: each NaN takes the value that follows it in its column.
# A trailing NaN has nothing after it and stays missing.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
ndf.bfill()

Unnamed: 0,0,1,2,3
0,1.0,3.0,2,
1,2.0,3.0,5,
2,,4.0,6,
