In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): this silences every warning category for the whole session, so
# deprecation / FutureWarning messages raised by later cells are not shown.
warnings.filterwarnings('ignore')

# Pandas 2
## <font color=red>집계와 분류</font>
하나의 값으로 대용량 데이터세트의 기본 특성에 대한 통찰력을 제공

### Pandas의 간단한 Aggregation 연산

> Series의 경우

In [2]:
# 1. Draw five reproducible uniform samples from a seeded RandomState.
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))

# 2. Show the Series, followed by two built-in aggregates: its sum and its mean.
display(ser, ser.sum(), ser.mean())

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

2.811925491708157

0.5623850983416314

> DataFrame의 경우

In [3]:
# 1. Build a two-column frame; 'A' is drawn before 'B' from the shared `rng`.
df = pd.DataFrame({col: rng.rand(5) for col in ('A', 'B')})

# 2. Show the frame and its per-column mean (the default axis).
# 3. Then show the per-row mean (axis=1 is the positional spelling of 'columns').
display(df, df.mean(), df.mean(axis=1))

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


A    0.477888
B    0.443420
dtype: float64

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

> Seaborn이 제공하는 data를 사용해보자.

In [4]:
import seaborn as sns
planets = sns.load_dataset('planets')

# Show the (rows, columns) shape first, then the first five rows as a sample.
display(planets.shape, planets.head())

(1035, 6)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [5]:
# Quick look at summary statistics, computed only over rows without missing values.
planets_complete = planets.dropna()
display(planets_complete.describe())

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


### <font color=red> GroupBy : 분할, 적용, 결합</font>
- 분할 : 지정된 키 값을 기준으로 DataFrame을 나누고 분류하는 단계다.
- 적용 : 개별 그룹 내에서 일반적으로 집계, 변환, 필터링 같은 함수를 계산한다.
- 결합 : 이 연산의 결과를 결과 배열에 병합한다.

> **분할, 적용, 결합**

In [6]:
# 1. Create a small frame whose 'key' column repeats A, B, C twice.
df = pd.DataFrame(
    {'key': ['A', 'B', 'C'] * 2, 'data': range(6)},
    columns=['key', 'data'],
)

display(df)

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [7]:
# 2. Group by the 'key' column. This returns a GroupBy object only; nothing is
#    aggregated until an aggregation method is called on it.
df.groupby('key')

<pandas.core.groupby.DataFrameGroupBy object at 0x1a131ad6a0>

In [8]:
# 3. Sum the remaining column within each 'key' group.
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


> **GroupBy 객체**

In [9]:
# 1. Revisit the planets dataset (loaded through seaborn), using 'method' as the
#    group key, and take the per-group median of every remaining column.
planets.groupby('method').median()

Unnamed: 0_level_0,number,orbital_period,mass,distance,year
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Astrometry,1.0,631.18,,17.875,2011.5
Eclipse Timing Variations,2.0,4343.5,5.125,315.36,2010.0
Imaging,1.0,27500.0,,40.395,2009.0
Microlensing,1.0,3300.0,,3840.0,2010.0
Orbital Brightness Modulation,2.0,0.342887,,1180.0,2011.0
Pulsar Timing,3.0,66.5419,,1200.0,1994.0
Pulsation Timing Variations,1.0,1170.0,,,2007.0
Radial Velocity,1.0,360.2,1.26,40.445,2009.0
Transit,1.0,5.714932,1.47,341.0,2012.0
Transit Timing Variations,2.0,57.011,,855.0,2012.5


In [10]:
# 2. Select only the 'orbital_period' column from the grouped data before
#    aggregating, which yields a Series indexed by method.
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [11]:
# 3. A GroupBy object can be iterated directly: every step yields the group
#    key together with the sub-frame holding that group's rows.
row_template = "{0:30s} shape={1}"
for method, group in planets.groupby('method'):
    print(row_template.format(method, group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [12]:
# 4. describe() runs a whole set of aggregations for each group at once,
#    here on the 'year' column only.
# unstack() then pivots that table so the result is a single Series indexed by
# (statistic, method).
planets.groupby('method')['year'].describe().unstack()

       method                       
count  Astrometry                          2.000000
       Eclipse Timing Variations           9.000000
       Imaging                            38.000000
       Microlensing                       23.000000
       Orbital Brightness Modulation       3.000000
       Pulsar Timing                       5.000000
       Pulsation Timing Variations         1.000000
       Radial Velocity                   553.000000
       Transit                           397.000000
       Transit Timing Variations           4.000000
mean   Astrometry                       2011.500000
       Eclipse Timing Variations        2010.000000
       Imaging                          2009.131579
       Microlensing                     2009.782609
       Orbital Brightness Modulation    2011.666667
       Pulsar Timing                    1998.400000
       Pulsation Timing Variations      2007.000000
       Radial Velocity                  2007.518987
       Transit             

> **집계, 필터, 변환, 적용**

In [13]:
# Build the demo frame: fixed keys, a 0..5 counter, and six seeded random integers.
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data1': range(6),
        'data2': rng.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)

df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [14]:
"""집계 : aggregate()""" 

# 1) aggregate() accepts a list of aggregations that is applied to data1 and
#    data2 alike. String names are used because passing builtins or NumPy
#    callables (max, np.median, ...) is deprecated in recent pandas; the
#    resulting column labels ('min', 'median', 'max') are unchanged.
display(df.groupby('key').aggregate(['min', 'median', 'max']))

# 2) A dict maps each column to its own aggregation(s).
display(df.groupby('key').aggregate(
        {'data1': ['min', 'mean'], 'data2': 'max'}))

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


Unnamed: 0_level_0,data1,data1,data2
Unnamed: 0_level_1,min,mean,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,0,1.5,5
B,1,2.5,7
C,2,3.5,9


In [15]:
"""필터링 : filter(), 그룹 속성을 기준으로 데이터를 걸러낼 수 있다."""

# Keep only the groups whose data2 standard deviation is greater than a threshold.
def filter_func(x, threshold=4):
    """Return whether the ``data2`` standard deviation of ``x`` exceeds ``threshold``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain a ``data2`` column.
    threshold : float, default 4
        Cut-off for the standard deviation. The default keeps the original
        behaviour; another value can be forwarded through
        ``groupby(...).filter(filter_func, threshold=...)``.

    Returns
    -------
    bool
        True when the group should be kept.
    """
    return x['data2'].std() > threshold

# Show the input frame next to the per-group standard deviations the filter relies on.
display(df, df.groupby('key').std())

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641


In [16]:
# Only rows from groups B and C are returned. filter() takes a callable that
# yields one boolean per group; it is given once as a named function and once
# as an equivalent lambda.
by_key = df.groupby('key')
display(
    by_key.filter(filter_func),
    by_key.filter(lambda grp: grp['data2'].std() > 4),
)

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [17]:
"""변환 : trnasformation"""

display(df)

# Subtract each group's own mean from its values, column by column: for
# example, the mean of group A's data1 is removed from A's data1 rows.
# Useful for centering data per group.
centered = df.groupby('key').transform(lambda col: col - col.mean())
display(centered)

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [18]:
"""apply - 임의의 함수를 그룹 결과에 적용할 때 사용"""

# Divide each data1 value by the sum of data2 within the same group.
def norm_by_data2(x):
    """Return a copy of group ``x`` with ``data1`` divided by the group's ``data2`` sum.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order. ``x`` itself is
        left untouched, because mutating the object handed to ``apply`` is not
        supported by pandas.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return x.assign(data1=x['data1'] / x['data2'].sum())

display(df)
# group_keys=False keeps the original row index in the result. Without it,
# pandas >= 2.0 prepends the group key as an extra index level for this kind of
# apply, which would no longer match the table shown below.
display(df.groupby('key', group_keys=False).apply(norm_by_data2))

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,key,data1,data2
0,A,0.0,5
1,B,0.142857,0
2,C,0.166667,3
3,A,0.375,3
4,B,0.571429,7
5,C,0.416667,9


> **분할 Key를 따로 지정**
- 하나의 열 이름을 기준으로 DataFrame을 분할하였는데, 분할키를 지정하는 방식을 살펴보자

In [19]:
# 1. The split key can be any list with one label per row.
L = [0, 1, 0, 1, 2, 0]
display(df)

# label 0 -> sum over the rows at index 0, 2, 5
# label 1 -> sum over the rows at index 1, 3
# label 2 -> sum over the row at index 4
# Only the numeric columns are summed: with the string column 'key' included,
# pandas >= 2.0 would concatenate its values into an extra column.
display(df.groupby(L)[['data1', 'data2']].sum())

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [20]:
# 2. Instead of a column name, the split key can be passed directly as a
#    Series (here the 'key' column itself).
df.groupby(df['key']).sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [21]:
# 3. Group by a dict that maps index values to group labels.
# Use 'key' as the index so the dict is matched against it.
df2 = df.set_index('key')

# A -> 'vowel'; B and C are merged under 'consonant'.
mapping = dict(A='vowel', B='consonant', C='consonant')

display(df2, df2.groupby(mapping).sum())

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [22]:
# 4. Pass a Python function as the key: here str.lower turns the index labels
#    A/B/C into the group labels a/b/c.
df2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


> **분류(Grouping) 문제**

In [23]:
# Floor each discovery year to its decade (year // 10 * 10) and turn it into a
# string label such as '2000s'.
decade = (10 * (planets['year'] // 10)).astype(str) + 's'

# Name the Series so that the grouping level is labelled 'decade'.
decade.name = 'decade'

# Sum the 'number' column per (method, decade). It is computed once and reused
# for both displays.
number_by_decade = planets.groupby(['method', decade])['number'].sum()
display(number_by_decade)
# Pivot the decades into columns; combinations without rows are shown as 0.
display(number_by_decade.unstack().fillna(0))

method                         decade
Astrometry                     2010s       2
Eclipse Timing Variations      2000s       5
                               2010s      10
Imaging                        2000s      29
                               2010s      21
Microlensing                   2000s      12
                               2010s      15
Orbital Brightness Modulation  2010s       5
Pulsar Timing                  1990s       9
                               2000s       1
                               2010s       1
Pulsation Timing Variations    2000s       1
Radial Velocity                1980s       1
                               1990s      52
                               2000s     475
                               2010s     424
Transit                        2000s      64
                               2010s     712
Transit Timing Variations      2010s       9
Name: number, dtype: int64

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


## <font color=red> 피벗 테이블</font>
- 피벗 테이블은 입력값으로 간단한 열 단위의 데이터를 취하고, 그 데이터에 대한 다차원 요약을 제공하는 2차원 테이블로 항목을 그룹핑한다.
- 피벗 테이블은 근본적으로 GroupBy 집계의 다차원 버전이다.

In [24]:
# 1. 타이타닉의 승객 데이터베이스를 사용할 것이다.
import seaborn as sns
titanic = sns.load_dataset('titanic')

# 2. Look at a sample of the Titanic data (first five rows).
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [25]:
# 3. Survival rate by sex using GroupBy: the mean of the 0/1 'survived' column.
#    The double brackets keep the result a one-column DataFrame.
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [26]:
# 4. Survival rate by sex and passenger class using GroupBy; unstack() moves
#    'class' from the row index into the columns.
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [27]:
# 5. The same table as in step 4, written more compactly with pivot_table
#    (its default aggregation is the mean).
titanic.pivot_table('survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


> **다단계 피벗 테이블**

In [28]:
# 1. Add age as a third dimension (sex x age band x class). Ages are cut into
#    two bands: (0, 18] and (18, 80].
age = pd.cut(titanic['age'], [0, 18, 80])

# Mean of 'survived' (the default aggregation) per sex / age band / class.
display(titanic.pivot_table('survived', ['sex', age], 'class'))

# Same layout, but summing 'survived' instead. The string 'sum' replaces
# np.sum, because passing NumPy callables as aggfunc is deprecated in recent
# pandas; the result is the same.
display(titanic.pivot_table('survived',
                            ['sex', age],  # index
                            'class',  # columns
                            aggfunc='sum'))

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",10,14,22
female,"(18, 80]",72,54,25
male,"(0, 18]",4,9,11
male,"(18, 80]",36,6,27


In [29]:
# 2. pd.qcut bins by quantile; asking for 2 bins splits the fares into a lower
#    and an upper band.
fare = pd.qcut(titanic['fare'], 2)

# Four dimensions: sex and age band on the rows, fare band and class on the columns.
titanic.pivot_table(values='survived',
                    index=['sex', age],
                    columns=[fare, 'class'])

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


> **기타 피벗 테이블 옵션**

In [30]:
# 1. The aggfunc option: a dict chooses the aggregation per value column.
#    The string 'sum' replaces the builtin `sum`, because passing builtin
#    callables as aggfunc is deprecated in recent pandas; the result is the same.
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [31]:
# 2. margins=True adds an 'All' row and column that aggregate over each whole
#    group, using the same aggregation as the cells (here the default, mean).
titanic.pivot_table('survived', 
                    index='sex', 
                    columns='class', 
                    margins=True) # aggfunc is left at its default: mean

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


## 벡터화된 문자열 연산

### <font color=red>Pandas 문자열 연산</font>
- Pandas는 문자열을 담고 있는 Pandas Series와 Index 객체의 `str` 속성을 통해 벡터화된 문자열 연산을 수행하고 누락된 데이터를 올바르게 처리하기 위한 기능을 제공

In [32]:
# 1. Create a test dataset of mixed-case names that includes one missing value.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)

names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [33]:
# 2. Capitalize every entry through the vectorized .str accessor; the missing
#    value is skipped and stays None.
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

> **문자열 메서드들**

In [34]:
# Create the test Series of full names used by the string-method examples below.
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                  'Eric Idle', 'Terry Jones', 'Michael Palin'])

In [35]:
"""lower()"""

# Lower-case every name element-wise.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [36]:
"""len()"""

# Length of each name in characters (the space between the words is counted).
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [37]:
"""startswith()"""

# Boolean mask: True where the name begins with 'T'.
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [38]:
"""split()"""

# Split each name on whitespace, then pick the last token of every list.
tokens = monte.str.split()
display(tokens, tokens.str.get(-1))

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [39]:
"""슬라이싱"""

# Slice the first three characters of every name.
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

> **정규 표현식을 활용하는 메서드**

In [40]:
# Extract the first name from each element: the capture group matches the
# first contiguous run of ASCII letters.
# expand=False keeps the result a Series for this single capture group. Current
# pandas defaults to expand=True, which would return a one-column DataFrame
# instead of the Series shown below.
monte.str.extract('([A-Za-z]+)', expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [41]:
# Using the regex anchors ^ (start) and $ (end), find every name that neither
# starts with an uppercase vowel nor ends with a lowercase vowel, i.e. starts
# and ends with a consonant. Names that do not match yield an empty list.
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

## 시계열 다루기
- 타임스탬프 : 특정 시점을 말한다.
- 시간 간격, 기간 : 특정 시작점과 종료점 사이의 시간의 길이를 말한다.
- 시간 델타(time delta)나 지속 기간(duration) : 정확한 시간 길이를 말한다.

### 파이썬에서의 날짜와 시간

> **기본 파이썬 날짜와 시간: datetime, dateutil**

In [42]:
from datetime import datetime
# Build a datetime for 2018-02-04; the time part is left at its default, midnight.
datetime(year=2018, month=2, day=4)

datetime.datetime(2018, 2, 4, 0, 0)

In [43]:
# dateutil can parse a date written as free-form text.
from dateutil import parser
date = parser.parse("4th of July, 2015")

date

datetime.datetime(2015, 7, 4, 0, 0)

In [44]:
# Format the date as its full weekday name.
date.strftime('%A')

'Saturday'

### 타입이 지정된 시간 배열: NumPy의 datetime64
- 64비트 정밀도에 제한되기 때문에 이 유형의 연산이 파이썬의 datetime 객체로 직접 작업하는 것보다 훨씬 더 빨리 수행될 수 있다.
- 시간 분해능과 최대 시간 범위 사이의 절충점을 도입한다.

In [45]:
import numpy as np
# Store a single date as a NumPy array with a datetime64 dtype.
date = np.array('2018-02-04', dtype=np.datetime64)

date

array('2018-02-04', dtype='datetime64[D]')

In [46]:
# Adding the integers 0..11 produces 12 consecutive dates starting at `date`
# (the given date plus the following 11 days).
date + np.arange(12)

array(['2018-02-04', '2018-02-05', '2018-02-06', '2018-02-07',
       '2018-02-08', '2018-02-09', '2018-02-10', '2018-02-11',
       '2018-02-12', '2018-02-13', '2018-02-14', '2018-02-15'],
      dtype='datetime64[D]')

### Pandas에서의 날짜와 시간: 두 세계의 최선
- datetime과 dateutil의 사용 편의성과 numpy.datetime64의 효율적인 저장소와 벡터화된 인터페이스를 결합

In [47]:
# Parse a free-form date string into a pandas Timestamp.
date = pd.to_datetime("4th of July, 2015")
date

Timestamp('2015-07-04 00:00:00')

In [48]:
# Format the Timestamp as its full weekday name.
date.strftime('%A')

'Saturday'

In [49]:
# Vectorized date arithmetic: add offsets of 0..11 days to obtain a DatetimeIndex.
date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

## Pandas 시계열: 시간으로 인덱싱하기

In [50]:
# Index a Series by timestamps.
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04',
                          '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)

display(
    data,
    data['2014-07-04':'2015-07-04'],  # slice by timestamp labels
    data['2015'],  # passing only a year selects that year's rows
)

2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

2015-07-04    2
2015-08-04    3
dtype: int64

### DatetimeIndex를 생성하기

In [51]:
# 1. Build a DatetimeIndex from values given in several different forms
#    (a datetime object and differently formatted strings).
# NOTE(review): newer pandas infers a single format for string inputs; confirm
# that this mixed-format list still parses on the installed version.
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                       '2015-Jul-6', '07-07-2015', '20150708'])

dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [52]:
# 2. Convert to a PeriodIndex by passing a frequency code ('D' = daily).
dates.to_period('D')

PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]', freq='D')

In [53]:
# 3. Subtracting one date from the others yields a TimedeltaIndex.
dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

### 정규 시퀀스 : `pd.date_range()`

In [54]:
# Three ways to build a regular date sequence with pd.date_range().
# NOTE(review): newer pandas releases prefer the lowercase alias 'h' over 'H';
# confirm against the installed version before changing it.
display(
    pd.date_range('2015-07-03', '2015-07-10'),         # start and end
    pd.date_range('2015-07-03', periods=8),            # start and number of periods
    pd.date_range('2015-07-03', periods=8, freq='H'),  # start, periods and hourly step
)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

## 고성능 Pandas: eval()과 query()
- NumPy의 벡터화/브로드캐스팅된 연산과 Pandas의 그룹화 유형의 연산은 일반적인 경우에 대체로 효율적이고 효과적이기는 하지만 임시 중간 객체 생성에 의존하는 경우가 종종 있어 계산 시간과 메모리 사용에 과도한 오버헤드를 일으킬 수 있다.
- 2014년 1월에 출시된 0.13 버전을 기준으로 Pandas는 비용이 많이 드는 중간 배열의 할당 없이 속도가 빠른 C연산에 직접 접근할 수 있는 실험적인 도구인 `eval()`과 `query()`를 제공한다.

### `eval()`

In [55]:
# 1. Build the test set: four 100000 x 100 frames of uniform random numbers.
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
# The generator is consumed left to right, so df1..df4 take successive draws
# from the same seeded `rng` and therefore hold different values.
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) 
                      for i in range(4))

In [56]:
# Preview the first five rows of df1.
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.37454,0.950714,0.731994,0.598658,0.156019,0.155995,0.058084,0.866176,0.601115,0.708073,...,0.119594,0.713245,0.760785,0.561277,0.770967,0.493796,0.522733,0.427541,0.025419,0.107891
1,0.031429,0.63641,0.314356,0.508571,0.907566,0.249292,0.410383,0.755551,0.228798,0.07698,...,0.093103,0.897216,0.900418,0.633101,0.33903,0.34921,0.725956,0.89711,0.887086,0.779876
2,0.642032,0.08414,0.161629,0.898554,0.606429,0.009197,0.101472,0.663502,0.005062,0.160808,...,0.0305,0.037348,0.822601,0.360191,0.127061,0.522243,0.769994,0.215821,0.62289,0.085347
3,0.051682,0.531355,0.540635,0.63743,0.726091,0.975852,0.5163,0.322956,0.795186,0.270832,...,0.990505,0.412618,0.372018,0.776413,0.340804,0.930757,0.858413,0.428994,0.750871,0.754543
4,0.103124,0.902553,0.505252,0.826457,0.32005,0.895523,0.389202,0.010838,0.905382,0.091287,...,0.455657,0.620133,0.277381,0.188121,0.463698,0.353352,0.583656,0.077735,0.974395,0.986211


In [57]:
# Preview the first five rows of df2.
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.926538,0.382461,0.871469,0.761471,0.328826,0.988821,0.120738,0.358905,0.954462,0.004711,...,0.310465,0.816988,0.930747,0.111477,0.772517,0.801181,0.466825,0.005912,0.70511,0.487674
1,0.715167,0.490948,0.904532,0.319521,0.582585,0.98033,0.019068,0.089363,0.281105,0.143648,...,0.433028,0.13254,0.263659,0.339079,0.234842,0.507921,0.544545,0.197424,0.432392,0.218104
2,0.975796,0.049902,0.092684,0.158453,0.858309,0.65255,0.681106,0.360168,0.843117,0.619341,...,0.156821,0.772316,0.412088,0.796167,0.54858,0.722526,0.141587,0.459266,0.128221,0.661666
3,0.369458,0.911366,0.892686,0.763454,0.581681,0.207756,0.024249,0.92586,0.191849,0.047043,...,0.313598,0.566552,0.844425,0.079068,0.33843,0.921877,0.856621,0.285027,0.505441,0.571166
4,0.794953,0.714644,0.652743,0.639999,0.801813,0.223324,0.468607,0.409739,0.846211,0.488558,...,0.349061,0.986111,0.389271,0.42801,0.645183,0.998789,0.805533,0.310009,0.876316,0.946936


In [58]:
# Preview the first five rows of df3.
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.381785,0.88428,0.45058,0.889203,0.400178,0.329899,0.37492,0.289165,0.856012,0.170531,...,0.232548,0.42708,0.687908,0.990223,0.532107,0.29157,0.604532,0.510344,0.178462,0.816248
1,0.575967,0.057404,0.320802,0.174745,0.708598,0.165073,0.8523,0.841379,0.810541,0.867123,...,0.205949,0.18309,0.481792,0.47993,0.36037,0.920427,0.515166,0.698365,0.925812,0.272917
2,0.553476,0.657017,0.72186,0.058866,0.818086,0.882324,0.633707,0.786487,0.107093,0.659608,...,0.725832,0.627375,0.387747,0.20446,0.973627,0.26253,0.912395,0.852041,0.050451,0.668992
3,0.84181,0.738977,0.768721,0.352721,0.454399,0.91565,0.164899,0.872948,0.419942,0.671492,...,0.917378,0.928159,0.034869,0.679377,0.351755,0.23352,0.620001,0.338868,0.797963,0.447284
4,0.069417,0.37045,0.329881,0.88214,0.688254,0.393034,0.288496,0.248113,0.835122,0.668993,...,0.723569,0.378604,0.294903,0.595871,0.940018,0.544825,0.030322,0.157838,0.364742,0.932007


In [59]:
# Preview the first five rows of df4.
df4.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.773222,0.02287,0.135256,0.547153,0.112734,0.382484,0.282446,0.47939,0.973054,0.968159,...,0.742549,0.014689,0.638707,0.557382,0.935098,0.161364,0.792444,0.789514,0.522443,0.575358
1,0.608169,0.141071,0.560629,0.028672,0.017801,0.92825,0.939959,0.865063,0.125569,0.062302,...,0.563912,0.085168,0.545653,0.062591,0.079648,0.904816,0.570289,0.112442,0.18727,0.167751
2,0.79028,0.450114,0.316514,0.443655,0.961636,0.18353,0.092308,0.563372,0.137717,0.493172,...,0.943914,0.999072,0.656912,0.87979,0.801385,0.020247,0.27461,0.013139,0.884154,0.128746
3,0.062328,0.129402,0.951153,0.674908,0.706534,0.06913,0.331226,0.421508,0.578126,0.67481,...,0.026716,0.690321,0.373365,0.361318,0.044817,0.219551,0.684745,0.104272,0.996603,0.25626
4,0.13967,0.010372,0.683865,0.662876,0.593358,0.27329,0.001748,0.173523,0.578557,0.084194,...,0.472835,0.914288,0.634008,0.922544,0.063849,0.203463,0.805392,0.09748,0.733605,0.278122


In [60]:
# 2. Time the sum of all four DataFrames using plain pandas arithmetic (the classic way).
%timeit df1 + df2 + df3 + df4

76.6 ms ± 2.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [61]:
# 3. Time the same sum evaluated through pd.eval() from a string expression.
%timeit pd.eval('df1 + df2 + df3 + df4')

41.4 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


> **산술 연산자**

In [62]:
# Time a compound arithmetic expression using plain pandas operators (the classic way).
%timeit df1 * df2 / (df3 + df4)

75.8 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [63]:
# Time the same arithmetic expression evaluated through pd.eval().
%timeit pd.eval('df1 * df2 / (df3 + df4)')

42.5 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### `query()`

In [64]:
# Create the test frame for query(): 1000 rows x 3 columns (A, B, C) drawn from `rng`.
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.615875,0.525167,0.047354
1,0.330858,0.412879,0.441564
2,0.689047,0.559068,0.23035
3,0.290486,0.695479,0.852587
4,0.42428,0.534344,0.245216


In [65]:
# 1-1. Classic approach: select rows where both A and B are below 0.5 by
#      combining boolean masks.
df[(df.A < 0.5) & (df.B < 0.5)].head()

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146


In [66]:
# 1-2. The same selection (A and B both below 0.5) written as a query() string.
df.query('A < 0.5 and B < 0.5').head()

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146


In [67]:
# 2-1. Classic approach: compare the columns against a Python variable with
#      boolean masks.
Cmean = df['C'].mean()
display(Cmean)

below_c_mean = (df.A < Cmean) & (df.B < Cmean)
df[below_c_mean].head()

0.5104765814526574

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146


In [68]:
# 2-2. The same selection with query(); the @ prefix refers to the Python
#      variable Cmean from inside the expression string.
df.query('A < @Cmean and B < @Cmean').head()

Unnamed: 0,A,B,C
1,0.330858,0.412879,0.441564
8,0.448611,0.415924,0.481001
10,0.11291,0.394884,0.950129
11,0.191011,0.118751,0.130223
14,0.075723,0.260648,0.956146


`eval`이나 `query`는 언제 써야 할까? 메모리를 절약하는데 사용한다. 작은 배열에서는 오히려 전형적인 메서드가 더 빠르다.