# Chapter 6. 데이터 인덱싱

In [2]:
import numpy as np
import pandas as pd

import datetime
from datetime import datetime, date

import matplotlib.pyplot as plt
%matplotlib inline

# Keep rendered tables compact: at most 8 columns and 10 rows, 60 characters wide.
pd.set_option(
    'display.max_columns', 8,
    'display.max_rows', 10,
    'display.width', 60,
)

In [3]:
# Load sp500.csv, keeping four columns chosen by position and using the
# 'Symbol' column as the row index.
sp500 = pd.read_csv(
    './data/Learning-Pandas-Second-Edition-master/data/sp500.csv',
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

### 인덱스

In [9]:
# Seed NumPy's global generator so the random 'foo' column is reproducible.
np.random.seed(123456)

# 10,000 rows: random floats in 'foo' and the consecutive integers
# 100..10099 in 'key' (used below to compare lookup strategies).
df = pd.DataFrame(dict(foo=np.random.random(10000), key=range(100, 10100)))

display(df.head())

Unnamed: 0,foo,key
0,0.12697,100
1,0.966718,101
2,0.260476,102
3,0.897237,103
4,0.37675,104


In [10]:
# Compare lookup time: boolean-mask selection vs. index-based lookup.
# This cell times the boolean-mask variant, which compares every value in
# the 'key' column against 10099 before selecting the matching row.
%timeit df.loc[df['key'] == 10099]

272 µs ± 7.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
# Promote 'key' to the row index so the same row can be fetched by label.
df_with_index = df.set_index(['key'])

# Time the label-based lookup of the same key used in the boolean-mask cell.
%timeit df_with_index.loc[10099]

40.4 µs ± 956 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


#### index 유형

In [13]:
# A tiny two-row frame; printing `.columns` shows that the column labels
# are themselves stored in an Index object.
temps = pd.DataFrame(
    data={'City': ['Missoula', 'Philadelphia'], 'Temperature': [70, 80]},
)

display(temps)
print(temps.columns)

Unnamed: 0,City,Temperature
0,Missoula,70
1,Philadelphia,80


Index(['City', 'Temperature'], dtype='object')


#### 정수 유형 (Int64Index, RangeIndex)

In [15]:
# Values 10..19 labelled with an explicit integer array 0..9.
df_i64 = pd.DataFrame(data=np.arange(10, 20), index=np.arange(10))

display(df_i64.head())
print(df_i64.index)

Unnamed: 0,0
0,10
1,11
2,12
3,13
4,14


Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [4]:
# No index is passed here, so pandas supplies its default row index.
df_range = pd.DataFrame(data=np.arange(10, 15))

display(df_range.head())
print(df_range.index)

Unnamed: 0,0
0,10
1,11
2,12
3,13
4,14


RangeIndex(start=0, stop=5, step=1)


#### 부동소수점 유형 (Float64Index)

In [6]:
# np.arange takes (start, stop, step): the values 0, 5, ..., 995 are
# labelled with the floats 0.0, 0.5, ..., 99.5.
df_f64 = pd.DataFrame(
    data=np.arange(0, 1000, 5),
    index=np.arange(0.0, 100.0, 0.5),
)

display(df_f64.head())
print(df_f64.index)

Unnamed: 0,0
0.0,0
0.5,5
1.0,10
1.5,15
2.0,20


Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,
               3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0,
              98.5, 99.0, 99.5],
             dtype='float64', length=200)


#### 이산 간격 유형 (IntervalIndex) - 어떤 경우에 쓸 수 있을까?

In [7]:
# Five break points yield four consecutive intervals, one per row of 'A'.
df_interval = pd.DataFrame(
    data={'A': [1, 2, 3, 4]},
    index=pd.IntervalIndex.from_breaks([0.0, 0.5, 1.0, 1.5, 2.0]),
)

display(df_interval)
print(df_interval.index)

Unnamed: 0,A
"(0.0, 0.5]",1
"(0.5, 1.0]",2
"(1.0, 1.5]",3
"(1.5, 2.0]",4


IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]], dtype='interval[float64, right]')


#### 범주형 (CategoricalIndex)

In [22]:
# Build the frame, convert 'B' to a categorical with the explicit category
# order c, a, b, then use it as the row index.
# pd.Categorical is used here in place of the older
# `astype('category', categories=...)` spelling this cell previously tried.
df_categorical = (
    pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
    .assign(B=lambda frame: pd.Categorical(frame['B'], categories=list('cab')))
    .set_index('B')
)

display(df_categorical)
print(df_categorical.index)

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
b,2
b,3
c,4
a,5


CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, dtype='category', name='B')


#### 날짜 및 시간 유형 (DatetimeIndex)

In [34]:
# Five consecutive hourly timestamps starting at 2023-08-11 00:00.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in recent pandas releases (FutureWarning since 2.2); older
# releases accept the lowercase form as well.
rng = pd.date_range('20230811', periods=5, freq='h')

print(rng)

DatetimeIndex(['2023-08-11 00:00:00',
               '2023-08-11 01:00:00',
               '2023-08-11 02:00:00',
               '2023-08-11 03:00:00',
               '2023-08-11 04:00:00'],
              dtype='datetime64[ns]', freq='H')


In [40]:
# This wrapping is not actually required: pd.date_range already returns
# its result as a DatetimeIndex.
pd.DatetimeIndex(rng)

DatetimeIndex(['2023-08-11 00:00:00',
               '2023-08-11 01:00:00',
               '2023-08-11 02:00:00',
               '2023-08-11 03:00:00',
               '2023-08-11 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [35]:
# One standard-normal draw per timestamp in `rng`, indexed by those timestamps.
time_series = pd.Series(data=np.random.randn(rng.size), index=rng)

print(time_series)

2023-08-11 00:00:00    0.658593
2023-08-11 01:00:00    0.760640
2023-08-11 02:00:00   -0.407708
2023-08-11 03:00:00   -0.258351
2023-08-11 04:00:00   -0.883316
Freq: H, dtype: float64


#### 기간 유형 (PeriodIndex)

In [36]:
# Three consecutive monthly periods (June through August 2023).
periods = pd.PeriodIndex(
    data=['2023-06', '2023-07', '2023-08'],
    freq='M',
)

print(periods)

PeriodIndex(['2023-06', '2023-07', '2023-08'], dtype='period[M]')


In [38]:
# One standard-normal draw per period, indexed by the monthly periods.
period_series = pd.Series(data=np.random.randn(periods.size), index=periods)

print(period_series)

2023-06   -0.144382
2023-07   -0.086361
2023-08    0.525438
Freq: M, dtype: float64


### 인덱스를 통한 데이터 선택

In [42]:
# Integers 0..4 labelled with the letters 'a'..'e'.
s = pd.Series(data=np.arange(5), index=['a', 'b', 'c', 'd', 'e'])

display(s)

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [44]:
# On a Series, [] and .loc[] perform the same label lookup;
# both values are printed on separate lines.
print(s['b'], s.loc['b'], sep='\n')

1
1


In [45]:
# 2x2 frame holding 10..13 row by row: rows 'v' and 'w', columns 'a' and 'b'.
df = pd.DataFrame(
    np.arange(10, 14).reshape(2, 2),
    index=['v', 'w'],
    columns=['a', 'b'],
)

display(df)

Unnamed: 0,a,b
v,10,11
w,12,13


In [47]:
# On a DataFrame, [] and .loc[] behave differently:
#   df['a']     -> selects the column labelled 'a'
#   df.loc['w'] -> selects the row labelled 'w'
# (df.loc['a'] would raise an error because there is no row labelled 'a'.)
print(df['a'], df.loc['w'], sep='\n')

v    10
w    12
Name: a, dtype: int32
a    12
b    13
Name: w, dtype: int32


In [50]:
# Label slicing on a Series works the same way through [] and .loc[].
print(s['b':'d'], s.loc['b':'d'], sep='\n')

b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32


In [65]:
# Slicing a DataFrame: .loc can slice by column label (second indexer),
# whereas a plain [] slice works on the row labels.
display(df.loc[:, 'a':'a'], df['v':'w'])

Unnamed: 0,a
v,10
w,12


Unnamed: 0,a,b
v,10,11
w,12,13


### 리인덱싱

In [69]:
# reindex can pull out just the wanted row labels from an existing frame;
# labels that are not present (here 'FOO') come back as rows of missing values.
reindexed = sp500.reindex(['MMM', 'ABBV', 'FOO'], axis='index')

display(reindexed)

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABBV,Health Care,53.95,2.954
FOO,,,


In [70]:
# reindex works on columns as well; the unknown 'NewCol' is added as an
# all-missing column.
sp500.reindex(['Price', 'Book Value', 'NewCol'], axis='columns')

Unnamed: 0_level_0,Price,Book Value,NewCol
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,141.14,26.668,
ABT,39.60,15.573,
ABBV,53.95,2.954,
ACN,79.79,8.326,
ACE,102.91,86.897,
...,...,...,...
YHOO,35.02,12.768,
YUM,74.77,5.147,
ZMH,101.84,37.181,
ZION,28.43,30.191,


### 계층형 인덱스

In [82]:
# Move 'Symbol' back into a regular column, then build a two-level row
# index from the ('Sector', 'Symbol') pair.
reindexed = sp500.reset_index()
multi_fi = reindexed.set_index(keys=['Sector', 'Symbol'])

display(multi_fi.head())

# Show the index class and how many levels it has.
row_index = multi_fi.index
print(type(row_index))
print(row_index.nlevels)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


<class 'pandas.core.indexes.multi.MultiIndex'>
2


In [89]:
# First index entry (a tuple with one label per level), followed by the
# distinct labels stored for level 0.
print(multi_fi.index[0], multi_fi.index.levels[0], sep='\n')

('Industrials', 'MMM')
Index(['Consumer Discretionary', 'Consumer Discretionary ',
       'Consumer Staples', 'Consumer Staples ', 'Energy',
       'Financials', 'Health Care', 'Industrials',
       'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')


In [90]:
# Per-row labels of the outer level, selected by its name ('Sector' is level 0).
multi_fi.index.get_level_values('Sector')

Index(['Industrials', 'Health Care', 'Health Care',
       'Information Technology', 'Financials',
       'Health Care', 'Information Technology',
       'Utilities', 'Health Care', 'Financials',
       ...
       'Utilities', 'Information Technology',
       'Information Technology', 'Financials',
       'Industrials', 'Information Technology',
       'Consumer Discretionary', 'Health Care',
       'Financials', 'Health Care'],
      dtype='object', name='Sector', length=500)