In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sp500 = pd.read_csv('data/sp500.csv', index_col='Symbol', usecols= [0, 2, 3, 7])

In [6]:
np.random.seed(123456)
df = pd.DataFrame({'foo' : np.random.random(10000), 'key' : range(100, 10100)})
df[:5]

Unnamed: 0,foo,key
0,0.12697,100
1,0.966718,101
2,0.260476,102
3,0.897237,103
4,0.37675,104


In [7]:
df[df.key == 10099]

Unnamed: 0,foo,key
9999,0.272283,10099


In [8]:
%timeit df[df.key==10099]

397 µs ± 2.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
df_with_index = df.set_index(['key'])
df_with_index[:5]

Unnamed: 0_level_0,foo
key,Unnamed: 1_level_1
100,0.12697
101,0.966718
102,0.260476
103,0.897237
104,0.37675


In [10]:
df_with_index.loc[10099]

foo    0.272283
Name: 10099, dtype: float64

In [11]:
# 이 결과를 봤을때 인덱스를 사용한 검색이 대략 5배는 빠르다.
%timeit df_with_index.loc[10099]

82.2 µs ± 595 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [12]:
temps = pd.DataFrame({'City' : ['Missoula' , 'Philadelphia'], 'Temperature' : [70, 80]})
temps

Unnamed: 0,City,Temperature
0,Missoula,70
1,Philadelphia,80


In [13]:
temps.columns

Index(['City', 'Temperature'], dtype='object')

In [14]:
df_164 = pd.DataFrame(np.arange(10, 20), index = np.arange(0, 10))
df_164

Unnamed: 0,0
0,10
1,11
2,12
3,13
4,14
5,15
6,16
7,17
8,18
9,19


In [16]:
df_range = pd.DataFrame(np.arange(10, 15))
df_range.head()

Unnamed: 0,0
0,10
1,11
2,12
3,13
4,14


In [18]:
df_range.index

RangeIndex(start=0, stop=5, step=1)

In [19]:
df_f64 = pd.DataFrame(np.arange(0, 1000, 5), np.arange(0.0, 100.0, 0.5))
df_f64.head()

Unnamed: 0,0
0.0,0
0.5,5
1.0,10
1.5,15
2.0,20


In [20]:
df_f64.index

Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,  3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0, 98.5, 99.0, 99.5],
             dtype='float64', length=200)

In [21]:
df_interval = pd.DataFrame({ 'A': [1, 2, 3, 4]}, index = pd.IntervalIndex.from_breaks([0, 0.5, 1.0, 1.5, 2.0]))
df_interval

Unnamed: 0,A
"(0.0, 0.5]",1
"(0.5, 1.0]",2
"(1.0, 1.5]",3
"(1.5, 2.0]",4


In [31]:
# df_categorial = pd.DataFrame({'A' : np.arange(6), 'B' : list('aabbca')})
# df_categorial['B'] = df_categorial['B'].astype('category', categories=list('cab'))
df_categorial = df_categorial.set_index('B')
df_categorial.index

Index(['a', 'a', 'b', 'b', 'c', 'a'], dtype='object', name='B')

In [32]:
df_categorial.loc['a']

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
a,0
a,1
a,5


In [3]:
rng = pd.date_range('5/1/2017', periods = 5, freq='H')
ts = pd.Series(np.random.randn(len(rng)), index = rng)
ts

2017-05-01 00:00:00    0.688096
2017-05-01 01:00:00   -0.742108
2017-05-01 02:00:00    0.606819
2017-05-01 03:00:00    0.330280
2017-05-01 04:00:00   -0.255339
Freq: H, dtype: float64

In [4]:
ts.index

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [5]:
periods = pd.PeriodIndex(['2017-1', '2017-2', '2017-3'], freq = 'M')
periods

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]', freq='M')

In [7]:
period_series = pd.Series(np.random.randn(len(periods)), index = periods)
period_series

2017-01    1.095441
2017-02   -1.247358
2017-03   -0.733901
Freq: M, dtype: float64

In [10]:
date_times = pd.DatetimeIndex(pd.date_range('5/1/2017', periods = 5, freq='H'))
date_times

DatetimeIndex(['2017-05-01 00:00:00', '2017-05-01 01:00:00',
               '2017-05-01 02:00:00', '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [12]:
df_date_times = pd.DataFrame(np.arange(0, len(date_times)), index=date_times)
df_date_times

Unnamed: 0,0
2017-05-01 00:00:00,0
2017-05-01 01:00:00,1
2017-05-01 02:00:00,2
2017-05-01 03:00:00,3
2017-05-01 04:00:00,4


In [13]:
df_date_times.index = pd.DatetimeIndex(pd.date_range('6/1/2017', periods= 5, freq='H'))
df_date_times

Unnamed: 0,0
2017-06-01 00:00:00,0
2017-06-01 01:00:00,1
2017-06-01 02:00:00,2
2017-06-01 03:00:00,3
2017-06-01 04:00:00,4


In [15]:
s = pd.Series(np.arange(0, 5), index = list('abcde'))
s

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [16]:
s['b']

1

In [17]:
s.loc['b']

1

In [18]:
df = pd.DataFrame([np.arange(10, 12), np.arange(12, 14)], columns=list('ab'), index = list('vw'))
df

Unnamed: 0,a,b
v,10,11
w,12,13


In [19]:
df['a']

v    10
w    12
Name: a, dtype: int64

In [20]:
df.loc['w']

a    12
b    13
Name: w, dtype: int64

In [21]:
s['b':'d']

b    1
c    2
d    3
dtype: int32

In [22]:
s.loc['b':'d']

b    1
c    2
d    3
dtype: int32

In [23]:
s.loc[['a', 'c', 'e']]

a    0
c    2
e    4
dtype: int32

In [25]:
sp500[:5]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [26]:
index_moved_to_col = sp500.reset_index()
index_moved_to_col

Unnamed: 0,Symbol,Sector,Price,Book Value
0,MMM,Industrials,141.14,26.668
1,ABT,Health Care,39.60,15.573
2,ABBV,Health Care,53.95,2.954
3,ACN,Information Technology,79.79,8.326
4,ACE,Financials,102.91,86.897
...,...,...,...,...
495,YHOO,Information Technology,35.02,12.768
496,YUM,Consumer Discretionary,74.77,5.147
497,ZMH,Health Care,101.84,37.181
498,ZION,Financials,28.43,30.191


In [27]:
index_moved_to_col.set_index('Sector').head()

Unnamed: 0_level_0,Symbol,Price,Book Value
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.6,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897


In [29]:
reindexed = sp500.reindex(index = ['MMM', 'ABBV', 'FOO'])
reindexed

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABBV,Health Care,53.95,2.954
FOO,,,


In [30]:
sp500.reindex(columns=['Price', 'Book Value', 'NewCol']).head()

Unnamed: 0_level_0,Price,Book Value,NewCol
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,141.14,26.668,
ABT,39.6,15.573,
ABBV,53.95,2.954,
ACN,79.79,8.326,
ACE,102.91,86.897,


In [31]:
reindexed = sp500.reset_index()
reindexed

Unnamed: 0,Symbol,Sector,Price,Book Value
0,MMM,Industrials,141.14,26.668
1,ABT,Health Care,39.60,15.573
2,ABBV,Health Care,53.95,2.954
3,ACN,Information Technology,79.79,8.326
4,ACE,Financials,102.91,86.897
...,...,...,...,...
495,YHOO,Information Technology,35.02,12.768
496,YUM,Consumer Discretionary,74.77,5.147
497,ZMH,Health Care,101.84,37.181
498,ZION,Financials,28.43,30.191


In [32]:
multi_fi = reindexed.set_index(['Sector', 'Symbol'])
multi_fi

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Health Care,ABT,39.60,15.573
Health Care,ABBV,53.95,2.954
Information Technology,ACN,79.79,8.326
Financials,ACE,102.91,86.897
...,...,...,...
Information Technology,YHOO,35.02,12.768
Consumer Discretionary,YUM,74.77,5.147
Health Care,ZMH,101.84,37.181
Financials,ZION,28.43,30.191


In [33]:
type(multi_fi.index)

pandas.core.indexes.multi.MultiIndex

In [34]:
len(multi_fi.index.levels)

2

In [35]:
multi_fi.index.levels[0]

Index(['Consumer Discretionary', 'Consumer Discretionary ', 'Consumer Staples',
       'Consumer Staples ', 'Energy', 'Financials', 'Health Care',
       'Industrials', 'Industries', 'Information Technology', 'Materials',
       'Telecommunications Services', 'Utilities'],
      dtype='object', name='Sector')

In [36]:
multi_fi.index.levels[1]

Index(['A', 'AA', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACE', 'ACN', 'ACT', 'ADBE',
       ...
       'XLNX', 'XOM', 'XRAY', 'XRX', 'XYL', 'YHOO', 'YUM', 'ZION', 'ZMH',
       'ZTS'],
      dtype='object', name='Symbol', length=500)

In [38]:
multi_fi.index.get_level_values(0)

Index(['Industrials', 'Health Care', 'Health Care', 'Information Technology',
       'Financials', 'Health Care', 'Information Technology', 'Utilities',
       'Health Care', 'Financials',
       ...
       'Utilities', 'Information Technology', 'Information Technology',
       'Financials', 'Industrials', 'Information Technology',
       'Consumer Discretionary', 'Health Care', 'Financials', 'Health Care'],
      dtype='object', name='Sector', length=500)

In [39]:
multi_fi.xs('Industrials')

Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ALLE,52.46,0.000
APH,95.71,18.315
AVY,48.20,15.616
BA,132.41,19.870
...,...,...
UNP,196.26,46.957
UPS,102.73,6.790
UTX,115.54,35.252
WM,43.37,12.330


In [42]:
multi_fi.xs('ALLE', level=1)

Unnamed: 0_level_0,Price,Book Value
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Industrials,52.46,0.0


In [43]:
multi_fi.xs('Industrials', drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Book Value
Sector,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
Industrials,MMM,141.14,26.668
Industrials,ALLE,52.46,0.000
Industrials,APH,95.71,18.315
Industrials,AVY,48.20,15.616
Industrials,BA,132.41,19.870
Industrials,...,...,...
Industrials,UNP,196.26,46.957
Industrials,UPS,102.73,6.790
Industrials,UTX,115.54,35.252
Industrials,WM,43.37,12.330


In [44]:
multi_fi.xs('Industrials').xs('UPS')

Price         102.73
Book Value      6.79
Name: UPS, dtype: float64

In [45]:
multi_fi.xs(('Industrials', 'UPS'))

Price         102.73
Book Value      6.79
Name: (Industrials, UPS), dtype: float64