# INDEXING AND SELECTING

In [1]:
import pandas as pd
import numpy as np

# Sample data: one entry per column label, each holding seven readings
# (one for every row of the frame built below).
data = dict(
    AN=(200.9, 102, 38, 645, 24, 675, 2143),
    BT=(32.7, 244, 79, 743, 123, 765, 32),
    CP=(259.2, 22, 144.4, 213, 65, 876, 43),
    DM=(92, 462, 29.6, 132, 65, 97, 324),
    EY=(291, 64.2, 11, 274, 56, 86, 243),
    FI=(765, 21.4, 76, 453, 134, 876, 654),
    GM=(12, 45, 10, 123, 65, 987, 4),
    HV=(3, 91, 633, 33, 765, 31, 75),
)

# Row labels are the odd years 2001 through 2013, stored as strings.
df = pd.DataFrame(
    data,
    index=[str(year) for year in range(2001, 2014, 2)],
)
df

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2001,200.9,32.7,259.2,92.0,291.0,765.0,12,3
2003,102.0,244.0,22.0,462.0,64.2,21.4,45,91
2005,38.0,79.0,144.4,29.6,11.0,76.0,10,633
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31
2013,2143.0,32.0,43.0,324.0,243.0,654.0,4,75


## Basic indexing

In [7]:
# Indexing the frame with a single column label returns that column as a Series.
df['BT']

2001     32.7
2003    244.0
2005     79.0
Name: BT, dtype: float64

In [8]:
# Indexing with a list of column labels returns a DataFrame holding just those columns.
df[['CP', 'DM', 'EY']]

Unnamed: 0,CP,DM,EY
2001,259.2,92.0,291.0
2003,22.0,462.0,64.2
2005,144.4,29.6,11.0


In [9]:
# 'FF' is not a column of df, so bracket indexing raises KeyError (intentional demonstration).
df['FF']

KeyError: 'FF'

In [10]:
# DataFrame.get returns the supplied default ([] here) instead of raising when the column is missing.
df.get('FF', [])

[]

## Accessing columns as attributes using the dot operator

In [11]:
# Attribute-style access: the 'EY' column comes back as a Series, just as with df['EY'].
df.EY

2001    291.0
2003     64.2
2005     11.0
Name: EY, dtype: float64

In [12]:
# A missing column accessed as an attribute raises AttributeError, not KeyError (intentional demonstration).
df.FF

AttributeError: 'DataFrame' object has no attribute 'FF'

## Range slicing

In [23]:
# A slice inside [] selects rows by position: start at the second-to-last row
# and walk backwards two rows at a time.
df[-2::-2]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2003,102.0,244.0,22.0,462.0,64.2,21.4,45,91


In [8]:
# Pick the 'BT' column first, then take positions 1 through 3 of the resulting Series (stop is exclusive).
df['BT'][1:4]

2003    244.0
2005     79.0
2007    743.0
Name: BT, dtype: float64

In [9]:
# Same result in the opposite order: slice rows at positions 1 through 3 first, then pick the 'BT' column.
df[1:4]['BT']

2003    244.0
2005     79.0
2007    743.0
Name: BT, dtype: float64

## Label, integer, and mixed indexing

### Label-oriented indexing

In [14]:
# .loc selects by index label (the labels here are strings); one label returns that row as a Series.
df.loc['2003']

AN    102.0
BT    244.0
CP     22.0
DM    462.0
EY     64.2
FI     21.4
GM     45.0
HV     91.0
Name: 2003, dtype: float64

In [15]:
# A list of labels returns a DataFrame containing those rows.
df.loc[['2005', '2009']]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2005,38.0,79.0,144.4,29.6,11.0,76.0,10,633
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765


In [16]:
# Label slices with .loc include both endpoints ('2007' and '2013').
df.loc['2007':'2013']

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31
2013,2143.0,32.0,43.0,324.0,243.0,654.0,4,75


In [18]:
# '2021' is not in the index, so .loc raises KeyError (intentional demonstration).
df.loc['2021']

KeyError: 'the label [2021] is not in the [index]'

In [22]:
# A row label plus a column label returns a single scalar value.
df.loc['2007', 'CP']

213.0

In [23]:
# Chained form: fetch row '2007' as a Series, then index that Series by 'CP'.
# Yields the same value as df.loc['2007', 'CP'].
df.loc['2007']['CP']

213.0

In [24]:
# A bare ':' selects every row, so this returns the whole 'EY' column.
df.loc[:, 'EY']

2001    291.0
2003     64.2
2005     11.0
2007    274.0
2009     56.0
2011     86.0
2013    243.0
Name: EY, dtype: float64

### Selection using a Boolean array

In [26]:
# Boolean mask on the row axis: keep the rows whose 'FI' value exceeds 200, with all columns.
df.loc[df['FI']>200,:]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2001,200.9,32.7,259.2,92.0,291.0,765.0,12,3
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31
2013,2143.0,32.0,43.0,324.0,243.0,654.0,4,75


In [27]:
# Boolean mask on the column axis: keep the columns whose value in row '2011' is below 700.
df.loc[:, df.loc['2011']<700]

Unnamed: 0,AN,DM,EY,HV
2001,200.9,92.0,291.0,3
2003,102.0,462.0,64.2,91
2005,38.0,29.6,11.0,633
2007,645.0,132.0,274.0,33
2009,24.0,65.0,56.0,765
2011,675.0,97.0,86.0,31
2013,2143.0,324.0,243.0,75


### Integer-oriented indexing

In [29]:
# .iloc selects by integer position; position 2 is the third row (label '2005').
df.iloc[2]

AN     38.0
BT     79.0
CP    144.4
DM     29.6
EY     11.0
FI     76.0
GM     10.0
HV    633.0
Name: 2005, dtype: float64

In [31]:
# A list of positions returns those rows as a DataFrame.
df.iloc[[0, 4, 5]]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2001,200.9,32.7,259.2,92.0,291.0,765.0,12,3
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31


In [33]:
# Positional slice with a negative step. The stop of -9 lies before the first of
# the seven rows, so the slice simply runs back to the start of the frame.
df.iloc[-2:-9:-2]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2003,102.0,244.0,22.0,462.0,64.2,21.4,45,91


In [34]:
# Rows at positions 3-5 and columns at positions 2-3 (positional slices exclude the stop).
df.iloc[3:6,2:4]

Unnamed: 0,CP,DM
2007,213.0,132.0
2009,65.0,65.0
2011,876.0,97.0


### The .iat and .at operators

In [36]:
# Time a scalar lookup by position through the general-purpose .iloc indexer.
%timeit df.iloc[4,1]

The slowest run took 5.73 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 179 µs per loop


In [37]:
# Same positional scalar lookup through .iat, which only handles single values;
# the recorded timing is far lower than the .iloc timing.
%timeit df.iat[4,1]

The slowest run took 10.07 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.65 µs per loop


In [38]:
# Time a scalar lookup by label through the general-purpose .loc indexer.
%timeit df.loc['2005', 'DM']

1000 loops, best of 3: 204 µs per loop


In [39]:
# Same label-based scalar lookup through .at, which only handles single values;
# the recorded timing is far lower than the .loc timing.
%timeit df.at['2005', 'DM']

The slowest run took 34.14 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 11.6 µs per loop


### Mixed indexing with the .ix operator (deprecated in pandas 0.20, removed in pandas 1.0)

In [40]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0.
# A row label is looked up with `.loc`, which returns the same Series.
df.loc['2007']

AN    645.0
BT    743.0
CP    213.0
DM    132.0
EY    274.0
FI    453.0
GM    123.0
HV     33.0
Name: 2007, dtype: float64

In [42]:
# `.ix` (removed in pandas 1.0) treated this integer as a position because the
# index holds strings. `.iloc` makes the positional lookup explicit.
df.iloc[2]

AN     38.0
BT     79.0
CP    144.4
DM     29.6
EY     11.0
FI     76.0
GM     10.0
HV    633.0
Name: 2005, dtype: float64

In [43]:
# Label slice via `.loc` (replacement for the removed `.ix`); both endpoints are included.
df.loc['2005':'2011']

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2005,38.0,79.0,144.4,29.6,11.0,76.0,10,633
2007,645.0,743.0,213.0,132.0,274.0,453.0,123,33
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2011,675.0,765.0,876.0,97.0,86.0,876.0,987,31


In [45]:
# A list of integers selects rows by position; `.iloc` replaces the removed `.ix`,
# which fell back to positions here because the index labels are strings.
df.iloc[[1, 4, 6]]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2003,102.0,244.0,22.0,462.0,64.2,21.4,45,91
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2013,2143.0,32.0,43.0,324.0,243.0,654.0,4,75


In [47]:
# Positional slices on both axes via `.iloc` (replacement for the removed `.ix`):
# rows at positions 2-3 and columns at positions 0-2, stop values excluded.
df.iloc[2:4, 0:3]

Unnamed: 0,AN,BT,CP
2005,38.0,79.0,144.4
2007,645.0,743.0,213.0


In [48]:
# Boolean-mask row selection via `.loc` (replacement for the removed `.ix`):
# keep the rows whose 'GM' value is below 100.
df.loc[df['GM'] < 100]

Unnamed: 0,AN,BT,CP,DM,EY,FI,GM,HV
2001,200.9,32.7,259.2,92.0,291.0,765.0,12,3
2003,102.0,244.0,22.0,462.0,64.2,21.4,45,91
2005,38.0,79.0,144.4,29.6,11.0,76.0,10,633
2009,24.0,123.0,65.0,65.0,56.0,134.0,65,765
2013,2143.0,32.0,43.0,324.0,243.0,654.0,4,75


## MultiIndexing

In [2]:
# Load the medal counts from a CSV file (relative path) and display the resulting frame.
dfMedals = pd.read_csv('../data/medals.csv')
dfMedals

Unnamed: 0,Year,Medal Type,US,Canada,England,Australia
0,2001,Gold,278,188,39,44
1,2001,Silver,324,235,82,66
2,2001,Bronze,446,399,100,15
3,2002,Gold,301,298,42,66
4,2002,Silver,378,222,228,88
5,2002,Bronze,502,245,165,173
6,2003,Gold,321,276,86,163
7,2003,Silver,322,263,76,184
8,2003,Bronze,423,165,97,136
9,2004,Gold,298,146,43,152


In [57]:
# Setting two columns as the index produces a two-level MultiIndex ('Year', 'Medal Type').
dfMedalsIndexed = dfMedals.set_index(['Year', 'Medal Type'])
# Keep a handle on the index object so its levels can be inspected in later cells.
indexMedal = dfMedalsIndexed.index
indexMedal

MultiIndex(levels=[[2001, 2002, 2003, 2004, 2005, 2006], ['Bronze', 'Gold', 'Silver']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5], [1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]],
           names=['Year', 'Medal Type'])

In [56]:
# Display the re-indexed frame: 'Year' and 'Medal Type' now label the rows instead of being ordinary columns.
dfMedalsIndexed

Unnamed: 0_level_0,Unnamed: 1_level_0,US,Canada,England,Australia
Year,Medal Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001,Gold,278,188,39,44
2001,Silver,324,235,82,66
2001,Bronze,446,399,100,15
2002,Gold,301,298,42,66
2002,Silver,378,222,228,88
2002,Bronze,502,245,165,173
2003,Gold,321,276,86,163
2003,Silver,322,263,76,184
2003,Bronze,423,165,97,136
2004,Gold,298,146,43,152


In [58]:
# Level 0 is the outer level ('Year'); this returns its label for every row.
indexMedal.get_level_values(0)

Int64Index([2001, 2001, 2001, 2002, 2002, 2002, 2003, 2003, 2003, 2004, 2004,
            2004, 2005, 2005, 2005, 2006, 2006, 2006],
           dtype='int64', name='Year')

In [59]:
# Level 1 is the inner level ('Medal Type'); this returns its label for every row.
indexMedal.get_level_values(1)

Index(['Gold', 'Silver', 'Bronze', 'Gold', 'Silver', 'Bronze', 'Gold',
       'Silver', 'Bronze', 'Gold', 'Silver', 'Bronze', 'Gold', 'Silver',
       'Bronze', 'Gold', 'Silver', 'Bronze'],
      dtype='object', name='Medal Type')

In [74]:
# Partial key on the outer level: every row for Year 2004, indexed by the remaining
# 'Medal Type' level. `.loc` replaces `.ix`, which was removed in pandas 1.0.
dfMedalsIndexed.loc[2004]

Unnamed: 0_level_0,US,Canada,England,Australia
Medal Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gold,298,146,43,152
Silver,256,184,173,148
Bronze,422,226,75,152


In [75]:
# Full (Year, Medal Type) key selects a single row as a Series. `.loc` replaces the
# removed `.ix`; the explicit tuple plus ':' makes clear that both values address
# the row index rather than a row and a column.
dfMedalsIndexed.loc[(2005, 'Silver'), :]

US           257
Canada       251
England       61
Australia     83
Name: (2005, Silver), dtype: int64

In [81]:
# Slicing a MultiIndex by tuples needs a sorted index, so sort first and then take
# the inclusive label range. `sort_index(level=0)` replaces `sortlevel(0)` and
# `.loc` replaces `.ix`; both older APIs were deprecated in pandas 0.20 and have
# since been removed.
dfMedalsIndexed.sort_index(level=0).loc[(2002, 'Silver'):(2006, 'Silver')]

Unnamed: 0_level_0,Unnamed: 1_level_0,US,Canada,England,Australia
Year,Medal Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002,Silver,378,222,228,88
2003,Bronze,423,165,97,136
2003,Gold,321,276,86,163
2003,Silver,322,263,76,184
2004,Bronze,422,226,75,152
2004,Gold,298,146,43,152
2004,Silver,256,184,173,148
2005,Bronze,419,295,51,57
2005,Gold,311,248,83,73
2005,Silver,257,251,61,83


In [82]:
# A list of full (Year, Medal Type) tuples selects exactly those rows.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
dfMedalsIndexed.loc[[(2003, 'Gold'), (2005, 'Silver'), (2006, 'Bronze')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,US,Canada,England,Australia
Year,Medal Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003,Gold,321,276,86,163
2005,Silver,257,251,61,83
2006,Bronze,345,265,93,72
