## Indexing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame({'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
        'population': [11.3, 64.3, 81.3, 16.9, 64.9],
        'area': [30510, 671308, 357050, 41526, 244820],
        'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']})


## Setting country name as index

In [3]:
data = data.set_index('country')

In [4]:
data

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


## Resetting index

In [5]:
data.reset_index()

Unnamed: 0,country,population,area,capital
0,Belgium,11.3,30510,Brussels
1,France,64.3,671308,Paris
2,Germany,81.3,357050,Berlin
3,Netherlands,16.9,41526,Amsterdam
4,United Kingdom,64.9,244820,London


In [6]:
data

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


## Basic selection of rows and columns

### single column

In [7]:
data['area']

country
Belgium            30510
France            671308
Germany           357050
Netherlands        41526
United Kingdom    244820
Name: area, dtype: int64

### multiple column

In [8]:
data[['area','capital']]

Unnamed: 0_level_0,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,30510,Brussels
France,671308,Paris
Germany,357050,Berlin
Netherlands,41526,Amsterdam
United Kingdom,244820,London


In [9]:
data['France':'Netherlands']

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam


### Indexing with loc and iloc

When using `[]` like above, you can only select from one axis at once (rows or columns, not both). For more advanced indexing, you have some extra attributes:
    
* `loc`: selection by label
* `iloc`: selection by position

These methods index the different dimensions of the frame:

* `df.loc[row_indexer, column_indexer]`
* `df.iloc[row_indexer, column_indexer]`

## Selecting single element

In [10]:
data.loc['Germany','area']

357050

In [11]:
data

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


In [12]:
data.loc['Belgium':'Netherlands',['population','area']]

Unnamed: 0_level_0,population,area
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,11.3,30510
France,64.3,671308
Germany,81.3,357050
Netherlands,16.9,41526


In [13]:
data.loc['Belgium':'France','population':'area']

Unnamed: 0_level_0,population,area
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,11.3,30510
France,64.3,671308


In [14]:
data.iloc[0:2,1:3]

Unnamed: 0_level_0,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,30510,Brussels
France,671308,Paris


## Boolean Indexing(filtering)

In [16]:
data['area']>100000

country
Belgium           False
France             True
Germany            True
Netherlands       False
United Kingdom     True
Name: area, dtype: bool

In [17]:
data[data['area']>100000]

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
United Kingdom,64.9,244820,London


In [18]:
data['density'] = data['population']*1000000 / data['area']
data

Unnamed: 0_level_0,population,area,capital,density
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Belgium,11.3,30510,Brussels,370.37037
France,64.3,671308,Paris,95.783158
Germany,81.3,357050,Berlin,227.699202
Netherlands,16.9,41526,Amsterdam,406.973944
United Kingdom,64.9,244820,London,265.092721


## Select the capital and the population column of those countries where the density is larger than 300

In [20]:
data.loc[data['density']>300,['population','area']]

Unnamed: 0_level_0,population,area
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,11.3,30510
Netherlands,16.9,41526


## Add a column 'density_ratio' with the ratio of the density to the mean density

In [23]:
data['density_ratio'] = data['density']/data['density'].mean()
data

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755
France,64.3,671308,Paris,95.783158,0.350618
Germany,81.3,357050,Berlin,227.699202,0.833502
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744
United Kingdom,64.9,244820,London,265.092721,0.970382


## Change the capital of the UK to Cambridge

In [25]:
data['United Kingdom','capital'] = 'Cambridge'
data

Unnamed: 0_level_0,population,area,capital,density,density_ratio,"(United Kingdom, capital)"
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755,Cambridge
France,64.3,671308,Paris,95.783158,0.350618,Cambridge
Germany,81.3,357050,Berlin,227.699202,0.833502,Cambridge
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744,Cambridge
United Kingdom,64.9,244820,London,265.092721,0.970382,Cambridge


## Select all countries whose population density is between 100 and 300 people/km²

In [26]:
data[(data['density']>100) & (data['density']<300)]

Unnamed: 0_level_0,population,area,capital,density,density_ratio,"(United Kingdom, capital)"
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Germany,81.3,357050,Berlin,227.699202,0.833502,Cambridge
United Kingdom,64.9,244820,London,265.092721,0.970382,Cambridge
