## Indexing DataFrames

### create sales data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    "month": ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
    "eggs":[47, 110, 221, 77, 132, 205],
    "salt":[12.0, 50.0, 89.0, 87.0, np.NaN, 60.0],
    "spam":[17, 31, 72, 20, 52, 55]
}
df = pd.DataFrame.from_dict(data)

In [3]:
df

Unnamed: 0,eggs,month,salt,spam
0,47,Jan,12.0,17
1,110,Feb,50.0,31
2,221,Mar,89.0,72
3,77,Apr,87.0,20
4,132,May,,52
5,205,Jun,60.0,55


In [4]:
df = df.set_index('month')

In [5]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [59]:
### indexing using square brackets
df['salt']['Jan']

12.0

In [60]:
### using column attribute and row label
df.eggs['Mar']

221

In [61]:
### using .loc accessor
df.loc['May','spam']

52

In [62]:
### using .iloc assessor
df.iloc[4,2]

52

In [14]:
### select only some columns
df[['salt','eggs']]

Unnamed: 0_level_0,salt,eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,47
Feb,50.0,110
Mar,89.0,221
Apr,87.0,77
May,,132
Jun,60.0,205


### slicing DataFrame

In [63]:
### select a column
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [6]:
type(df['eggs'])

pandas.core.series.Series

In [7]:
### slicing and indexing a series
df['eggs'][1:4]

month
Feb    110
Mar    221
Apr     77
Name: eggs, dtype: int64

In [65]:
### using .loc[...]
df.loc[:, 'eggs':'salt']

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
Feb,110,50.0
Mar,221,89.0
Apr,77,87.0
May,132,
Jun,205,60.0


In [66]:
df.loc['Jan':'Apr', :]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20


In [67]:
df.loc['Mar':'May', 'salt':'spam']

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [68]:
### using .iloc[..]
df.iloc[2:5, 1:]

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [9]:
df.iloc[:,0:3]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [70]:
df.iloc[2:5, 1:]

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [72]:
### using list
df.loc['Jan':'May', ['eggs', 'spam']]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52


In [73]:
df.iloc[[0,4,5], 0:2]

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
May,132,
Jun,205,60.0


In [74]:
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [75]:
type(df['eggs'])

pandas.core.series.Series

In [76]:
df[['eggs']]

Unnamed: 0_level_0,eggs
month,Unnamed: 1_level_1
Jan,47
Feb,110
Mar,221
Apr,77
May,132
Jun,205


In [77]:
type(df[['eggs']])

pandas.core.frame.DataFrame

### filter DataFrame

In [78]:
df.salt > 60

month
Jan    False
Feb    False
Mar     True
Apr     True
May    False
Jun    False
Name: salt, dtype: bool

In [79]:
### filtering with a bool series
df[df.salt>60]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mar,221,89.0,72
Apr,77,87.0,20


In [10]:
(df.salt>=50)&(df.eggs<200)

month
Jan    False
Feb     True
Mar    False
Apr     True
May    False
Jun    False
dtype: bool

In [80]:
df[(df.salt>=50)&(df.eggs<200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,110,50.0,31
Apr,77,87.0,20


In [81]:
df[(df.salt>=50)|(df.eggs<200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [82]:
df2 = df.dcopy()
df2['bacon']=[0,0,50,60,70,80]
df2

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


In [83]:
df2.loc[:, df2.all()] # all nonzeros

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [84]:
df2.loc[:, df2.any()]

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


In [87]:
df.loc[:, df.isnull().any()]

Unnamed: 0_level_0,salt
month,Unnamed: 1_level_1
Jan,12.0
Feb,50.0
Mar,89.0
Apr,87.0
May,
Jun,60.0


In [88]:
df.loc[:, df.notnull().all()]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52
Jun,205,55


In [89]:
df.dropna(how='any')

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
Jun,205,60.0,55


In [90]:
df.eggs[df.salt>55]

month
Mar    221
Apr     77
Jun    205
Name: eggs, dtype: int64

In [12]:
df.eggs[df.salt>55] += 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,226,89.0,72
Apr,82,87.0,20
May,132,,52
Jun,210,60.0,55


## Transforming DataFrames

### Pandas Vectorized function

In [5]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [6]:
df.floordiv(12) #除12後小數點後捨去

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


### Numpy vectorized function

In [15]:
np.floor_divide(df, 12)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3.0,1.0,1.0
Feb,9.0,4.0,2.0
Mar,18.0,7.0,6.0
Apr,6.0,7.0,1.0
May,11.0,,4.0
Jun,17.0,5.0,4.0


### plain python functions

In [19]:
def dozens(n):
    return n//12

In [20]:
df.apply(dozens)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [21]:
df.apply(lambda n: n//12) #lambda function

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [12]:
df['dozens_of_eggs'] = df.eggs.floordiv(12)

In [13]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,3
Feb,110,50.0,31,9
Mar,221,89.0,72,18
Apr,77,87.0,20,6
May,132,,52,11
Jun,205,60.0,55,17


In [15]:
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

In [18]:
df.index = df.index.str.upper()

In [19]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAN,47,12.0,17,3
FEB,110,50.0,31,9
MAR,221,89.0,72,18
APR,77,87.0,20,6
MAY,132,,52,11
JUN,205,60.0,55,17


In [20]:
df.index = df.index.map(str.lower)

In [21]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
jan,47,12.0,17,3
feb,110,50.0,31,9
mar,221,89.0,72,18
apr,77,87.0,20,6
may,132,,52,11
jun,205,60.0,55,17


In [25]:
df['salty_eggs'] = df.salt + df.dozens_of_eggs

In [26]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs,salty_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jan,47,12.0,17,3,15.0
feb,110,50.0,31,9,59.0
mar,221,89.0,72,18,107.0
apr,77,87.0,20,6,93.0
may,132,,52,11,
jun,205,60.0,55,17,77.0


## Index objects and labeled data

### pandas data structures

* key building blocks
  - indexes: sequence of labels
  - series: 1D array with index
  - DataFrame: 2D array with Series as columns
* indexes
  - immutable
  - homogenous in data type

In [22]:
prices = [10.7, 10.86, 10.74, 10.71, 10.79]
shares = pd.Series(prices)
print(shares)

0    10.70
1    10.86
2    10.74
3    10.71
4    10.79
dtype: float64


In [25]:
days = ['mon','tue','wed','thu', 'fri']
shares = pd.Series(prices, index=days)
print(shares.index)

Index(['mon', 'tue', 'wed', 'thu', 'fri'], dtype='object')


In [40]:
print(shares.index[2])

wed


In [26]:
print(shares.index[-2:])

Index(['thu', 'fri'], dtype='object')


In [27]:
shares.index.name="Weekdays"

In [28]:
shares

Weekdays
mon    10.70
tue    10.86
wed    10.74
thu    10.71
fri    10.79
dtype: float64

In [44]:
shares.index[2]="Wednesday"

TypeError: Index does not support mutable operations

In [45]:
shares.index[:4]= ['momdy','tuesday','wednesday','thursday']

TypeError: Index does not support mutable operations

In [49]:
shares.index= ['週一', '週二','週三','週四','週五']

In [53]:
shares

Weekdays
週一    10.70
週二    10.86
週三    10.74
週四    10.71
週五    10.79
dtype: float64

## MultiIndex in DataFrame

In [54]:
import pandas, io

In [76]:
data = io.StringIO('''Fruit,color,count,price
Apple, Red, 3, $1.29
Apple, Green, 9, $0.99
Pear, Red, 25, $2.59
Pear, Green, 26, $2.79
Lime, Green, 99, $0.39
''')
df_unindexed = pd.read_csv(data)
df_unindexed

Unnamed: 0,Fruit,color,count,price
0,Apple,Red,3,$1.29
1,Apple,Green,9,$0.99
2,Pear,Red,25,$2.59
3,Pear,Green,26,$2.79
4,Lime,Green,99,$0.39


In [77]:
df_mi = df_unindexed.set_index(['Fruit', 'color'])

In [78]:
print(df_mi)

              count   price
Fruit color                
Apple  Red        3   $1.29
       Green      9   $0.99
Pear   Red       25   $2.59
       Green     26   $2.79
Lime   Green     99   $0.39


In [79]:
print(df_mi.index)

MultiIndex(levels=[['Apple', 'Lime', 'Pear'], [' Green', ' Red']],
           labels=[[0, 0, 2, 2, 1], [1, 0, 1, 0, 0]],
           names=['Fruit', 'color'])


In [80]:
print(df_mi.index.name)

None


In [81]:
print(df_mi.index.names)

['Fruit', 'color']


In [82]:
df_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,count,price
Fruit,color,Unnamed: 2_level_1,Unnamed: 3_level_1
Apple,Red,3,$1.29
Apple,Green,9,$0.99
Pear,Red,25,$2.59
Pear,Green,26,$2.79
Lime,Green,99,$0.39


In [83]:
df_mi_sorted = df_mi.sort_index()

In [90]:
df_mi_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,count,price
Fruit,color,Unnamed: 2_level_1,Unnamed: 3_level_1
Apple,Green,9,$0.99
Apple,Red,3,$1.29
Lime,Green,99,$0.39
Pear,Green,26,$2.79
Pear,Red,25,$2.59


In [142]:
df_mi.loc['Pear']

Unnamed: 0_level_0,count,price
color,Unnamed: 1_level_1,Unnamed: 2_level_1
Red,25,$2.59
Green,26,$2.79


In [92]:
df_mi_sorted.loc['Apple':'Pear', :]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,price
Fruit,color,Unnamed: 2_level_1,Unnamed: 3_level_1
Apple,Green,9,$0.99
Apple,Red,3,$1.29
Lime,Green,99,$0.39
Pear,Green,26,$2.79
Pear,Red,25,$2.59


In [101]:
df_mi.loc[('Apple','Red')]

KeyError: ('Apple', 'Red')

In [103]:
df_mi.index.names

FrozenList(['Fruit', 'color'])

In [105]:
df_mi.loc['Apple']

Unnamed: 0_level_0,count,price
color,Unnamed: 1_level_1,Unnamed: 2_level_1
Red,3,$1.29
Green,9,$0.99


In [113]:
df_mi.loc[['Apple','Pear'],:]

UnsortedIndexError: 'MultiIndex Slicing requires the index to be fully lexsorted tuple len (1), lexsort depth (0)'

In [114]:
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
   ....:                     'joe': ['x', 'x', 'z', 'y'],
   ....:                     'jolie': np.random.rand(4)})

In [115]:
dfm

Unnamed: 0,jim,joe,jolie
0,0,x,0.63049
1,0,x,0.582891
2,1,z,0.800795
3,1,y,0.905763


In [126]:
dfm = dfm.set_index(['jim','joe'])

In [129]:
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.63049
0,x,0.582891
1,z,0.800795
1,y,0.905763


In [133]:
dfm.loc[(0,'x')]

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,jolie
jim,joe,Unnamed: 2_level_1
0,x,0.63049
0,x,0.582891
