In [105]:
import pandas as pd
import numpy as np

In [78]:
pd.__version__


'1.2.0'

# Đối tượng Series và cách tạo

Series là mảng 1 chiều gồm dữ liệu được đánh chỉ số

Cú pháp hàm tạo rút gọn: pd.Series(data=None, index=None)

với:
- data là đối tượng chứa dữ liệu của Series (có kiểu array-like, Iterable, dict, hay scalar value)
- index là chuỗi giá trị có cùng độ dài với data (có kiểu array-like hay Index (1d))



Nó có thể được tạo từ 1 danh sách:

In [108]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

Series bao gồm cả chuỗi giá trị và chuỗi chỉ số mà chúng ta có thể truy cập bằng các thuộc tính values và index

In [109]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [110]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [111]:
data[1]

0.5

In [83]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [112]:
# data can be a list or NumPy array, index defaults to an integer sequence 
pd.Series([2,4,6])
pd.Series(np.arange(2,7,2))

0    2
1    4
2    6
dtype: int64

In [85]:
# data can be a scalar, which is repeated to fill the specified index 
pd.Series(5,index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [113]:
# data can be a dictionary, in which index defaults to the dictionary keys
pd.Series({2:'a',1:'b',3:'c'})

2    a
1    b
3    c
dtype: object

In [114]:
# index can be explicitly set if a different result is preferred 
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3,2])

3    c
2    a
dtype: object

# Series như là mảng NumPy

Chỉ số của Series không cần phải là số nguyên mà có thể chứa giá trị có kiểu bất kỳ

In [88]:
# use strings as indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd']) 
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [89]:
# use noncontiguous or nonsequential indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7]) 
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

# Series như là từ điển đặc biệt

• Từ điển là cấu trúc ánh xạ các khoá bất kỳ sang các giá trị bất kỳ

• Series là cấu trúc ánh xạ các khoá định kiểu sang các giá trị định kiểu

• Thông tin kiểu của Series làm nó thực thi hiệu quả hơn nhiều so với Từ điển trong một số tác vụ

Có thể tạo Series từ Từ điển

In [115]:
population_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860}
population = pd.Series(population_dict)
population


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

Truy nhập giá trị thông qua khoá tương tự như Từ điển

In [91]:
population['California']

38332521

Khác với Từ điển, Series hỗ trợ các thao tác mảng như “cắt lát” (slicing)

In [92]:
population['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

# DataFrame

DataFrame giống với mảng NumPy 2 chiều có tên cột và chỉ số hàng

Có thể xem DataFrame như chuỗi gồm các Series chia sẻ cùng chỉ số

In [117]:
 # First, construct a Series listing the area of 3 states
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297} 
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
dtype: int64

In [118]:
 # Second, construct a Series listing the population of 3 states
population_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
dtype: int64

In [95]:
 # Final, use a dictionary to create a DataFrame object containing this information
states = pd.DataFrame({'population': population, 'area': area}) 
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [119]:
states.index

Index(['California', 'Texas', 'New York'], dtype='object')

In [120]:
states.columns

Index(['population', 'area'], dtype='object')

In [121]:
states['area']

California    423967
Texas         695662
New York      141297
Name: area, dtype: int64

In [99]:
# a single column DataFrame can be constructed from a single Series
pd.DataFrame(population, columns=['population'])


Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127


In [100]:
# Any list of dictionaries can be made into a DataFrame. 
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [101]:
# create a DataFrame with any specified column and index names from a 2D NumPy array
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'],index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.631567,0.314247
b,0.390522,0.914862
c,0.643343,0.169972


In [126]:
# if some keys in the dictionary are missing, Pandas will fill them in with NaN values
pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])
# pd.DataFrame([{'a':1,'b':2,'c':np.NaN },{'a':np.NaN,'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [104]:
# construct an Index from a list of integers
ind = pd.Index([2, 5, 7, 11])
ind

Int64Index([2, 5, 7, 11], dtype='int64')

In [42]:
# can use standard Python indexing notation to retrieve values or slices
ind[1]

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [43]:
# can use standard Python indexing notation to retrieve values or slices
ind[1]

3

In [44]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [45]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [46]:
ind[1] = 0

TypeError: Index does not support mutable operations

In [48]:
indA = pd.Index([1, 3, 5, 7, 9]) 
indB = pd.Index([2, 3, 5, 7, 11]) 
indA & indB # intersection

Int64Index([3, 5, 7], dtype='int64')

In [49]:
indA | indB # union

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [50]:
indA ^ indB # symmetric difference

Int64Index([1, 2, 9, 11], dtype='int64')

In [325]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [326]:
# Series object provides a mapping from a collection of keys to a collection of values
data['b']

0.5

In [327]:
# use dictionary-like Python expressions and methods to examine the keys/indices and values
'a' in data

True

In [328]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [329]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [330]:
# extend a Series by assigning to a new index value
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [331]:
# slicing by explicit index 
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [332]:
# slicing by implicit integer index 
data[0:2]

a    0.25
b    0.50
dtype: float64

In [333]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [334]:
# fancy indexing 
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [336]:
# so sánh hàm tạo với fancy indexing
data1 = pd.Series(data, index=['a','e'])
data1

a    0.25
e    1.25
dtype: float64

In [340]:
# so sánh masking với fancy indexing
data2 = data[(data.index=='a') | (data.index=='e')]
data2

a    0.25
e    1.25
dtype: float64

Các thuộc tính định vị trí (indexer): loc & iloc

In [241]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [242]:
data.loc[1]

'a'

In [243]:
data.loc[1:3]

1    a
3    b
dtype: object

In [244]:
data.iloc[1]

'b'

In [245]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [246]:
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,'New York': 19651127})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [247]:
# individual Series of the DataFrame can be accessed via dictionary-style indexing of the column name
data['area']

California    423967
Texas         695662
New York      141297
Name: area, dtype: int64

In [248]:
# we can use attribute-style access with column names that are strings
data.area

California    423967
Texas         695662
New York      141297
Name: area, dtype: int64

In [249]:
data.pop

<bound method NDFrame.pop of               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127>

In [250]:
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [251]:
# this dictionary-style syntax can also be used to modify the object, in this case to add a new column
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [252]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02]])

In [253]:
# We can examine the raw underlying data array using the values attribute
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02]])

In [254]:
# transpose the full DataFrame to swap rows and columns
data.T

Unnamed: 0,California,Texas,New York
area,423967.0,695662.0,141297.0
pop,38332520.0,26448190.0,19651130.0
density,90.41393,38.01874,139.0767


In [255]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [256]:
data['area']

California    423967
Texas         695662
New York      141297
Name: area, dtype: int64

In [257]:
# Using the iloc indexer, we can index the underlying array as if it is a simple NumPy array
data.iloc[:2, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193


In [258]:
data.loc[:'Texas', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193


In [259]:
# in the loc indexer we can combine masking and fancy indexing
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746


In [260]:
# modifying values is done in the similar way with NumPy
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [261]:
# slicing refers to rows
data['Texas':'New York']

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [262]:

data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [263]:
# Such slices can also refer to rows by number rather than by index
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [264]:
# direct masking operations are also interpreted row-wise rather than column-wise
data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746


In [265]:
# defining a simple Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [266]:
# defining a simple DataFrame
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A','B','C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [267]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [268]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [269]:
# suppose we are combining two different data sources: area & population
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')

In [270]:
# then compute the population density
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [271]:
# any missing values are filled in with NaN by default for any of Python’s built-in arithmetic expressions
A = pd.Series([2, 4, 6], index=[0, 1, 2]) 
B = pd.Series([1, 3, 5], index=[1, 2, 3]) 
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [272]:
# we can modify the fill value using appropriate object methods in place of the operators
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

Index alignment in DataFrame

A similar type of alignment takes place for both columns and indices when you are performing operations on DataFrames

In [273]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [274]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [275]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [276]:
# we’ll fill with the mean of all values in A (which we compute by first stacking the rows of A)
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [277]:
# find the difference of a two-dimensional array and one of its rows
A = rng.randint(10, size=(3, 4)) 
A

array([[3, 8, 2, 4],
       [2, 6, 4, 8],
       [6, 1, 3, 8]])

In [278]:
A - A[0] # subtraction is applied row-wise

array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])

In [279]:
# In Pandas, the convention similarly operates row-wise by default
df = pd.DataFrame(A, columns=list('QRST')) 
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,-2,2,4
2,3,-7,1,4


In [280]:
# If you would instead like to operate column-wise, you can use the object methods mentioned earlier, while specifying the axis keyword
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [281]:
# Note that these DataFrame/Series operations, like the operations discussed before, will automatically align indices between the two elements
df

Unnamed: 0,Q,R,S,T
0,3,8,2,4
1,2,6,4,8
2,6,1,3,8


In [282]:
halfrow = df.iloc[0, ::2]
halfrow

Q    3
S    2
Name: 0, dtype: int64

In [283]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,2.0,
2,3.0,,1.0,


In [284]:
index = [('California', 2000), ('California', 2010),
                    ('New York', 2000), ('New York', 2010),
                    ('Texas', 2000), ('Texas', 2010)]

In [285]:
populations = [33871648, 37253956,
                          18976457, 19378102,
                          20851820, 25145561]

In [286]:
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [287]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [288]:
# We can create a multi-index from the tuples as follows
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [289]:
pop = pop.reindex(index) 
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [290]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [291]:
pop_df = pop.unstack() 
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [292]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [293]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [294]:
type(pop)

pandas.core.series.Series

In [295]:
pop_df = pd.DataFrame({'total': pop,'under18': [9267089, 9284094,4687374, 4318033,5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [296]:
# compute the fraction of people under 18 by year
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [297]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [298]:
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [299]:
df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.551737,0.869688
a,2,0.269804,0.166352
b,1,0.913341,0.992192
b,2,0.926699,0.17109


In [300]:
data = {('California', 2000): 33871648,
                    ('California', 2010): 37253956,
                    ('Texas', 2000): 20851820,
                    ('Texas', 2010): 25145561,
                    ('New York', 2000): 18976457,
                    ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [301]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [302]:
type(pd.MultiIndex) 

type

In [303]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [304]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [305]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [306]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]])




MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

MultiIndex for columns

In [307]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1) 
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns) 
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,47.0,37.0,54.0,36.5,41.0,35.9
2013,2,41.0,39.1,32.0,35.7,29.0,35.9
2014,1,47.0,35.1,27.0,37.3,27.0,37.0
2014,2,16.0,38.7,36.0,37.3,40.0,36.3


In [308]:
# index the top-level column by the person’s name and get a full Data Frame containing just that person’s information
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,54.0,36.5
2013,2,32.0,35.7
2014,1,27.0,37.3
2014,2,36.0,37.3


In [309]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [310]:
# We can access single elements by indexing with multiple terms
pop['California', 2000]

33871648

In [311]:
# MultiIndex also supports partial indexing, or indexing just one of the levels in the index
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [312]:
# Partial slicing is available as well, as long as the MultiIndex is sorted
pop.loc['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [313]:
# With sorted indices, we can perform partial indexing on lower levels by passing an empty slice in the first index
pop[:, 2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [314]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,47.0,37.0,54.0,36.5,41.0,35.9
2013,2,41.0,39.1,32.0,35.7,29.0,35.9
2014,1,47.0,35.1,27.0,37.3,27.0,37.0
2014,2,16.0,38.7,36.0,37.3,40.0,36.3


In [315]:
# we can recover Guido’s heart rate data
health_data['Guido', 'HR']

year  visit
2013  1        54.0
      2        32.0
2014  1        27.0
      2        36.0
Name: (Guido, HR), dtype: float64

In [316]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,47.0,37.0
2013,2,41.0,39.1


In [317]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        47.0
      2        41.0
2014  1        47.0
      2        16.0
Name: (Bob, HR), dtype: float64

In [318]:
# average out the measurements in the two visits each year
data_mean = health_data.mean(level='year') 
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,44.0,38.05,43.0,36.1,35.0,35.9
2014,31.5,36.9,31.5,37.3,33.5,36.65


In [319]:
# By further making use of the axis keyword, we can take the mean among levels on the columns as well
data_mean.mean(axis=1, level='type')

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,40.666667,36.683333
2014,32.166667,36.95
