In [4]:

import pandas as pd
import numpy as np
# Display configuration for the whole notebook: turn off the DataFrame
# dimensions footer and render floats with two significant digits in a
# field at least four characters wide.
pd.options.display.show_dimensions = False
pd.options.display.float_format = "{:4.2g}".format

## 下标存取

DataFrame对象的各种存取方法

| 方法                 | 说明                 |
|:-----------------------------|:-----------------------------|
|[col_label]     | 单个标签做下标, 返回Series对象  |
|[col_labels]     |标签列表做下标, 返回DataFrame对象  |
|[row_slice]     | 整数切片或标签切片, 返回指定范围内的行  |
|[row_bool_array]  | 选择布尔数组中True对应的行 |
|:|:|
|at[index_label,col_label]  | 返回行标签和列标签对应的单个值 |
|iat[index,col]  |返回行编号和列编号对应的单个值  |
|get_value(row_label,col_label)|选择行标签和列标签所对应的单个值 (pandas 0.21起弃用, 1.0中移除, 请改用at[])|
|get(col_label,default)| 访问列标签所对应的值,可以访问不存在的列 |
|lookup(row_labels,col_labels)|标签列表做参数, 返回每对标签对应的值 (pandas 1.2起弃用, 2.0中移除)|
|:|:|
|ix[index,col]  | 既可以使用整数下标也可以使用标签下标 (pandas 0.20起弃用, 1.0中移除, 请改用loc[]/iloc[])|
|:|:|
|loc[index,col]  |单个标签,标签列表,标签数组,布尔数组,标签切片 |
|iloc[index,col] |单个整数值,整数列表,整数数组,布尔数组,整数切片 |
|:|:  |
|head()  | 获取头部n行数据  |
|tail()  | 获得尾部n行数据 |
|query() | 通过表达式选择满足条件的行 |


In [9]:
# Seed NumPy's global RNG so the random integers are the same on every run,
# then wrap a 5x3 block of integers drawn from [0, 10) in a labelled frame.
np.random.seed(42)
df = pd.DataFrame(
    data=np.random.randint(0, 10, size=(5, 3)),
    index="r1 r2 r3 r4 r5".split(),
    columns="c1 c2 c3".split(),
)
df

Unnamed: 0,c1,c2,c3
r1,6,3,7
r2,4,6,9
r3,2,6,7
r4,4,3,7
r5,7,2,5


### `[]`操作符

整数切片不包括尾部, 标签切片包含尾部

In [10]:
# Side by side: an integer row slice (end position excluded) and a label
# row slice (end label included).
%C 5 df[2:4]; df["r2":"r4"]

   df[2:4]         df["r2":"r4"] 
--------------     --------------
    c1  c2  c3         c1  c2  c3
r3   2   6   7     r2   4   6   9
r4   4   3   7     r3   2   6   7
                   r4   4   3   7


布尔Series和布尔DataFrame都可以做下标: 布尔Series选出True所对应的行; 布尔DataFrame保留原有形状, 只保留True所对应的元素, 其余位置为NaN

In [13]:
# Build two boolean masks: a boolean Series from a single column and a
# boolean DataFrame from the whole frame.
%C 5 df.c1 > 4; df > 2

      df.c1 > 4                   df > 2        
---------------------     ----------------------
r1     True                      c1     c2    c3
r2    False               r1   True   True  True
r3    False               r2   True   True  True
r4    False               r3  False   True  True
r5     True               r4   True   True  True
Name: c1, dtype: bool     r5   True  False  True


In [14]:
# Index with each mask: the Series mask keeps only the rows where it is True;
# the DataFrame mask keeps the frame's shape and shows nan where it is False.
%C 5 df[df.c1 > 4]; df[df > 2]

df[df.c1 > 4]         df[df > 2]   
--------------     ----------------
    c1  c2  c3          c1   c2  c3
r1   6   3   7     r1    6    3   7
r5   7   2   5     r2    4    6   9
                   r3  nan    6   7
                   r4    4    3   7
                   r5    7  nan   5


### `.loc[]`和`.iloc[]`存取器

单个标签或标签对做`.loc[]`的下标

In [16]:
# A single row label returns that row as a Series; a (row, column) label
# pair returns one scalar.
%C 5 df.loc["r2"]; df.loc["r2","c2"]

     df.loc["r2"]          df.loc["r2","c2"]
----------------------     -----------------
c1    4                    6                
c2    6                                     
c3    9                                     
Name: r2, dtype: int32                      


标签列表做`.loc[]`的下标

In [18]:
# A list of row labels returns a DataFrame; adding a list of column labels
# restricts the columns as well.
%C 5 df.loc[["r2","r3"]]; df.loc[["r2","r3"],["c1","c2"]]

df.loc[["r2","r3"]]     df.loc[["r2","r3"],["c1","c2"]]
-------------------     -------------------------------
    c1  c2  c3              c1  c2                     
r2   4   6   9          r2   4   6                     
r3   2   6   7          r3   2   6                     


标签切片, 标签列表, 布尔数组混合做`.loc[]`的下标

In [21]:
# The row and column indexers may be of different kinds: a label slice with
# a label list, and a boolean Series with a label list.
%C 5 df.loc["r2":"r4", ["c2","c3"]]; df.loc[df.c1>2, ["c1","c2"]]

df.loc["r2":"r4", ["c2","c3"]]     df.loc[df.c1>2, ["c1","c2"]]
------------------------------     ----------------------------
    c2  c3                             c1  c2                  
r2   6   9                         r1   6   3                  
r3   6   7                         r2   4   6                  
r4   3   7                         r4   4   3                  
                                   r5   7   2                  


单个整数值, 整数列表, 整数数组, 布尔数组做`.iloc[]`的下标

In [28]:
# A single integer selects the ROW at that position (here r3), not a column;
# the column indexer is omitted, as it is in the first integer-list examples below.
df.iloc[2]

c1    2
c2    6
c3    7
Name: r3, dtype: int32

In [29]:
# Integer lists select rows by position; a second integer list selects
# columns by position.
%C 5 df.iloc[[2,4]]; df.iloc[[1,3]]; df.iloc[[1,3],[0,2]]

df.iloc[[2,4]]     df.iloc[[1,3]]     df.iloc[[1,3],[0,2]]
--------------     --------------     --------------------
    c1  c2  c3         c1  c2  c3         c1  c3          
r3   2   6   7     r2   4   6   9     r2   4   9          
r5   7   2   5     r4   4   3   7     r4   4   7          


In [30]:
# Row indexers here are a positional slice (end excluded) and a boolean mask;
# the mask is passed as a plain NumPy array via .values.
%C 5 df.iloc[2:4, [0,2]]; df.iloc[df.c1.values>2, [0,1]]

df.iloc[2:4, [0,2]]     df.iloc[df.c1.values>2, [0,1]]
-------------------     ------------------------------
    c1  c3                  c1  c2                    
r3   2   7              r1   6   3                    
r4   4   7              r2   4   6                    
                        r4   4   3                    
                        r5   7   2                    


`.ix[]`的下标既可以是整数也可以是标签 (注意: `.ix[]`已在pandas 0.20中弃用, 并在1.0中移除, 新代码应使用`.loc[]`或`.iloc[]`)

In [32]:
%C 5 df.ix[2:4, ["c1", "c3"]]; df.ix["r1":"r3", [0, 2]]

df.ix[2:4, ["c1", "c3"]]     df.ix["r1":"r3", [0, 2]]
------------------------     ------------------------
    c1  c3                       c1  c3              
r3   2   7                   r1   6   7              
r4   4   7                   r2   4   9              
                             r3   2   7              


### 获取单个值

In [35]:
# Rebuild the same example frame: re-seeding the global RNG makes the 5x3
# block of integers drawn from [0, 10) identical on every run.
np.random.seed(42)
df = pd.DataFrame(
    data=np.random.randint(0, 10, size=(5, 3)),
    index="r1 r2 r3 r4 r5".split(),
    columns="c1 c2 c3".split(),
)
df

Unnamed: 0,c1,c2,c3
r1,6,3,7
r2,4,6,9
r3,2,6,7
r4,4,3,7
r5,7,2,5


`at[]`和`iat[]`可以获得单个元素的值 (旧版本中的get_value()已在pandas 0.21中弃用, 并在1.0中移除, 可用`at[]`代替)

In [39]:
%C 3 df.at["r1", "c1"]; df.iat[0,1]; df.get_value("r1", "c3")

df.at["r1", "c1"]   df.iat[0,1]   df.get_value("r1", "c3")
-----------------   -----------   ------------------------
6                   3             7                       


lookup()可以获得多个标签对对应的值 (注意: lookup()已在pandas 1.2中弃用, 并在2.0中移除; 可先用get_indexer()把标签转换为整数位置, 再对底层数组取值)

In [41]:
# DataFrame.lookup() was deprecated in pandas 1.2 and removed in 2.0.
# Translate each label list into integer positions with get_indexer(), then
# fancy-index the underlying array to get one value per (row, column) pair.
df.values[df.index.get_indexer(["r1", "r1", "r1"]),
          df.columns.get_indexer(["c1", "c2", "c3"])]

array([6, 3, 7])

### 多级标签的存取

**loc[]和at[]下标**(*不是函数loc(),at()*)可以指定多级索引中每级索引上的标签

In [59]:
# Load the soil measurements: the first two CSV columns become a two-level
# row index, and the "Date" column is passed to parse_dates.
soil_df = pd.read_csv(
    "data/Soils-simple.csv",
    parse_dates=["Date"],
    index_col=[0, 1],
)
soil_df

Unnamed: 0_level_0,Unnamed: 1_level_0,pH,Dens,Ca,Conduc,Date,Name
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-10,Depression,5.4,0.98,11.0,1.5,2015-05-26,Lois
0-10,Slope,5.5,1.1,12.0,2.0,2015-04-30,Roy
0-10,Top,5.3,1.0,13.0,1.4,2015-05-21,Roy
10-30,Depression,4.9,1.4,7.5,5.5,2015-03-21,Lois
10-30,Slope,5.3,1.3,9.5,4.9,2015-02-06,Diana
10-30,Top,4.8,1.3,10.0,3.6,2015-04-11,Diana


某个一级索引的所有二级索引

In [45]:
# Select every row under the first-level label "10-30", keeping only the
# pH and Ca columns.
%C soil_df.loc["10-30", ["pH", "Ca"]]

soil_df.loc["10-30", ["pH", "Ca"]]
----------------------------------
             pH   Ca              
Contour                           
Depression  4.9  7.5              
Slope       5.3  9.5              
Top         4.8   10              


> 行的二级索引如果要做`loc[]`的下标, 必须同时给出一级索引的下标, 并且要用`np.s_[]`把一级索引和二级行索引一并转化为下标

所有一级索引中的某个二级索引

In [49]:
# np.s_[:, "Top"] builds the index tuple (slice(None), "Top"): every
# first-level label, second-level label "Top".
%C soil_df.loc[np.s_[:, "Top"], ["pH", "Ca"]]

soil_df.loc[np.s_[:, "Top"], ["pH", "Ca"]]
------------------------------------------
                pH   Ca                   
Depth Contour                             
0-10  Top      5.3   13                   
10-30 Top      4.8   10                   


某个一级索引中的某个二级索引

In [51]:
# Both index levels are given, so the single matching row comes back as a
# Series of the selected columns.
%C soil_df.loc[np.s_["10-30", "Top"], ["pH", "Ca"]]

soil_df.loc[np.s_["10-30", "Top"], ["pH", "Ca"]]
------------------------------------------------
pH    4.8                                       
Ca     10                                       
Name: (10-30, Top), dtype: object               


### `query()`方法

使用过滤条件获得布尔数组

In [65]:
# Boolean mask: rows whose pH exceeds 5 and whose Ca exceeds 11.
soil_df.pH.gt(5) & soil_df.Ca.gt(11)

Depth  Contour   
0-10   Depression    False
       Slope          True
       Top            True
10-30  Depression    False
       Slope         False
       Top           False
dtype: bool

In [66]:
# Use the same boolean mask as a row selector to keep only the matching rows.
soil_df.loc[soil_df.pH.gt(5) & soil_df.Ca.gt(11)]

Unnamed: 0_level_0,Unnamed: 1_level_0,pH,Dens,Ca,Conduc,Date,Name
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-10,Slope,5.5,1.1,12,2.0,2015-04-30,Roy
0-10,Top,5.3,1.0,13,1.4,2015-05-21,Roy


query()的参数是一个运算表达式字符串

In [53]:
# query() takes the filter as an expression string that refers to the
# columns by name.
soil_df.query("pH > 5 and Ca < 11")

Unnamed: 0_level_0,Unnamed: 1_level_0,pH,Dens,Ca,Conduc,Date,Name
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-10,Depression,5.4,0.98,11.0,1.5,2015-05-26,Lois
10-30,Slope,5.3,1.3,9.5,4.9,2015-02-06,Diana


表达式中如果含有其他全局变量或局部变量的值, 可以在变量名之前添加@

In [55]:
#%hide_output
# The thresholds live in ordinary Python variables; the "@" prefix inside the
# expression string refers to those variables instead of to columns.
pH_low, Ca_hi = 5, 11
soil_df.query("pH > @pH_low and Ca < @Ca_hi")

Unnamed: 0_level_0,Unnamed: 1_level_0,pH,Dens,Ca,Conduc,Date,Name
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-10,Depression,5.4,0.98,11.0,1.5,2015-05-26,Lois
10-30,Slope,5.3,1.3,9.5,4.9,2015-02-06,Diana
