In [2]:
import pandas as pd
import numpy as np
# Do not print the "[rows x columns]" dimensions footer under DataFrame reprs.
pd.set_option("display.show_dimensions", False)
# Format floats with two significant digits in a field at least four characters wide.
pd.set_option("display.float_format", "{:4.2g}".format)

### 与`NaN`相关的函数

元素均为布尔值(True/False)的DataFrame对象称为布尔类型的DataFrame对象

In [3]:
# Fix the legacy global seed so the random integers below are reproducible.
np.random.seed(41)
# 10 rows x 3 columns of integers drawn from 0-9, labelled A, B and C.
df_int = pd.DataFrame(
    data=np.random.randint(0, 10, (10, 3)),
    columns=["A", "B", "C"],
)
# %C is a custom line magic defined outside this excerpt; judging from the output
# it renders the ";"-separated expressions side by side (the leading 5 presumably
# sets the gap between the tables - TODO confirm).
%C 5 df_int; df_int > 2

  df_int             df_int > 2      
----------     ----------------------
   A  B  C            A      B      C
0  0  3  2     0  False   True  False
1  0  1  3     1  False  False   True
2  9  7  5     2   True   True   True
3  8  3  3     3   True   True   True
4  2  6  0     4  False   True  False
5  4  6  9     5   True   True   True
6  3  8  4     6   True   True   True
7  7  6  1     7   True   True  False
8  5  2  1     8   True  False  False
9  5  3  2     9   True   True  False


In [4]:
# Shift column A by 10 in place. NOTE(review): this mutates df_int, so re-running
# the cell adds another 10 each time.
df_int["A"] += 10
# where() keeps cells whose condition is True and replaces every other cell with NaN.
df_nan = df_int.where(df_int > 2)
df_nan

Unnamed: 0,A,B,C
0,10,3.0,
1,10,,3.0
2,19,7.0,5.0
3,18,3.0,3.0
4,12,6.0,
5,14,6.0,9.0
6,13,8.0,4.0
7,17,6.0,
8,15,,
9,15,3.0,


In [5]:
%C df_int.dtypes; df_nan.dtypes
# Columns B and C now contain NaN, so their integer dtype was changed to float.

df_int.dtypes  df_nan.dtypes
-------------  -------------
A    int32     A      int32 
B    int32     B    float64 
C    int32     C    float64 
dtype: object  dtype: object


isnull(), notnull()用于判断DataFrame对象的每个元素是否为NaN

In [6]:
# Element-wise masks shown side by side: isnull() flags NaN cells, notnull() is its complement.
%C 4 df_nan.isnull(); df_nan.notnull()

   df_nan.isnull()           df_nan.notnull()  
----------------------    ---------------------
       A      B      C          A      B      C
0  False  False   True    0  True   True  False
1  False   True  False    1  True  False   True
2  False  False  False    2  True   True   True
3  False  False  False    3  True   True   True
4  False  False   True    4  True   True  False
5  False  False  False    5  True   True   True
6  False  False  False    6  True   True   True
7  False  False   True    7  True   True  False
8  False   True   True    8  True  False  False
9  False  False   True    9  True   True  False


count()函数返回每行或每列非NaN元素的个数

In [7]:
# count() tallies the non-NaN entries per column by default and per row with axis=1.
%C 4 df_nan.count(); df_nan.count(axis=1)

df_nan.count()    df_nan.count(axis=1)
--------------    --------------------
A    10           0    2              
B     8           1    2              
C     5           2    3              
dtype: int64      3    3              
                  4    2              
                  5    3              
                  6    3              
                  7    2              
                  8    1              
                  9    2              
                  dtype: int64        


dropna()删除包含NaN的行或列

In [8]:
# Show df_nan again as the reference for the dropna() examples that follow.
df_nan

Unnamed: 0,A,B,C
0,10,3.0,
1,10,,3.0
2,19,7.0,5.0
3,18,3.0,3.0
4,12,6.0,
5,14,6.0,9.0
6,13,8.0,4.0
7,17,6.0,
8,15,,
9,15,3.0,


In [9]:
# With default arguments dropna() removes every row that contains at least one NaN.
df_nan.dropna()

Unnamed: 0,A,B,C
2,19,7,5
3,18,3,3
5,14,6,9
6,13,8,4


In [10]:
df_nan.dropna(thresh=2)
# thresh is the minimum number of non-NaN values a row needs in order to be kept:
# rows with fewer than 2 non-NaN values (here only row 8) are dropped.

Unnamed: 0,A,B,C
0,10,3.0,
1,10,,3.0
2,19,7.0,5.0
3,18,3.0,3.0
4,12,6.0,
5,14,6.0,9.0
6,13,8.0,4.0
7,17,6.0,
9,15,3.0,


ffill()和bfill()分别使用NaN之前和之后的数据填充NaN

In [11]:
# ffill() copies the last valid value downwards; bfill() copies the next valid value upwards.
# NaN cells with no donor value (leading for ffill, trailing for bfill) stay NaN.
%C 5 df_nan.ffill(); df_nan.bfill()

 df_nan.ffill()      df_nan.bfill()
---------------     ---------------
    A    B    C         A    B    C
0  10    3  nan     0  10    3    3
1  10    3    3     1  10    7    3
2  19    7    5     2  19    7    5
3  18    3    3     3  18    3    3
4  12    6    3     4  12    6    9
5  14    6    9     5  14    6    9
6  13    8    4     6  13    8    4
7  17    6    4     7  17    6  nan
8  15    6    4     8  15    3  nan
9  15    3    4     9  15    3  nan


 插值填充

In [12]:
# Default interpolation treats the rows as equally spaced, e.g. B at row 1 becomes
# the midpoint of its neighbours 3 and 7.
df_nan.interpolate()

Unnamed: 0,A,B,C
0,10,3.0,
1,10,5.0,3.0
2,19,7.0,5.0
3,18,3.0,3.0
4,12,6.0,6.0
5,14,6.0,9.0
6,13,8.0,4.0
7,17,6.0,4.0
8,15,4.5,4.0
9,15,3.0,4.0


In [13]:
# Series with a missing middle value and unevenly spaced index labels (0, 8, 9),
# used to contrast the interpolation methods in the next cell.
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.0.
s = pd.Series([3, np.nan, 7], index=[0, 8, 9])
s

0      3
8    nan
9      7
dtype: float64

In [14]:
%C s.interpolate(); s.interpolate(method="index")
# The method argument selects the interpolation scheme.
# method="index" weights the NaN by how close its index label is to the
# neighbouring labels (8 is much nearer to 9 than to 0, hence 6.6 rather than 5).

s.interpolate()  s.interpolate(method="index")
---------------  -----------------------------
0      3         0      3                     
8      5         8    6.6                     
9      7         9      7                     
dtype: float64   dtype: float64               


使用字典参数对不同列的 NaN 填充不同的值

In [15]:
# Fill NaN per column from a dict: B gets -9, C gets -1; columns not listed are left alone.
print(df_nan.fillna({"B": -9, "C": -1}))

    A    B    C
0  10    3   -1
1  10   -9    3
2  19    7    5
3  18    3    3
4  12    6   -1
5  14    6    9
6  13    8    4
7  17    6   -1
8  15   -9   -1
9  15    3   -1


In [28]:
# Show df_nan again as the reference for the aggregation examples that follow.
df_nan

Unnamed: 0,A,B,C
0,10,3.0,
1,10,,3.0
2,19,7.0,5.0
3,18,3.0,3.0
4,12,6.0,
5,14,6.0,9.0
6,13,8.0,4.0
7,17,6.0,
8,15,,
9,15,3.0,


聚合方法的skipna参数默认为True,表示忽略NaN元素;若为False, 则包含NaN的行或列的运算结果为NaN

In [32]:
%C df_nan.sum(); df_nan.sum(skipna=False)
# Left: NaN cells are skipped (the default). Right: with skipna=False any column
# that contains a NaN sums to NaN.

 df_nan.sum()   df_nan.sum(skipna=False)
--------------  ------------------------
A   1.4e+02     A   1.4e+02             
B        42     B       nan             
C        24     C       nan             
dtype: float64  dtype: float64          


忽略含有NaN的行

In [34]:
df_nan.dropna().sum()
# dropna() first removes every row that contains a NaN; only the remaining
# complete rows are summed.

A     64
B     24
C     21
dtype: float64

df.combine_first(other) 使用另一个数据表填充df中的NaN元素.

In [36]:
# Donor table for combine_first(): random integers 0-9 in columns B and C,
# available only for row labels 1, 2, 8 and 9.
df_other = pd.DataFrame(
    data=np.random.randint(0, 10, (4, 2)),
    index=[1, 2, 8, 9],
    columns=["B", "C"],
)
df_other

Unnamed: 0,B,C
1,0,8
2,2,0
8,4,7
9,3,4


In [38]:
# combine_first() fills NaN cells of df_nan with the value df_other holds at the same
# row/column label; existing values are kept, and cells df_other does not cover stay NaN.
%C 6 df_nan; df_nan.combine_first(df_other)

     df_nan          df_nan.combine_first(df_other)
---------------      ------------------------------
    A    B    C          A    B    C               
0  10    3  nan      0  10    3  nan               
1  10  nan    3      1  10    0    3               
2  19    7    5      2  19    7    5               
3  18    3    3      3  18    3    3               
4  12    6  nan      4  12    6  nan               
5  14    6    9      5  14    6    9               
6  13    8    4      6  13    8    4               
7  17    6  nan      7  17    6  nan               
8  15  nan  nan      8  15    4    7               
9  15    3  nan      9  15    3    4               


### 改变DataFrame的形状

| 函数名     | 功能               |
| :------------| :------------------------------------|
| concat     | 拼接多块数据       |
| set_index   | 设置索引          |
| stack      | 将列索引转化为行索引  |
| reorder_levels| 设置索引级别的顺序   |
| sort_index   | 对索引排序         |
| melt       | 透视表的逆变换       |
| | |
| drop       | 删除行或列         |
| reset_index  | 将行索引转换为列     |
| unstack     | 将行索引转换为列索引 |
| swaplevel   | 交换索引中两个级别的顺序  |
| pivot      | 创建透视表           |
| assign     | 返回添加新列之后的数据   |









In [43]:
# Load the soil samples using the first CSV column as the row index, then keep
# only the five columns used in this section.
soils = pd.read_csv("Soils.csv", index_col=0)[
    ["Depth", "Contour", "Group", "pH", "N"]
]
soils.head()

Unnamed: 0,Depth,Contour,Group,pH,N
1,0-10,Top,1,5.4,0.19
2,0-10,Top,1,5.7,0.17
3,0-10,Top,1,5.1,0.26
4,0-10,Top,1,5.1,0.17
5,10-30,Top,2,5.1,0.16


In [44]:
# Average every remaining column within each (Depth, Contour) group.
soils_mean = soils.groupby(["Depth", "Contour"]).mean()
soils_mean.head()  # the row index is a two-level MultiIndex (Depth, Contour)

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,pH,N
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-10,Depression,9,5.4,0.18
0-10,Slope,5,5.5,0.22
0-10,Top,1,5.3,0.2
10-30,Depression,10,4.9,0.08
10-30,Slope,6,5.3,0.1


#### 添加删除列或行

DataFrame[colname] = values 添加新列

In [49]:
# eval() evaluates the expression string against the frame's columns; the result
# is stored on soils itself as the new column N_percent.
soils["N_percent"] = soils.eval("N * 100")
soils.head()

Unnamed: 0,Depth,Contour,Group,pH,N,N_percent
1,0-10,Top,1,5.4,0.19,19
2,0-10,Top,1,5.7,0.17,16
3,0-10,Top,1,5.1,0.26,26
4,0-10,Top,1,5.1,0.17,17
5,10-30,Top,2,5.1,0.16,16


assign()添加由关键字参数指定的列, 返回新的DataFrame对象, 原数据的内容不变

In [50]:
# assign() returns a new frame with the extra column pH2; soils itself is not modified.
print(soils.assign(pH2=soils.pH + 1).head())


   Depth Contour  Group   pH    N  N_percent  pH2
1   0-10     Top      1  5.4 0.19         19  6.4
2   0-10     Top      1  5.7 0.17         16  6.7
3   0-10     Top      1  5.1 0.26         26  6.1
4   0-10     Top      1  5.1 0.17         17  6.1
5  10-30     Top      2  5.1 0.16         16  6.1


append()方法用于添加行。下面对比循环调用append()逐个追加与一次性调用concat()拼接的耗时

In [20]:
def random_dataframe(n):
    """Lazily yield ``n`` random integer DataFrames with columns A, B and C.

    Each frame has between 10 and 19 rows (drawn first) filled with integers
    from 0 to 99 (drawn second), both taken from NumPy's global random state.
    """
    labels = ["A", "B", "C"]
    for _ in range(n):
        row_count = np.random.randint(10, 20)
        values = np.random.randint(0, 100, size=(row_count, len(labels)))
        yield pd.DataFrame(values, columns=labels)

# Materialise 1000 random frames up front so both timing cells below work on the same input.
df_list = list(random_dataframe(1000))

In [21]:
%%time
# Slow baseline for the timing comparison: the result frame is rebuilt on every
# iteration, so the accumulated rows are copied again and again.
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# so this cell only runs on older pandas versions.
df_res1 = pd.DataFrame([])
for df in df_list:
    df_res1 = df_res1.append(df)

Wall time: 910 ms


In [None]:
%%time
# Concatenate all frames along the rows in a single call.
df_res2 = pd.concat(df_list, axis=0)
# keys= adds an outer index level holding each frame's position in df_list ...
df_res3 = pd.concat(df_list, axis=0, keys=range(len(df_list)))
# ... so selecting key 30 should give back exactly the frame at df_list[30].
df_res3.loc[30].equals(df_list[30])

drop()函数删除指定标签对应的行或列

In [56]:
# axis=1 makes drop() remove the columns labelled N and Group instead of rows.
print(soils.drop(["N", "Group"], axis=1).head())

   Depth Contour   pH  N_percent
1   0-10     Top  5.4         19
2   0-10     Top  5.7         16
3   0-10     Top  5.1         26
4   0-10     Top  5.1         17
5  10-30     Top  5.1         16


#### 行索引与列之间相互转换

>行索引与列之间相互转换不应改变对应标签的值

In [57]:
# Show soils_mean as the reference for the reset_index()/set_index() examples that follow.
soils_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,pH,N
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-10,Depression,9,5.4,0.18
0-10,Slope,5,5.5,0.22
0-10,Top,1,5.3,0.2
10-30,Depression,10,4.9,0.08
10-30,Slope,6,5.3,0.1
10-30,Top,2,4.8,0.12
30-60,Depression,11,4.4,0.051
30-60,Slope,7,4.3,0.061
30-60,Top,3,4.2,0.08
60-90,Depression,12,4.2,0.04


reset_index 将行索引转化为列

In [58]:
# Move only the Contour level out of the row index into an ordinary column; Depth stays as the index.
print(soils_mean.reset_index(level="Contour").head())

          Contour  Group   pH    N
Depth                             
0-10   Depression      9  5.4 0.18
0-10        Slope      5  5.5 0.22
0-10          Top      1  5.3  0.2
10-30  Depression     10  4.9 0.08
10-30       Slope      6  5.3  0.1


set_index 将列转化为行索引

In [61]:
# append=True keeps the existing Depth/Contour index levels and adds Group as a third level.
print(soils_mean.set_index("Group", append=True).head())

                         pH    N
Depth Contour    Group          
0-10  Depression 9      5.4 0.18
      Slope      5      5.5 0.22
      Top        1      5.3  0.2
10-30 Depression 10     4.9 0.08
      Slope      6      5.3  0.1


#### 行和列的索引相互转换

In [64]:
# Show soils_mean as the reference for the stack()/unstack() examples that follow.
soils_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,pH,N
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-10,Depression,9,5.4,0.18
0-10,Slope,5,5.5,0.22
0-10,Top,1,5.3,0.2
10-30,Depression,10,4.9,0.08
10-30,Slope,6,5.3,0.1
10-30,Top,2,4.8,0.12
30-60,Depression,11,4.4,0.051
30-60,Slope,7,4.3,0.061
30-60,Top,3,4.2,0.08
60-90,Depression,12,4.2,0.04


stack()将列索引转化为行索引, unstack()将行索引转化为列索引

In [62]:
# unstack(1) moves row-index level 1 (Contour) into the columns; only the Group and pH blocks are shown.
print(soils_mean.unstack(1)[["Group", "pH"]].head())

             Group                   pH           
Contour Depression Slope Top Depression Slope  Top
Depth                                             
0-10             9     5   1        5.4   5.5  5.3
10-30           10     6   2        4.9   5.3  4.8
30-60           11     7   3        4.4   4.3  4.2
60-90           12     8   4        4.2   3.9  3.9


所有的索引转换到同一个轴上, 将得到一个Series对象

In [66]:
# stack() moves the column labels into the row index; with every label on one axis the result is a Series.
print(soils_mean.stack().head(10))

Depth  Contour          
0-10   Depression  Group      9
                   pH       5.4
                   N       0.18
       Slope       Group      5
                   pH       5.5
                   N       0.22
       Top         Group      1
                   pH       5.3
                   N        0.2
10-30  Depression  Group     10
dtype: float64


#### 交换索引的等级

reorder_levels()和swaplevel()交换指定轴的索引级别

In [68]:
# Show soils_mean as the reference for the swaplevel() example that follows.
soils_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,pH,N
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-10,Depression,9,5.4,0.18
0-10,Slope,5,5.5,0.22
0-10,Top,1,5.3,0.2
10-30,Depression,10,4.9,0.08
10-30,Slope,6,5.3,0.1
10-30,Top,2,4.8,0.12
30-60,Depression,11,4.4,0.051
30-60,Slope,7,4.3,0.061
30-60,Top,3,4.2,0.08
60-90,Depression,12,4.2,0.04


In [69]:
# Swap the two row-index levels so Contour comes first, then sort by the new index order.
print(soils_mean.swaplevel(0, 1).sort_index())

                  Group   pH     N
Contour    Depth                  
Depression 0-10       9  5.4  0.18
           10-30     10  4.9  0.08
           30-60     11  4.4 0.051
           60-90     12  4.2  0.04
Slope      0-10       5  5.5  0.22
           10-30      6  5.3   0.1
           30-60      7  4.3 0.061
           60-90      8  3.9 0.043
Top        0-10       1  5.3   0.2
           10-30      2  4.8  0.12
           30-60      3  4.2  0.08
           60-90      4  3.9 0.058


>关于索引控制DataFrame形状的小结:
* DataFrame的形状改变但是原始标签对对应的值不变
* 所有交换索引的方法返回一个新的数据表, 不会对原始数据表产生影响

#### 透视表

pivot()可以将DataFrame中的三列数据分别作为行索引, 列索引和元素值, 将这三列数据转化为二维表格

In [73]:
# Show soils_mean as the reference for the pivot() examples that follow.
soils_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Group,pH,N
Depth,Contour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-10,Depression,9,5.4,0.18
0-10,Slope,5,5.5,0.22
0-10,Top,1,5.3,0.2
10-30,Depression,10,4.9,0.08
10-30,Slope,6,5.3,0.1
10-30,Top,2,4.8,0.12
30-60,Depression,11,4.4,0.051
30-60,Slope,7,4.3,0.061
30-60,Top,3,4.2,0.08
60-90,Depression,12,4.2,0.04


In [72]:
# Turn the (Depth, Contour) index levels back into columns and keep only the
# four columns used by the pivot examples.
df = soils_mean.reset_index()[["Depth", "Contour", "pH", "N"]]
df

Unnamed: 0,Depth,Contour,pH,N
0,0-10,Depression,5.4,0.18
1,0-10,Slope,5.5,0.22
2,0-10,Top,5.3,0.2
3,10-30,Depression,4.9,0.08
4,10-30,Slope,5.3,0.1
5,10-30,Top,4.8,0.12
6,30-60,Depression,4.4,0.051
7,30-60,Slope,4.3,0.061
8,30-60,Top,4.2,0.08
9,60-90,Depression,4.2,0.04


In [74]:
# Reshape the long table into a grid: Depth becomes the row index, Contour the
# column labels, and pH the cell values.
# Keyword arguments are used because pandas 2.0 no longer accepts them positionally.
df_pivot_pH = df.pivot(index="Depth", columns="Contour", values="pH")
df_pivot_pH

Contour,Depression,Slope,Top
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-10,5.4,5.5,5.3
10-30,4.9,5.3,4.8
30-60,4.4,4.3,4.2
60-90,4.2,3.9,3.9


pivot()的三个参数index, columns, values 只支持指定一列数据, 如不指定values参数
就将剩余的列都当做元素的值, 得到多级索引

In [76]:
# Without values=, every remaining column (pH and N) is pivoted, which gives
# two-level column labels. Keywords are used because pandas 2.0 rejects positional arguments.
df.pivot(index="Depth", columns="Contour")

Unnamed: 0_level_0,pH,pH,pH,N,N,N
Contour,Depression,Slope,Top,Depression,Slope,Top
Depth,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0-10,5.4,5.5,5.3,0.18,0.22,0.2
10-30,4.9,5.3,4.8,0.08,0.1,0.12
30-60,4.4,4.3,4.2,0.051,0.061,0.08
60-90,4.2,3.9,3.9,0.04,0.043,0.058


In [None]:
# melt() can be regarded as the inverse transformation of pivot().

In [77]:
# Show the pivoted pH table that the melt() example below converts back to long form.
df_pivot_pH

Contour,Depression,Slope,Top
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-10,5.4,5.5,5.3
10-30,4.9,5.3,4.8
30-60,4.4,4.3,4.2
60-90,4.2,3.9,3.9


In [78]:
# Turn the Depth index back into an ordinary column so that melt() can use it as id_vars.
df_before_melt = df_pivot_pH.reset_index()
df_before_melt

Contour,Depth,Depression,Slope,Top
0,0-10,5.4,5.5,5.3
1,10-30,4.9,5.3,4.8
2,30-60,4.4,4.3,4.2
3,60-90,4.2,3.9,3.9


In [79]:
# Back to long form: Depth stays as the identifier column, the remaining columns
# are unpivoted into label/value pairs, and the value column is named pH.
df_after_melt = pd.melt(df_before_melt, id_vars="Depth", value_name="pH")
df_after_melt

Unnamed: 0,Depth,Contour,pH
0,0-10,Depression,5.4
1,10-30,Depression,4.9
2,30-60,Depression,4.4
3,60-90,Depression,4.2
4,0-10,Slope,5.5
5,10-30,Slope,5.3
6,30-60,Slope,4.3
7,60-90,Slope,3.9
8,0-10,Top,5.3
9,10-30,Top,4.8
