In [1]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 4.1 数据的聚合

### 4.1.1 最小值和最大值

4.1 Series 类的 min 方法和 max 方法

In [2]:
ser = pd.Series([0,1,2,3,4])
print('最小值', ser.min())
print('最大值', ser.max())

最小值 0
最大值 4


4.2 DataFrame 类的 min 方法

In [3]:
val = np.arange(0,9).reshape(3,3)
df = pd.DataFrame(val, index=list('edf'), columns=list('abc'))
df

Unnamed: 0,a,b,c
e,0,1,2
d,3,4,5
f,6,7,8


In [4]:
df.min(axis=0)

a    0
b    1
c    2
dtype: int32

In [5]:
df.min(axis=1)

e    0
d    3
f    6
dtype: int32

4.3 DataFrame 类的 max 方法

In [6]:
df.max(axis=0)

a    6
b    7
c    8
dtype: int32

In [7]:
df.max(axis=1)

e    2
d    5
f    8
dtype: int32

### 4.1.2 平均值、中位数和众数

4.4 DataFrame 类的 mean 方法

In [9]:
df.mean(axis=0)

a    3.0
b    4.0
c    5.0
dtype: float64

In [10]:
df.mean(axis=1)

e    1.0
d    4.0
f    7.0
dtype: float64

4.5 DataFrame 类的 median 方法

In [11]:
df = pd.DataFrame([[1,3,100,102,106,115,110]])
df

Unnamed: 0,0,1,2,3,4,5,6
0,1,3,100,102,106,115,110


In [12]:
df.median(axis=1)

0    102.0
dtype: float64

4.6 向 DataFrame 对象添加元素

In [13]:
df[7] = 120
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,3,100,102,106,115,110,120


4.7 DataFrame 类的 median 方法

In [14]:
df.median(axis=1)

0    104.0
dtype: float64

4.8 DataFrame 类的 mean 方法

In [15]:
df.mean(axis=1)

0    82.125
dtype: float64

4.9 DataFrame 类的 mode 方法

In [16]:
df = pd.DataFrame([[1,1,2,3,4]])
df

Unnamed: 0,0,1,2,3,4
0,1,1,2,3,4


In [17]:
df.mode(axis=1)

Unnamed: 0,0
0,1


4.10 DataFrame 类的 mode 方法

In [18]:
df = pd.DataFrame([[5,6,7,8]])
df.mode(axis=1)

Unnamed: 0,0,1,2,3
0,5,6,7,8


### 4.1.3 标准差

4.11 DataFrame 类的 mean 方法

In [19]:
table_a = pd.DataFrame([[152,151,150,147,181,190,187,149,196]])
table_a.mean(axis=1)

0    167.0
dtype: float64

In [20]:
table_b = pd.DataFrame([[162,161,165,147,161,175,187,175,170]])
table_b.mean(axis=1)

0    167.0
dtype: float64

4.12 将表 4.1 的平均值、每个值、数据的个数保存到相应变量中

In [21]:
mean = table_a.mean(axis=1).values[0]
mean

167.0

In [22]:
data = table_a.values[0]
data

array([152, 151, 150, 147, 181, 190, 187, 149, 196], dtype=int64)

In [23]:
n = table_a.shape[1]
n

9

4.13 求取标准差的函数

In [24]:
def standard_deviation(data, mean, n):
    """Print the population standard deviation of ``data``.

    The value printed is ``sqrt((1/n) * sum((x - mean) ** 2 for x in data))``,
    i.e. the standard deviation with divisor ``n`` (the same quantity as
    ``std(ddof=0)``).

    Args:
        data: Iterable of numeric values.
        mean: Mean of the values in ``data``.
        n: Number of values in ``data``; used as the divisor.

    Returns:
        None. The result is written to standard output.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError('n must be a positive number of values')

    squared_dev_sum = 0
    for num in data:
        squared_dev_sum += (num - mean) ** 2

    # Finalise once, after every value has been accumulated. Detecting the end
    # of the loop by comparing the current value with data[n-1] fires early
    # whenever the last value also appears earlier in the data, which corrupts
    # the running sum and prints several wrong results.
    std = np.sqrt((1/n) * squared_dev_sum)
    print(std)

4.14 使用 standard_deviation 函数计算表 4.1 的标准差

In [25]:
standard_deviation(data,mean,n)

19.60725489313699


4.15 使用 standard_deviation 函数计算表 4.2 的标准差

In [26]:
data = table_b.values[0]
mean = table_b.mean(axis=1)[0]
n = table_b.shape[1]

standard_deviation(data, mean, n)

10.739335795724674


4.16 DataFrame 类的 std 方法

In [27]:
print('table_a : ', table_a.std(axis=1, ddof=0)[0])
print('table_b : ', table_b.std(axis=1, ddof=0)[0])

table_a :  19.60725489313699
table_b :  10.739335795724674


4.17 std 方法和 ddof 参数

In [28]:
print('table_a : ', table_a.std(axis=1, ddof=1)[0])
print('table_b : ', table_b.std(axis=1, ddof=1)[0])

table_a :  20.796634343085422
table_b :  11.390785749894517


### 4.1.4 分位数

4.18 DataFrame 类的 quantile 方法

In [29]:
df = pd.DataFrame([[1,2,3,4,5,6,7,8,9,10]])
print(df.quantile(q=0.5, axis=1))
print(df.median(axis=1))

0    5.5
Name: 0.5, dtype: float64
0    5.5
dtype: float64


4.19 quantile 方法和 q 参数

In [30]:
df.quantile(q=[0.25,0.5,0.75], axis=1)

Unnamed: 0,0
0.25,3.25
0.5,5.5
0.75,7.75


4.20 quantile 方法的 interpolation 参数（linear）

In [31]:
df.quantile(q=0.25, axis=1, interpolation='linear')

0    3.25
Name: 0.25, dtype: float64

4.21 手动计算分位数

In [32]:
n = df.shape[1]
print(n)

10


In [33]:
# Split (n-1)*0.25 into its fractional part (frac) and its integer part (num).
# math.modf returns the pair as (fractional, integer), both as floats.
# `math` is imported in the notebook's import cell rather than here, so that
# all dependencies are declared in one place.
frac, num = math.modf((n-1)*0.25)
print('frac = ',frac)
print('num = ',num)

frac =  0.25
num =  2.0


4.22 手动计算分位数

In [34]:
i, j = 3, 4
print(i + (j-i)*frac)

3.25


4.23 interpolation 参数的 lower 和 higher

In [35]:
df.quantile(q=0.25, axis=1, interpolation='lower')

0    3
Name: 0.25, dtype: int64

In [36]:
df.quantile(q=0.25, axis=1, interpolation='higher')

0    4
Name: 0.25, dtype: int64

4.24 interpolation 参数的 nearest

In [37]:
df.quantile(q=0.25, axis=1, interpolation='nearest')

0    3
Name: 0.25, dtype: int64

4.25 interpolation 参数的 midpoint

In [38]:
df.quantile(q=0.25, axis=1, interpolation='midpoint')

0    3.5
Name: 0.25, dtype: float64

### 4.1.5 累积和与累积积

4.26 DataFrame 对象的创建

In [39]:
val = np.arange(0,9).reshape(3, 3)
df = pd.DataFrame(val, index=list('def'), columns=list('abc'))
df

Unnamed: 0,a,b,c
d,0,1,2
e,3,4,5
f,6,7,8


4.27 DataFrame 类的 cumsum 方法

In [40]:
df.cumsum(axis=0)

Unnamed: 0,a,b,c
d,0,1,2
e,3,5,7
f,9,12,15


4.28 DataFrame 类的 cumsum 方法（axis=1）

In [41]:
df.cumsum(axis=1)

Unnamed: 0,a,b,c
d,0,1,3
e,3,7,12
f,6,13,21


4.29 cumsum 方法的 skipna 参数

In [42]:
df.loc['e','a'] = np.nan
df.loc['f','b'] = np.nan

In [43]:
df

Unnamed: 0,a,b,c
d,0.0,1.0,2
e,,4.0,5
f,6.0,,8


In [44]:
df.cumsum(axis=1, skipna=False)

Unnamed: 0,a,b,c
d,0.0,1.0,3.0
e,,,
f,6.0,,


4.30 DataFrame 类的 cumprod 方法

In [45]:
df.cumprod(axis=1)

Unnamed: 0,a,b,c
d,0.0,0.0,0.0
e,,4.0,20.0
f,6.0,,48.0


In [46]:
df.cumprod(axis=0)

Unnamed: 0,a,b,c
d,0.0,1.0,2
e,,4.0,10
f,0.0,,80


In [47]:
df.cumprod(axis=0, skipna=False)

Unnamed: 0,a,b,c
d,0.0,1.0,2
e,,4.0,10
f,,,80


4.31 Series 类的 cumsum 方法和 cumprod 方法

In [48]:
ser = pd.Series([1,3,5])
ser.cumsum()

0    1
1    4
2    9
dtype: int64

In [49]:
ser.cumprod()

0     1
1     3
2    15
dtype: int64

### 4.1.6 分箱处理

4.32 Series 对象的创建

In [50]:
age = pd.Series([12,14,26,28,30,32,44,58])
age

0    12
1    14
2    26
3    28
4    30
5    32
6    44
7    58
dtype: int64

4.33 cut 函数

In [51]:
pd.cut(x=age, bins = [0,10,19,29,39,49,59])

0    (10, 19]
1    (10, 19]
2    (19, 29]
3    (19, 29]
4    (29, 39]
5    (29, 39]
6    (39, 49]
7    (49, 59]
dtype: category
Categories (6, interval[int64, right]): [(0, 10] < (10, 19] < (19, 29] < (29, 39] < (39, 49] < (49, 59]]

4.34 cut 函数的 bins 参数

In [52]:
pd.cut(x=age, bins=[0,19,60])

0     (0, 19]
1     (0, 19]
2    (19, 60]
3    (19, 60]
4    (19, 60]
5    (19, 60]
6    (19, 60]
7    (19, 60]
dtype: category
Categories (2, interval[int64, right]): [(0, 19] < (19, 60]]

4.35 cut 函数的 labels 参数

In [53]:
pd.cut(x=age, bins=[0,19,60], labels=['non-adult','adult'])

0    non-adult
1    non-adult
2        adult
3        adult
4        adult
5        adult
6        adult
7        adult
dtype: category
Categories (2, object): ['non-adult' < 'adult']

4.36 cut 函数的 right 参数

In [54]:
pd.cut(x=age, bins=[0,19,60], right=False)

0     [0, 19)
1     [0, 19)
2    [19, 60)
3    [19, 60)
4    [19, 60)
5    [19, 60)
6    [19, 60)
7    [19, 60)
dtype: category
Categories (2, interval[int64, left]): [[0, 19) < [19, 60)]

4.37 cut 函数和 value_counts 方法

In [55]:
age_cat = pd.cut(x=age, bins=[0,19,60], labels=['non-adult','adult'])
age_cat.value_counts()

adult        6
non-adult    2
Name: count, dtype: int64

4.38 qcut 函数

In [56]:
age = pd.Series([12,14,26,28,30,32,44,58])
pd.qcut(x=age, q=2)

0    (11.999, 29.0]
1    (11.999, 29.0]
2    (11.999, 29.0]
3    (11.999, 29.0]
4      (29.0, 58.0]
5      (29.0, 58.0]
6      (29.0, 58.0]
7      (29.0, 58.0]
dtype: category
Categories (2, interval[float64, right]): [(11.999, 29.0] < (29.0, 58.0]]

In [57]:
pd.qcut(x=age, q=4)

0    (11.999, 23.0]
1    (11.999, 23.0]
2      (23.0, 29.0]
3      (23.0, 29.0]
4      (29.0, 35.0]
5      (29.0, 35.0]
6      (35.0, 58.0]
7      (35.0, 58.0]
dtype: category
Categories (4, interval[float64, right]): [(11.999, 23.0] < (23.0, 29.0] < (29.0, 35.0] < (35.0, 58.0]]

4.39 qcut 函数的 q 参数

In [58]:
pd.qcut(x=age, q=[0,0.25,0.5,0.75,1])

0    (11.999, 23.0]
1    (11.999, 23.0]
2      (23.0, 29.0]
3      (23.0, 29.0]
4      (29.0, 35.0]
5      (29.0, 35.0]
6      (35.0, 58.0]
7      (35.0, 58.0]
dtype: category
Categories (4, interval[float64, right]): [(11.999, 23.0] < (23.0, 29.0] < (29.0, 35.0] < (35.0, 58.0]]

4.40 qcut 函数的 labels 参数

In [59]:
pd.qcut(x=age, q=2, labels=['younger','older'])

0    younger
1    younger
2    younger
3    younger
4      older
5      older
6      older
7      older
dtype: category
Categories (2, object): ['younger' < 'older']

4.41 qcut 函数的 retbins 参数

In [60]:
# With retbins=True, qcut returns a pair: the binned values and the bin edges.
# NOTE: the variable names are swapped relative to what they hold — `bins`
# receives the categorical Series and `labels` receives the array of bin edges
# (see the outputs of the next two cells).
bins, labels = pd.qcut(x=age, q=2, retbins=True)

In [61]:
bins

0    (11.999, 29.0]
1    (11.999, 29.0]
2    (11.999, 29.0]
3    (11.999, 29.0]
4      (29.0, 58.0]
5      (29.0, 58.0]
6      (29.0, 58.0]
7      (29.0, 58.0]
dtype: category
Categories (2, interval[float64, right]): [(11.999, 29.0] < (29.0, 58.0]]

In [62]:
labels

array([12., 29., 58.])

### 4.1.7 概括统计量

4.42 DataFrame 类的 describe 方法

In [63]:
np.random.seed(seed=1)
val = np.random.randint(0,100,size=9).reshape(3,3)
df = pd.DataFrame(val, columns=list('abc'))
df.describe()

Unnamed: 0,a,b,c
count,3.0,3.0,3.0
mean,41.666667,50.333333,31.0
std,35.232561,33.650161,35.930488
min,9.0,12.0,5.0
25%,23.0,38.0,10.5
50%,37.0,64.0,16.0
75%,58.0,69.5,44.0
max,79.0,75.0,72.0


4.43 使用 describe 方法处理缺失值

In [64]:
df.loc[1,'a'] = np.nan
df.describe()

Unnamed: 0,a,b,c
count,2.0,3.0,3.0
mean,58.0,50.333333,31.0
std,29.698485,33.650161,35.930488
min,37.0,12.0,5.0
25%,47.5,38.0,10.5
50%,58.0,64.0,16.0
75%,68.5,69.5,44.0
max,79.0,75.0,72.0


4.44 describe 方法的 percentiles 参数

In [65]:
df.describe(percentiles=[0.1,0.2,0.3])

Unnamed: 0,a,b,c
count,2.0,3.0,3.0
mean,58.0,50.333333,31.0
std,29.698485,33.650161,35.930488
min,37.0,12.0,5.0
10%,41.2,22.4,7.2
20%,45.4,32.8,9.4
30%,49.6,43.2,11.6
50%,58.0,64.0,16.0
max,79.0,75.0,72.0


4.45 对象类型的概括统计量

In [66]:
df['e'] = list('aba')
df['e'].describe()

count     3
unique    2
top       a
freq      2
Name: e, dtype: object

4.46 包含两个以上众数的 describe 方法

In [67]:
df.loc[3, 'e'] = 'b'
df['e'].describe()

count     4
unique    2
top       a
freq      2
Name: e, dtype: object

### 4.1.8 数据透视表

4.47 DataFrame 对象的创建

In [68]:
np.random.seed(seed=1)
scores = np.random.randint(70,100,size=25).reshape(5, 5)
subs = ['math','eng','scie','art','hist']
df = pd.DataFrame(scores, columns=subs)
df['club'] = ['soccer','tennis','tennis','soccer','tennis']
df['sex'] = list('MMFMF')
df

Unnamed: 0,math,eng,scie,art,hist,club,sex
0,75,81,82,78,79,soccer,M
1,81,75,85,70,86,tennis,M
2,71,82,77,83,98,tennis,F
3,76,95,88,90,75,soccer,M
4,88,90,81,98,80,tennis,F


4.48 pivot_table 函数

In [69]:
# Convert the 'club' column to a numeric type. errors='coerce' replaces every
# value that cannot be parsed as a number with NaN, so all club names
# ('soccer', 'tennis') become NaN here.
# NOTE(review): presumably done so that the string column does not get in the
# way of the mean-based pivot tables in the following cells — confirm. A later
# cell rebuilds df from scratch to get the club names back.
df['club'] = pd.to_numeric(df['club'], errors='coerce')
df

Unnamed: 0,math,eng,scie,art,hist,club,sex
0,75,81,82,78,79,,M
1,81,75,85,70,86,,M
2,71,82,77,83,98,,F
3,76,95,88,90,75,,M
4,88,90,81,98,80,,F


In [70]:
pd.pivot_table(df, index='sex', aggfunc="mean")

Unnamed: 0_level_0,art,eng,hist,math,scie
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,90.5,86.0,89.0,79.5,79.0
M,79.333333,83.666667,80.0,77.333333,85.0


4.49 pivot_table 函数的 values 参数

In [71]:
pd.pivot_table(df, index='sex',values='art')

Unnamed: 0_level_0,art
sex,Unnamed: 1_level_1
F,90.5
M,79.333333


In [72]:
pd.pivot_table(df, index='sex',values=['art','eng'])

Unnamed: 0_level_0,art,eng
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,90.5,86.0
M,79.333333,83.666667


4.50 pivot_table 函数的 columns 参数

In [73]:
np.random.seed(seed=1)
scores = np.random.randint(70,100,size=25).reshape(5, 5)
subs = ['math','eng','scie','art','hist']
df = pd.DataFrame(scores, columns=subs)
df['club'] = ['soccer','tennis','tennis','soccer','tennis']
df['sex'] = list('MMFMF')
df

Unnamed: 0,math,eng,scie,art,hist,club,sex
0,75,81,82,78,79,soccer,M
1,81,75,85,70,86,tennis,M
2,71,82,77,83,98,tennis,F
3,76,95,88,90,75,soccer,M
4,88,90,81,98,80,tennis,F


In [74]:
pd.pivot_table(df, index='club', columns='sex', aggfunc="mean")

Unnamed: 0_level_0,art,art,eng,eng,hist,hist,math,math,scie,scie
sex,F,M,F,M,F,M,F,M,F,M
club,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
soccer,,84.0,,88.0,,77.0,,75.5,,85.0
tennis,90.5,70.0,86.0,75.0,89.0,86.0,79.5,81.0,79.0,85.0


4.51 在 index 参数中指定多个标签

In [75]:
pd.pivot_table(df, index=['sex','club'], aggfunc="mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,art,eng,hist,math,scie
sex,club,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,tennis,90.5,86.0,89.0,79.5,79.0
M,soccer,84.0,88.0,77.0,75.5,85.0
M,tennis,70.0,75.0,86.0,81.0,85.0


4.52 在 aggfunc 参数中指定多个 NumPy 数组的函数

In [76]:
# Aggregate the 'eng' scores of each (sex, club) group with three statistics.
# The aggregations are named by string ('sum', 'max', 'min') instead of being
# passed as np.sum / np.max / np.min: recent pandas emits a FutureWarning for
# each NumPy callable given to aggfunc, while the string names select the same
# aggregations without the warning.
pd.pivot_table(
    df,
    index=['sex','club'],
    values='eng',
    aggfunc=['sum','max','min'])

  pd.pivot_table(
  pd.pivot_table(
  pd.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,max,min
Unnamed: 0_level_1,Unnamed: 1_level_1,eng,eng,eng
sex,club,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,tennis,172,90,82
M,soccer,176,95,81
M,tennis,75,75,75


4.53 在 aggfunc 参数中指定字典

In [77]:
# Use a different aggregation per column: the maximum of 'eng' and the mean of
# 'math' for each (sex, club) group. The aggregations are named by string
# rather than passed as np.max / np.mean, which makes recent pandas emit a
# FutureWarning; the results are the same.
pd.pivot_table(
    df,
    index=['sex','club'],
    aggfunc={'eng':'max','math':'mean'}
)

  pd.pivot_table(
  pd.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,eng,math
sex,club,Unnamed: 2_level_1,Unnamed: 3_level_1
F,tennis,90,79.5
M,soccer,95,75.5
M,tennis,75,81.0


4.54 pivot_table 函数的 margins 参数

In [78]:
df

Unnamed: 0,math,eng,scie,art,hist,club,sex
0,75,81,82,78,79,soccer,M
1,81,75,85,70,86,tennis,M
2,71,82,77,83,98,tennis,F
3,76,95,88,90,75,soccer,M
4,88,90,81,98,80,tennis,F


In [79]:
# Mean 'math' score per club; margins=True appends an 'All' row holding the
# mean over every row. aggfunc is given as the string "mean" (as in the earlier
# pivot_table cells) because passing np.mean makes recent pandas emit a
# FutureWarning.
pd.pivot_table(
    df,
    index='club',
    values='math',
    aggfunc="mean",
    margins=True
)

  pd.pivot_table(


Unnamed: 0_level_0,math
club,Unnamed: 1_level_1
soccer,75.5
tennis,80.0
All,78.2


4.55 pivot_table 函数的 margins 参数和 columns 参数

In [80]:
# Mean 'math' score per club (rows) and sex (columns); margins=True appends an
# 'All' row and an 'All' column with the overall means. aggfunc is given as the
# string "mean" (as in the earlier pivot_table cells) because passing np.mean
# makes recent pandas emit a FutureWarning.
pd.pivot_table(
    df,
    index='club',
    columns='sex',
    values='math',
    aggfunc="mean",
    margins=True
)

  pd.pivot_table(
  pd.pivot_table(
  pd.pivot_table(


sex,F,M,All
club,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
soccer,,75.5,75.5
tennis,79.5,81.0,80.0
All,79.5,77.333333,78.2


### 4.1.9 交叉表

4.56 DataFrame 对象的创建

In [81]:
np.random.seed(seed=1)
sex = np.random.choice(['M','F'], size=10)
eva = np.random.randint(70, 100, size=10)
city = np.random.choice(['Tokyo','Osaka','Sapporo'], size=10)
div = np.random.choice(['sales','hr','marketing','dev'], size=10)
dic = {'sex':sex, 'evaluation':eva, 'city':city, 'division':div}
df = pd.DataFrame(dic)
df

Unnamed: 0,sex,evaluation,city,division
0,F,86,Sapporo,sales
1,F,71,Osaka,dev
2,M,82,Sapporo,dev
3,M,77,Tokyo,hr
4,F,83,Tokyo,hr
5,F,98,Sapporo,dev
6,F,76,Tokyo,marketing
7,F,95,Osaka,sales
8,F,88,Sapporo,marketing
9,M,90,Sapporo,hr


4.57 crosstab 函数

In [82]:
pd.crosstab(index=df['sex'], columns=df['city'])

city,Osaka,Sapporo,Tokyo
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,2,3,2
M,0,2,1


4.58 crosstab 函数的 index 参数和 columns 参数

In [83]:
pd.crosstab(index=[df['sex'], df['city']], columns=df['division'])

Unnamed: 0_level_0,division,dev,hr,marketing,sales
sex,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,Osaka,1,0,0,1
F,Sapporo,1,0,1,1
F,Tokyo,0,1,1,0
M,Sapporo,1,1,0,0
M,Tokyo,0,1,0,0


4.59 crosstab 函数的 margins 参数

In [84]:
pd.crosstab(
    index=[df['sex'], df['city']],
    columns=df['division'],
    margins=True
)

Unnamed: 0_level_0,division,dev,hr,marketing,sales,All
sex,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,Osaka,1,0,0,1,2
F,Sapporo,1,0,1,1,3
F,Tokyo,0,1,1,0,2
M,Sapporo,1,1,0,0,2
M,Tokyo,0,1,0,0,1
All,,3,3,2,2,10


4.60 crosstab 函数的 aggfunc 参数

In [85]:
# Cross-tabulate sex against city, filling each cell with the mean
# 'evaluation' of the matching rows instead of a count. aggfunc is given as the
# string "mean" because passing np.mean makes recent pandas emit a
# FutureWarning; the resulting table is the same.
pd.crosstab(
    index=df['sex'],
    columns=df['city'],
    values=df['evaluation'],
    aggfunc="mean"
)

  pd.crosstab(


city,Osaka,Sapporo,Tokyo
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,83.0,90.666667,79.5
M,,86.0,77.0


## 4.2 数据的排序

### 4.2.1 基于标签的排序

4.61 Series 类的 sort_index 方法

In [86]:
ser = pd.Series([5,1,0,9,3],index=list('acdbe'))
ser.sort_index()

a    5
b    9
c    1
d    0
e    3
dtype: int64

4.62 sort_index 方法的 ascending 参数

In [87]:
ser = pd.Series([5,1,0,9,3],index=list('acdbe'))
ser.sort_index(ascending=False)

e    3
d    0
c    1
b    9
a    5
dtype: int64

4.63 DataFrame 对象的创建

In [88]:
np.random.seed(seed=1)
val = np.random.randint(0,10,size=9,).reshape(3,3)
df = pd.DataFrame(val, index=[2,0,1],columns=list('cba'))
df

Unnamed: 0,c,b,a
2,5,8,9
0,5,0,0
1,1,7,6


4.64 sort_index 方法的 axis 参数

In [89]:
df.sort_index(axis=0)

Unnamed: 0,c,b,a
0,5,0,0
1,1,7,6
2,5,8,9


In [90]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c
2,9,8,5
0,0,0,5
1,6,7,1


4.65 为索引标签添加缺失值

In [91]:
df.index = [2,np.nan,1]
df

Unnamed: 0,c,b,a
2.0,5,8,9
,5,0,0
1.0,1,7,6


4.66 sort_index 方法的 na_position 参数

In [92]:
df.sort_index(na_position='first')

Unnamed: 0,c,b,a
,5,0,0
1.0,1,7,6
2.0,5,8,9


### 4.2.2 基于元素的排序

4.67 Series 类的 sort_values 方法

In [93]:
ser = pd.Series([4,1,0,3,2])
ser.sort_values()

2    0
1    1
4    2
3    3
0    4
dtype: int64

4.68 sort_values 方法的 ascending 参数

In [94]:
ser = pd.Series(list("fnmdp"))
ser.sort_values(ascending=False)

4    p
1    n
2    m
0    f
3    d
dtype: object

4.69 DataFrame 对象的创建

In [95]:
np.random.seed(seed=10)
scores = np.random.randint(70,100,size=20,).reshape(5,4)
df = pd.DataFrame(scores, columns=['math','eng','chem','phys'])
df

Unnamed: 0,math,eng,chem,phys
0,79,99,74,85
1,70,87,97,98
2,95,99,86,99
3,87,96,78,79
4,70,80,78,92


4.70 DataFrame 类的 sort_values 方法的 by 参数

In [96]:
df.sort_values(by='math')

Unnamed: 0,math,eng,chem,phys
1,70,87,97,98
4,70,80,78,92
0,79,99,74,85
3,87,96,78,79
2,95,99,86,99


4.71 使用 sort_values 方法的 by 参数指定多个列

df.sort_values(by=['math','eng'])

4.72 sort_values 方法的 inplace 参数

In [97]:
df.sort_values(by='phys', ascending=False, inplace=True)
df

Unnamed: 0,math,eng,chem,phys
2,95,99,86,99
1,70,87,97,98
4,70,80,78,92
0,79,99,74,85
3,87,96,78,79


4.73 Series 类的 nlargest 方法

In [98]:
df['eng'].nlargest(3)

2    99
0    99
3    96
Name: eng, dtype: int32

4.74 Series 类的 nsmallest 方法

In [99]:
df['phys'].nsmallest(3)

3    79
0    85
4    92
Name: phys, dtype: int32

In [100]:
df['phys'].sort_values().head(3)

3    79
0    85
4    92
Name: phys, dtype: int32

4.75 DataFrame 类的 nlargest 方法

In [101]:
df.nlargest(3, 'eng')

Unnamed: 0,math,eng,chem,phys
2,95,99,86,99
0,79,99,74,85
3,87,96,78,79
