# pandas数据处理

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# 先了解数据
# df.head()
# df.info()
# df.desciption()

In [None]:
# 原始数据最基本的操作一定包括如下三步
# 1. 空值的处理
# 2. 重复值的处理
# 3. 异常值的处理

## 1、删除重复元素

使用duplicated()函数检测重复的行，返回元素为布尔类型的Series对象，每个元素对应一行，如果该行不是第一次出现，则元素为True

- 使用drop_duplicates()函数删除重复的行

- 使用duplicated()函数查看重复的行

In [2]:
df = DataFrame(data={
    "name":["lucy","tom","jack","tony","mery","rose","black"],
    "python":np.random.randint(0,100,size=(7)),
    "java":np.random.randint(0,100,size=7),
    "php":np.random.randint(0,100,size=7)
})
df

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
4,53,mery,27,3
5,68,rose,92,56
6,58,black,3,3


In [5]:
df.loc[7] = df.loc[0].copy()
df.loc[4] = df.loc[0].copy()
df

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
4,77,lucy,39,67
5,68,rose,92,56
6,58,black,3,3
7,77,lucy,39,67


In [10]:
# 查询重复行
# 次函数并不存在axis参数，意味着重复值只在行内查找
df.loc[df.duplicated(keep='last')]

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
4,77,lucy,39,67


In [None]:
df.drop([0,4])

In [12]:
df.drop_duplicates(keep='last')

Unnamed: 0,java,name,php,python
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
5,68,rose,92,56
6,58,black,3,3
7,77,lucy,39,67


In [15]:
df.loc[4,"name"] = "mery"
df.loc[6,"name"] = "tom"
df.loc[7,"name"] = "black"
df

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
4,77,mery,39,67
5,68,rose,92,56
6,58,tom,3,3
7,77,black,39,67


In [17]:
# 查询python、java、php成绩相同的行
df.loc[df.duplicated(subset=["python","java","php"])]

Unnamed: 0,java,name,php,python
4,77,mery,39,67
7,77,black,39,67


In [18]:
# 查询名字重复的数据
df.loc[df.duplicated(subset=["name"])]

Unnamed: 0,java,name,php,python
6,58,tom,3,3


## 2. 映射

In [None]:
fillna()

In [None]:
replace属于fillna的高级版本

映射的含义：创建一个映射关系列表，把values元素和一个特定的标签或者字符串绑定

包含三种操作：

- replace()函数：替换元素（DataFrame\Series的函数)
- 最重要：map()函数：新建一列(Series的函数)
- rename()函数：替换索引(DataFrame的函数)

In [20]:
# 直接替换字符串
df.replace(to_replace='tom', value='TOM')

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
1,22,TOM,89,84
2,22,jack,18,24
3,23,tony,13,5
4,77,mery,39,67
5,68,rose,92,56
6,58,TOM,3,3
7,77,black,39,67


In [21]:
# 替换数字
df.replace(to_replace=77, value=100)

Unnamed: 0,java,name,php,python
0,100,lucy,39,67
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
4,100,mery,39,67
5,68,rose,92,56
6,58,tom,3,3
7,100,black,39,67


In [22]:
# 使用列表替换
df.replace(to_replace=["lucy","tom","jack"], value=["LUCY","MERY","JACK"])

Unnamed: 0,java,name,php,python
0,77,LUCY,39,67
1,22,MERY,89,84
2,22,JACK,18,24
3,23,tony,13,5
4,77,mery,39,67
5,68,rose,92,56
6,58,MERY,3,3
7,77,black,39,67


In [23]:
# 使用字典替换
map_dic = {
    "lucy":"LUCY",
    "mery":"MERY",
    "tom":"TOM"
}
df.replace(to_replace=map_dic)

Unnamed: 0,java,name,php,python
0,77,LUCY,39,67
1,22,TOM,89,84
2,22,jack,18,24
3,23,tony,13,5
4,77,MERY,39,67
5,68,rose,92,56
6,58,TOM,3,3
7,77,black,39,67


In [24]:
df["oldname"] = df.name

In [25]:
df

Unnamed: 0,java,name,php,python,oldname
0,77,lucy,39,67,lucy
1,22,tom,89,84,tom
2,22,jack,18,24,jack
3,23,tony,13,5,tony
4,77,mery,39,67,mery
5,68,rose,92,56,rose
6,58,tom,3,3,tom
7,77,black,39,67,black


In [26]:
df.replace(to_replace="lucy", value="LUCY")

Unnamed: 0,java,name,php,python,oldname
0,77,LUCY,39,67,LUCY
1,22,tom,89,84,tom
2,22,jack,18,24,jack
3,23,tony,13,5,tony
4,77,mery,39,67,mery
5,68,rose,92,56,rose
6,58,tom,3,3,tom
7,77,black,39,67,black


In [27]:
# 使用字典处理某一列
df.replace(to_replace={"name":"lucy"}, value="LUCY")

Unnamed: 0,java,name,php,python,oldname
0,77,LUCY,39,67,lucy
1,22,tom,89,84,tom
2,22,jack,18,24,jack
3,23,tony,13,5,tony
4,77,mery,39,67,mery
5,68,rose,92,56,rose
6,58,tom,3,3,tom
7,77,black,39,67,black


In [29]:
# 正则替换
# to_replace 使用正则表达式
# regex 必须设置为True
df.replace(to_replace=r't.*', regex=True, value="ContainT")

Unnamed: 0,java,name,php,python,oldname
0,77,lucy,39,67,lucy
1,22,ContainT,89,84,ContainT
2,22,jack,18,24,jack
3,23,ContainT,13,5,ContainT
4,77,mery,39,67,mery
5,68,rose,92,56,rose
6,58,ContainT,3,3,ContainT
7,77,black,39,67,black


### 1) replace()函数：替换元素

使用replace()函数，对values进行替换操作

#### Series替换操作

- 单值替换
    - 普通替换
    - 字典替换
- 多值替换
    - 列表替换
    - 字典替换（推荐）

Series参数说明：

- method：对指定的值使用相邻的值填充
- limit：设定填充次数

In [31]:
name = df.name
name

0     lucy
1      tom
2     jack
3     tony
4     mery
5     rose
6      tom
7    black
Name: name, dtype: object

In [32]:
name.replace(method='ffill', to_replace='tom')

0     lucy
1     lucy
2     jack
3     tony
4     mery
5     rose
6     rose
7    black
Name: name, dtype: object

In [36]:
# method不能再DataFrame当中使用
df.replace(method='ffill', to_replace='tom', axis=1)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-36-6f7088ec44a8>", line 1, in <module>
    df.replace(method='ffill', to_replace='tom', axis=1)
  File "C:\Anaconda3\lib\site-packages\pandas\core\generic.py", line 4494, in replace
    limit)
  File "C:\Anaconda3\lib\site-packages\pandas\core\generic.py", line 76, in _single_replace
    .format(to_replace, method, type(self).__name__))
TypeError: cannot replace ['tom'] with method ffill on a DataFrame

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1828, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'TypeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

TypeError: cannot replace ['tom'] with method ffill on a DataFrame

#### DataFrame替换操作

- 单值替换
    - 普通替换
    - 按列指定单值替换{列标签：替换值}
    
    
- 多值替换
    - 列表替换
    - 单字典替换（推荐）

**注意**：DataFrame中，无法使用method和limit参数

============================================

练习19：

    假设张三李四的课表里有满分的情况，老师认为是作弊，把所有满分的情况（包括150,300分）都记0分，如何实现？
    
    name
    python
    java
    C
============================================

In [38]:
df.drop("oldname", axis=1, inplace=True)

In [41]:
df.replace(to_replace=[22,77], value=[100,100])

Unnamed: 0,java,name,php,python
0,100,lucy,39,67
1,100,tom,89,84
2,100,jack,18,24
3,23,tony,13,5
4,100,mery,39,67
5,68,rose,92,56
6,58,tom,3,3
7,100,black,39,67


In [42]:
df.replace(to_replace={
    22:110,
    77:110
})

Unnamed: 0,java,name,php,python
0,110,lucy,39,67
1,110,tom,89,84
2,110,jack,18,24
3,23,tony,13,5
4,110,mery,39,67
5,68,rose,92,56
6,58,tom,3,3
7,110,black,39,67


### 2) map()函数：新建一列

In [None]:
map 是Series的函数，所以通常被用来对某一列进行整体的映射处理


- map()可以使用字典映射新一列数据
- map()中可以使用lambd表达式
- map()中可以使用方法，可以是自定义的方法


**注意** 
- map()中不能使用sum之类的函数，for循环
- map(字典) 字典的键要足以匹配所有的数据，否则出现NaN

In [48]:
df.name.values

array(['lucy', 'tom', 'jack', 'tony', 'mery', 'rose', 'lilei', 'black'],
      dtype=object)

In [46]:
df.loc[6, "name"] = "lilei"
df

Unnamed: 0,java,name,php,python
0,77,lucy,39,67
1,22,tom,89,84
2,22,jack,18,24
3,23,tony,13,5
4,77,mery,39,67
5,68,rose,92,56
6,58,lilei,3,3
7,77,black,39,67


In [61]:
# 使用字典map
map_dic = {
    'lucy':"北京", 
    'tom':"上海", 
    'jack':"北京", 
    'tony':"上海", 
    'mery':"上海", 
    'rose':"北京"
}

In [62]:
df["address"] = df.name.map(map_dic)

In [63]:
df

Unnamed: 0,java,name,php,python,address
0,77,lucy,39,67,北京
1,22,tom,89,84,上海
2,22,jack,18,24,北京
3,23,tony,13,5,上海
4,77,mery,39,67,上海
5,68,rose,92,56,北京
6,58,lilei,3,3,
7,77,black,39,67,


In [70]:
def map_name(name):
    return map_dic.get(name,name)

In [73]:
map_name("dancer")

'dancer'

In [76]:
df["address"] = df["name"].map(map_name)

In [77]:
df

Unnamed: 0,java,name,php,python,address
0,77,lucy,39,67,北京
1,22,tom,89,84,上海
2,22,jack,18,24,北京
3,23,tony,13,5,上海
4,77,mery,39,67,上海
5,68,rose,92,56,北京
6,58,lilei,3,3,lilei
7,77,black,39,67,black


In [78]:
df

Unnamed: 0,java,name,php,python,address
0,77,lucy,39,67,北京
1,22,tom,89,84,上海
2,22,jack,18,24,北京
3,23,tony,13,5,上海
4,77,mery,39,67,上海
5,68,rose,92,56,北京
6,58,lilei,3,3,lilei
7,77,black,39,67,black


In [79]:
# 使用函数map
def score_5(score):
    if score > 90:
        return "A"
    elif score > 80:
        return "B"
    elif score > 70:
        return "C"
    elif score >= 60:
        return "D"
    else:
        return "E"

In [82]:
df.java = df.java.map(score_5)

In [84]:
df["php_5"] = df.php.map(score_5)

In [85]:
df

Unnamed: 0,java,name,php,python,address,php_5
0,C,lucy,39,67,北京,E
1,E,tom,89,84,上海,B
2,E,jack,18,24,北京,E
3,E,tony,13,5,上海,E
4,C,mery,39,67,上海,E
5,D,rose,92,56,北京,A
6,E,lilei,3,3,lilei,E
7,C,black,39,67,black,E


In [88]:
# lambda 表达式
df.name = df.name.map(lambda x: x + '1班')

In [89]:
df

Unnamed: 0,java,name,php,python,address,php_5
0,C,lucy1班,39,67,北京,E
1,E,tom1班,89,84,上海,B
2,E,jack1班,18,24,北京,E
3,E,tony1班,13,5,上海,E
4,C,mery1班,39,67,上海,E
5,D,rose1班,92,56,北京,A
6,E,lilei1班,3,3,lilei,E
7,C,black1班,39,67,black,E


### 3)transform()和map()类似

In [90]:
df.name.transform(lambda x: x + '学员')

0     lucy1班学员
1      tom1班学员
2     jack1班学员
3     tony1班学员
4     mery1班学员
5     rose1班学员
6    lilei1班学员
7    black1班学员
Name: name, dtype: object

============================================

练习20：

    新增两列，分别为张三、李四的成绩状态，如果分数低于90，则为"failed"，如果分数高于120，则为"excellent"，其他则为"pass"
    
    【提示】使用函数作为map的参数

============================================

In [92]:
score = DataFrame(data={
    "name":["张三","李四"],
    "语文":np.random.randint(50,150,size=2),
    "数学":np.random.randint(50,150,size=2),
    "英语":np.random.randint(50,150,size=2)
})
score

Unnamed: 0,name,数学,英语,语文
0,张三,104,87,58
1,李四,148,65,106


In [93]:
def score_map(score):
    if score > 120:
        return "excellent"
    elif score < 90:
        return "failed"
    else:
        return "passed"

In [94]:
columns = ["语文","数学","英语"]
for column in columns:
    score[column] = score[column].map(score_map)

In [95]:
score

Unnamed: 0,name,数学,英语,语文
0,张三,passed,failed,failed
1,李四,excellent,failed,passed


In [101]:
# 对一个DataFrame做遍历的时候，默认就是遍历它的列标签
# 完全可以把一个DataFrame当成字典来遍历
for column, v in score.items():
    print(column, v)

name 0    张三
1    李四
Name: name, dtype: object
数学 0       passed
1    excellent
Name: 数学, dtype: object
英语 0    failed
1    failed
Name: 英语, dtype: object
语文 0    failed
1    passed
Name: 语文, dtype: object


### 3) rename()函数：替换索引

仍然是新建一个字典

使用rename()函数替换行索引

- mapper 替换所有索引
- index 替换行索引
- columns 替换列索引
- level 指定多维索引的维度

In [103]:
score.rename(columns={"name":"姓名"})

Unnamed: 0,姓名,数学,英语,语文
0,张三,passed,failed,failed
1,李四,excellent,failed,passed


In [105]:
score.set_index("name", inplace=True)

In [106]:
score

Unnamed: 0_level_0,数学,英语,语文
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
张三,passed,failed,failed
李四,excellent,failed,passed


In [107]:
score.rename(index={"张三":"tom", "李四":"jack"})

Unnamed: 0_level_0,数学,英语,语文
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tom,passed,failed,failed
jack,excellent,failed,passed


In [115]:
mapper = {
    "张三":"tom",
    "李四":"jack",
    "语文":"文学",
    "英语":"外语",
    "上学期":"FIRSTCLASS",
    "下学期":"SECONDCLASS"
}

In [110]:
score.rename(mapper=mapper, axis=1)

Unnamed: 0_level_0,数学,外语,文学
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
张三,passed,failed,failed
李四,excellent,failed,passed


In [113]:
total = pd.concat((score, score), axis=1, keys=["上学期","下学期"])
total

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,数学,英语,语文,数学,英语,语文
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
张三,passed,failed,failed,passed,failed,failed
李四,excellent,failed,passed,excellent,failed,passed


In [118]:
# level 设置多层级索引的级别
total.rename(mapper=mapper, axis=1, level=-2)

Unnamed: 0_level_0,FIRSTCLASS,FIRSTCLASS,FIRSTCLASS,SECONDCLASS,SECONDCLASS,SECONDCLASS
Unnamed: 0_level_1,数学,英语,语文,数学,英语,语文
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
张三,passed,failed,failed,passed,failed,failed
李四,excellent,failed,passed,excellent,failed,passed


In [120]:
score.columns = ["math","chinese","english"]

In [121]:
score

Unnamed: 0_level_0,math,chinese,english
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
张三,passed,failed,failed
李四,excellent,failed,passed


1. replace 属于 fillna 强化版，解决值的替换
    字符串、数值、列表、字典、正则
2. map  是Series对象的操作方法，只解决一列的映射
    字典、函数、lambda表达式
3. rename 解决的索引的替换
    字典、函数

## 3. 使用聚合操作对数据异常值检测和过滤

使用describe()函数查看每一列的描述性统计量

In [125]:
df.dtypes

java       object
name       object
php         int64
python      int64
address    object
php_5      object
dtype: object

In [126]:
df.head()

Unnamed: 0,java,name,php,python,address,php_5
0,C,lucy1班,39,67,北京,E
1,E,tom1班,89,84,上海,B
2,E,jack1班,18,24,北京,E
3,E,tony1班,13,5,上海,E
4,C,mery1班,39,67,上海,E


In [123]:
# 只对可以运算的列有效
df.describe()

Unnamed: 0,php,python
count,8.0,8.0
mean,41.5,46.625
std,33.105891,31.345711
min,3.0,3.0
25%,16.75,19.25
50%,39.0,61.5
75%,51.5,67.0
max,92.0,84.0


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 0 to 7
Data columns (total 6 columns):
java       8 non-null object
name       8 non-null object
php        8 non-null int64
python     8 non-null int64
address    8 non-null object
php_5      8 non-null object
dtypes: int64(2), object(4)
memory usage: 768.0+ bytes


使用std()函数可以求得DataFrame对象每一列的标准差

In [None]:
# 尊重业务的需求  5000  10000

# 异常值通用的界定办法：如果数据是呈标准正态分布的， |data| > 3*|data.std()|
# 离群点的检测： 数值型的数据都可以使用离群点的方式来检测异常

获取异常数据的索引

In [129]:
data = np.random.randn(1000)

In [132]:
3*data.std()

3.1003800479879216

In [134]:
data[np.abs(data) > 3*data.std()]

array([ 3.45384069, -4.22036394, -3.38096751])

In [135]:
df = DataFrame(data=np.random.randn(1000,3), columns=list("ABC"))
df

Unnamed: 0,A,B,C
0,0.801081,0.739051,0.821858
1,-1.008089,-1.209557,1.103698
2,-1.494259,-2.074786,1.139493
3,-1.954978,-0.585663,0.234670
4,0.290052,0.896577,0.669352
5,-0.095928,1.271007,-1.496019
6,-0.622763,0.421794,0.124481
7,-0.319720,1.455401,0.013613
8,0.560389,0.240244,0.968943
9,0.243566,1.039902,1.076021


In [140]:
# 任意一行至少存在一个数的绝对值大于该数所处列的3倍标准差，即认定为满足异常值条件
condition = (np.abs(df) > 3*df.std()).any(axis=1)

In [143]:
df.drop(df.loc[condition].index)

Unnamed: 0,A,B,C
0,0.801081,0.739051,0.821858
1,-1.008089,-1.209557,1.103698
2,-1.494259,-2.074786,1.139493
3,-1.954978,-0.585663,0.234670
4,0.290052,0.896577,0.669352
5,-0.095928,1.271007,-1.496019
6,-0.622763,0.421794,0.124481
7,-0.319720,1.455401,0.013613
8,0.560389,0.240244,0.968943
9,0.243566,1.039902,1.076021


根据每一列或行的标准差，对DataFrame元素进行过滤。

借助any()或all()函数, 测试是否有True，有一个或以上返回True，反之返回False

对每一列应用筛选条件,去除标准差太大的数据

删除特定索引df.drop(labels,inplace = True)

============================================

练习21：

    新建一个形状为10000*3的标准正态分布的DataFrame(np.random.randn)，去除掉所有满足以下情况的行：其中任一元素绝对值大于3倍标准差

============================================

## 4. 排序

#### 使用.take()函数排序

    - take()函数接受一个索引列表，用数字表示
    - eg:df.take([1,3,4,2,5])

可以借助np.random.permutation()函数随机排序

In [144]:
df = DataFrame(data=np.random.randint(0,100,size=(5,5)), columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
0,8,17,6,56,82
1,67,58,98,35,58
2,50,50,87,29,27
3,4,11,13,0,92
4,16,43,18,56,31


In [146]:
# 类似于numpy的列表访问
# take接受的是一个索引列表，不是标签列表
df.take([0,1,0,1], axis=1)

Unnamed: 0,A,B,A.1,B.1
0,8,17,8,17
1,67,58,67,58
2,50,50,50,50
3,4,11,4,11
4,16,43,16,43


#### 随机抽样

当DataFrame规模足够大时，直接使用np.random.randint()函数，就配合take()函数实现随机抽样

In [148]:
df.take(np.random.permutation(5))

Unnamed: 0,A,B,C,D,E
3,4,11,13,0,92
4,16,43,18,56,31
2,50,50,87,29,27
0,8,17,6,56,82
1,67,58,98,35,58


In [149]:
df = DataFrame(data=np.random.randint(0,100,size=(1000,3)), columns=list("ABC"))
df

Unnamed: 0,A,B,C
0,8,86,51
1,8,39,10
2,23,8,35
3,92,87,36
4,6,90,30
5,58,37,96
6,46,48,70
7,52,19,21
8,53,31,80
9,9,52,47


In [151]:
df.take(np.random.randint(0,1000,size=3))

Unnamed: 0,A,B,C
5,58,37,96
152,31,85,85
340,4,63,80


============================================


练习22：

   假设有张三李四王老五的期中考试成绩ddd2，对着三名同学随机排序

============================================

## 5. 数据分类/组处理【重点】

In [153]:
heros = pd.read_excel('heros.xlsx')

In [155]:
heros.head()

Unnamed: 0,id,name,hp_max,hp_growth,hp_start,mp_max,mp_growth,mp_start,attack_max,attack_growth,...,hp_5s_growth,hp_5s_start,mp_5s_max,mp_5s_growth,mp_5s_start,attack_speed_max,attack_range,role_main,role_assist,birthdate
0,10000,夏侯惇,7350,288.8,3307,1746,94,430,321,11.57,...,3.357,51,37,1.571,15,0,近战,坦克,战士,2016-07-19
1,10001,钟无艳,7000,275.0,3150,1760,95,430,318,11.0,...,3.143,48,37,1.571,15,0,近战,战士,坦克,NaT
2,10002,张飞,8341,329.4,3450,100,0,100,301,10.57,...,4.143,57,5,0.0,5,0,近战,坦克,辅助,NaT
3,10003,牛魔,8476,352.8,3537,1926,104,470,273,8.775,...,4.214,58,42,1.786,17,0,近战,坦克,辅助,2015-11-24
4,10004,吕布,7344,270.0,3564,0,0,0,343,12.36,...,3.071,54,0,0.0,0,0,近战,战士,坦克,2015-12-22


In [157]:
# 根据attack_range分组，产生一个分组对象
grouped_obj = heros.groupby("attack_range")

In [158]:
# 查看分组对象的信息
grouped_obj.groups

{'近战': Int64Index([ 0,  1,  2,  3,  4,  5,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
             18, 30, 37, 38, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60,
             61, 62, 63, 64, 65, 66, 67],
            dtype='int64'),
 '远程': Int64Index([ 6, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
             36, 39, 40, 41, 42, 43, 44, 45, 49, 58, 68],
            dtype='int64')}

In [163]:
# 分组之后一定是聚合, 聚合运算只保留可运算的列
grouped_obj.mean()[["hp_growth","hp_max"]]

Unnamed: 0_level_0,hp_growth,hp_max
attack_range,Unnamed: 1_level_1,Unnamed: 2_level_1
近战,272.12439,7078.317073
远程,196.860714,5851.5


In [165]:
grouped_obj["hp_growth"].mean()

attack_range
近战    272.124390
远程    196.860714
Name: hp_growth, dtype: float64

In [164]:
# 分别对不同的列进行不同的聚合运算
grouped_obj.agg({"hp_growth":'mean', "hp_max":'max'})

Unnamed: 0_level_0,hp_growth,hp_max
attack_range,Unnamed: 1_level_1,Unnamed: 2_level_1
近战,272.12439,9328
远程,196.860714,6900


In [167]:
h1 = heros[["name","hp_max","mp_max","attack_range","role_main"]].copy()

In [172]:
avg = DataFrame(h1.groupby("attack_range")["hp_max"].mean())
avg

Unnamed: 0_level_0,hp_max
attack_range,Unnamed: 1_level_1
近战,7078.317073
远程,5851.5


In [174]:
pd.merge(h1, avg, left_on="attack_range", right_index=True, suffixes=["", "_avg"])

Unnamed: 0,name,hp_max,mp_max,attack_range,role_main,hp_max_avg
0,夏侯惇,7350,1746,近战,坦克,7078.317073
1,钟无艳,7000,1760,近战,战士,7078.317073
2,张飞,8341,100,近战,坦克,7078.317073
3,牛魔,8476,1926,近战,坦克,7078.317073
4,吕布,7344,0,近战,战士,7078.317073
5,亚瑟,8050,0,近战,战士,7078.317073
7,程咬金,8611,0,近战,坦克,7078.317073
8,廉颇,9328,1708,近战,坦克,7078.317073
9,东皇太一,7669,1926,近战,坦克,7078.317073
10,庄周,8149,1694,近战,辅助,7078.317073


In [177]:
# 多字段分组
grouped_obj1 = h1.groupby(["attack_range","role_main"])
grouped_obj1.groups

{('近战', '刺客'): Int64Index([52, 53, 54, 64, 65, 66], dtype='int64'),
 ('近战', '坦克'): Int64Index([0, 2, 3, 7, 8, 9, 12, 14, 15, 17], dtype='int64'),
 ('近战',
  '战士'): Int64Index([1, 4, 5, 13, 18, 30, 50, 51, 55, 56, 57, 59, 60, 61, 62, 63, 67], dtype='int64'),
 ('近战', '法师'): Int64Index([16, 37, 38, 46], dtype='int64'),
 ('近战', '辅助'): Int64Index([10, 11, 47, 48], dtype='int64'),
 ('远程',
  '射手'): Int64Index([19, 20, 21, 22, 23, 24, 25, 26, 27, 68], dtype='int64'),
 ('远程', '战士'): Int64Index([58], dtype='int64'),
 ('远程',
  '法师'): Int64Index([6, 28, 29, 31, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 45], dtype='int64'),
 ('远程', '辅助'): Int64Index([39, 49], dtype='int64')}

In [181]:
# 这其实是一个透视表
grouped_obj1.agg({"hp_max":"mean","mp_max":"mean"}).unstack(level=-2, fill_value=0)

Unnamed: 0_level_0,hp_max,hp_max,mp_max,mp_max
attack_range,近战,远程,近战,远程
role_main,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
刺客,6090.5,0.0,1190.333333,0.0
坦克,8312.4,0.0,1440.0,0.0
射手,0.0,5798.5,0.0,1611.6
战士,6939.588235,6900.0,1042.176471,1742.0
法师,6270.25,5749.066667,1467.5,1857.466667
辅助,6872.5,6360.5,1777.0,1912.0


In [188]:
grouped_obj1.agg({"hp_max":"mean","mp_max":"mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,hp_max,mp_max
attack_range,role_main,Unnamed: 2_level_1,Unnamed: 3_level_1
近战,刺客,6090.5,1190.333333
近战,坦克,8312.4,1440.0
近战,战士,6939.588235,1042.176471
近战,法师,6270.25,1467.5
近战,辅助,6872.5,1777.0
远程,射手,5798.5,1611.6
远程,战士,6900.0,1742.0
远程,法师,5749.066667,1857.466667
远程,辅助,6360.5,1912.0


数据聚合是数据处理的最后一步，通常是要使每一个数组生成一个单一的数值。

数据分类处理：

 - 分组：先把数据分为几组
 - 用函数处理：为不同组的数据应用不同的函数以转换数据
 - 合并：把不同组得到的结果合并起来
 
数据分类处理的核心：
     - groupby()函数
     - groups属性查看分组情况

- 根据item分组,查看结果

总结：数据类型是离散的可以分组，连续的没有意义

- 获取weight的总和

- 把总和跟df进行merge合并

- 使用列表进行多列分组，得到的结果是多层级索引

============================================

练习23：

   假设菜市场张大妈在卖菜，有以下属性：
   
   >**菜品(item)：**萝卜，白菜，辣椒，冬瓜
   
   >**颜色(color)：**白，青，红
   
   >**重量(weight)**
   
   >**价格(price)**
   

1. 以属性为列索引，创建一个DataFrame对象df   
2. 对df进行聚合操作，求出颜色为白色的价格总和
3. 对df进行聚合操作，求出萝卜的所有重量(包括白萝卜，胡萝卜，青萝卜）以及平均价格
4. 使用merge合并总重量及平均价格

============================================

In [None]:
2. df.groupby("color")["price"].sum()
3. r = df.groupby("item").agg({"weight":"sum","price":"mean"})
4. pd.merge(df, r, left_on="item",right_index=True)

## 6.0 高级数据聚合

#### 使用groupby分组后，也可以使用transform和apply提供自定义函数实现更多的运算

 - df.groupby('item')['price'].sum() <==> df.groupby('item')['price'].apply(sum)
 - transform和apply都会进行运算，在transform或者apply中传入函数即可
 - transform和apply也可以传入一个lambda表达式

In [190]:
h1.groupby("role_main")["mp_max"].mean()

role_main
刺客    1190.333333
坦克    1440.000000
射手    1611.600000
战士    1081.055556
法师    1775.368421
辅助    1822.000000
Name: mp_max, dtype: float64

In [192]:
# 如果不是官方聚合函数，可以使用apply传递
h1.groupby("role_main")["mp_max"].apply(np.mean)

role_main
刺客    1190.333333
坦克    1440.000000
射手    1611.600000
战士    1081.055556
法师    1775.368421
辅助    1822.000000
Name: mp_max, dtype: float64

In [204]:
# 分组对象的聚合函数，接受的是每一个分组
# 定制一个最大值和平均值的差的聚合函数
def group_function(x):
    return x.max() - x.mean()

In [212]:
h1.head()

Unnamed: 0,name,hp_max,mp_max,attack_range,role_main
0,夏侯惇,7350,1746,近战,坦克
1,钟无艳,7000,1760,近战,战士
2,张飞,8341,100,近战,坦克
3,牛魔,8476,1926,近战,坦克
4,吕布,7344,0,近战,战士


In [211]:
df = DataFrame(h1.groupby("role_main")["mp_max"].apply(group_function))
df.columns = ["max-mean"]
df

Unnamed: 0_level_0,max-mean
role_main,Unnamed: 1_level_1
刺客,631.666667
坦克,500.0
射手,172.4
战士,754.944444
法师,240.631579
辅助,104.0


In [215]:
pd.merge(h1, df, left_on="role_main",right_index=True)

Unnamed: 0,name,hp_max,mp_max,attack_range,role_main,max-mean
0,夏侯惇,7350,1746,近战,坦克,500.000000
2,张飞,8341,100,近战,坦克,500.000000
3,牛魔,8476,1926,近战,坦克,500.000000
7,程咬金,8611,0,近战,坦克,500.000000
8,廉颇,9328,1708,近战,坦克,500.000000
9,东皇太一,7669,1926,近战,坦克,500.000000
12,白起,8638,1666,近战,坦克,500.000000
14,刘邦,8073,1940,近战,坦克,500.000000
15,刘禅,8581,1694,近战,坦克,500.000000
17,项羽,8057,1694,近战,坦克,500.000000


### 注意
- transform 会自动匹配列索引返回值，不去重
- apply 会根据分组情况返回值，去重

In [216]:
DataFrame(h1.groupby("role_main")["mp_max"].transform(group_function))

Unnamed: 0,mp_max
0,500.000000
1,754.944444
2,500.000000
3,500.000000
4,754.944444
5,754.944444
6,240.631579
7,500.000000
8,500.000000
9,500.000000


============================================

练习24：

   使用transform与apply实现练习23的功能

============================================

### 交叉表

交叉表(cross-tabulation, 简称crosstab)是一种用于计算分组频率的特殊透视表。

In [187]:
pd.crosstab(index=h1["role_main"], columns=h1["attack_range"])

attack_range,近战,远程
role_main,Unnamed: 1_level_1,Unnamed: 2_level_1
刺客,6,0
坦克,10,0
射手,0,10
战士,17,1
法师,4,15
辅助,4,2


### 透视表

透视表(pivot table)是各种电子表格程序和其他数据分析软件中一种常见的数据汇总工具。它根据一个或多个键对数据进行聚合，并根据行和列上得分组建将数据分配到各个矩形区域中。在Python和pandas中，可以通过本章所介绍的groupby功能以及（能够利用层次化索引的）DataFrame有一个pivot_table方法，此外还有一个顶级的pandas.pivot_table函数。除了能为groupby提供便利之外，pivot_table还可以添加分项小计（也叫margins）。

In [185]:
# 透视表
pd.pivot_table(data=h1, values="hp_max", index="role_main", columns="attack_range",aggfunc="mean", fill_value=0)

attack_range,近战,远程
role_main,Unnamed: 1_level_1,Unnamed: 2_level_1
刺客,6090.5,0.0
坦克,8312.4,0.0
射手,0.0,5798.5
战士,6939.588235,6900.0
法师,6270.25,5749.066667
辅助,6872.5,6360.5
