# Pandas中Dataframe的基础操作

In [1]:
import pandas as pd

## 一、Dataframe——代码世界的表格

### 1. Dataframe及其属性

#### （1）创建Dataframe

运用字典创建Dataframe

In [2]:
list_id = ["01", "02", "03", "04", "05"]
list_class = ["二班", "一班", "二班", "三班", "一班"]
list_grade = [92, 67, 70, 88, 76]

In [3]:
df1 = pd.DataFrame({"学号": list_id, "班级": list_class, "成绩": list_grade})
df1

Unnamed: 0,学号,班级,成绩
0,1,二班,92
1,2,一班,67
2,3,二班,70
3,4,三班,88
4,5,一班,76


原Series中存在标签索引

In [4]:
series_id = pd.Series(["01", "02", "03", "04", "05"], ["小明", "小红", "小杰", "小丽", "小华"])
series_class = pd.Series(["二班", "一班", "二班", "三班", "一班"], ["小明", "小红", "小杰", "小丽", "小华"])
series_grade = pd.Series([92, 67, 70, 88, 76], ["小明", "小红", "小杰", "小丽", "小华"])

In [5]:
df2 = pd.DataFrame({"学号": series_id, "班级": series_class, "成绩": series_grade})
df2

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小红,2,一班,67
小杰,3,二班,70
小丽,4,三班,88
小华,5,一班,76


运用嵌套字典创建Dataframe

In [6]:
df3 = pd.DataFrame({"学号": {"小明": "01", "小红": "02", "小杰": "03", 
                           "小丽": "04", "小华": "05"}, 
                    "班级": {"小明": "二班", "小红": "一班", "小杰": "二班", 
                           "小丽": "三班", "小华": "一班"},
                    "成绩": {"小明": 92, "小红": 67, "小杰": 70, 
                           "小丽": 88, "小华": 76}})
df3

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小红,2,一班,67
小杰,3,二班,70
小丽,4,三班,88
小华,5,一班,76


补充：对Dataframe进行转置

In [7]:
df3.T

Unnamed: 0,小明,小红,小杰,小丽,小华
学号,01,02,03,04,05
班级,二班,一班,二班,三班,一班
成绩,92,67,70,88,76


#### （2）获取索引与列名

##### 获取索引

In [8]:
df3.index

Index(['小明', '小红', '小杰', '小丽', '小华'], dtype='object')

##### 获取列名

In [9]:
df3.columns

Index(['学号', '班级', '成绩'], dtype='object')

##### 获取所有值

In [10]:
df3.values

array([['01', '二班', 92],
       ['02', '一班', 67],
       ['03', '二班', 70],
       ['04', '三班', 88],
       ['05', '一班', 76]], dtype=object)

注：返回的类型是NumPy数组，可以使用NumPy数组的相关操作

### 2. Dataframe的提取与筛选

#### （1）提取Dataframe的列

In [11]:
df3["班级"]

小明    二班
小红    一班
小杰    二班
小丽    三班
小华    一班
Name: 班级, dtype: object

注：提取出来列的类型是Series

In [12]:
df3.成绩

小明    92
小红    67
小杰    70
小丽    88
小华    76
Name: 成绩, dtype: int64

In [13]:
df3[["学号", "班级"]]

Unnamed: 0,学号,班级
小明,1,二班
小红,2,一班
小杰,3,二班
小丽,4,三班
小华,5,一班


注：提取出来的类型是Dataframe

#### （2）提取Dataframe的行

##### 运用loc和iloc提Series中的特定值

In [14]:
df3.loc["小丽"]

学号    04
班级    三班
成绩    88
Name: 小丽, dtype: object

In [15]:
df3.iloc[3]

学号    04
班级    三班
成绩    88
Name: 小丽, dtype: object

##### 运用loc和iloc进行切片

In [16]:
df3.loc["小红":"小丽"]

Unnamed: 0,学号,班级,成绩
小红,2,一班,67
小杰,3,二班,70
小丽,4,三班,88


In [17]:
df3.iloc[1:4]

Unnamed: 0,学号,班级,成绩
小红,2,一班,67
小杰,3,二班,70
小丽,4,三班,88


##### 运用loc和iloc提Series中的部分值

In [18]:
df3.loc[["小红", "小丽"]]

Unnamed: 0,学号,班级,成绩
小红,2,一班,67
小丽,4,三班,88


In [19]:
df3.iloc[[1, 3]]

Unnamed: 0,学号,班级,成绩
小红,2,一班,67
小丽,4,三班,88


##### 运用loc和iloc提Series中的部分数据

In [20]:
df3.loc["小丽", "学号"]

'04'

In [21]:
df3.iloc[3, 0]

'04'

##### 运用loc和iloc提Series中的部分切片数据

In [22]:
df3.loc["小红":"小丽", "班级":"成绩"]

Unnamed: 0,班级,成绩
小红,一班,67
小杰,二班,70
小丽,三班,88


In [23]:
df3.iloc[1:4, 1:3]

Unnamed: 0,班级,成绩
小红,一班,67
小杰,二班,70
小丽,三班,88


In [24]:
df3.loc[["小红", "小丽"], "班级":"成绩"]

Unnamed: 0,班级,成绩
小红,一班,67
小丽,三班,88


#### （3）根据条件筛选Dataframe的行

In [25]:
df3[df3["成绩"] > 80]

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小丽,4,三班,88


In [26]:
df3[df3.成绩 > 80]

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小丽,4,三班,88


In [27]:
df3[(df3.成绩 > 80) & (df3.班级 == "三班")]

Unnamed: 0,学号,班级,成绩
小丽,4,三班,88


#### （4）返回Dataframe的前n行

In [28]:
df3.head(3)

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小红,2,一班,67
小杰,3,二班,70


In [29]:
df3.head()

Unnamed: 0,学号,班级,成绩
小明,1,二班,92
小红,2,一班,67
小杰,3,二班,70
小丽,4,三班,88
小华,5,一班,76


### 3.Dataframe的操作

创建一个新的Dataframe

In [30]:
name = pd.Series(["小陈", "小李", "小王", "小张", "小赵", "小周"], index=["001", "002", "003", "004", "005", "006"])
gender = pd.Series(["女", "女", "男", "男", "女", "男"], index=["006", "005", "004", "003", "002", "001"])
height = pd.Series([172.5, 168.0, 178.2, 181.3, 161.7, 159.8], index=["001", "002", "003", "004", "005", "006"])
grade = pd.Series([89, 92, 82, 96, 93, 84], index=["001", "002", "003", "004", "005", "006"])
df4 = pd.DataFrame({"姓名": name, "性别": gender, "身高": height, "成绩": grade})
df4

Unnamed: 0,姓名,性别,身高,成绩
1,小陈,男,172.5,89
2,小李,女,168.0,92
3,小王,男,178.2,82
4,小张,男,181.3,96
5,小赵,女,161.7,93
6,小周,女,159.8,84


#### （1）更新Dataframe中的列（行）

##### 更新列

In [31]:
df4["成绩"] = pd.Series([90, 91, 83, 95, 94, 85], index=["001", "002", "003", "004", "005", "006"])
df4

Unnamed: 0,姓名,性别,身高,成绩
1,小陈,男,172.5,90
2,小李,女,168.0,91
3,小王,男,178.2,83
4,小张,男,181.3,95
5,小赵,女,161.7,94
6,小周,女,159.8,85


In [32]:
df4["成绩"] = [90, 91, 83, 95, 94, 85]
df4

Unnamed: 0,姓名,性别,身高,成绩
1,小陈,男,172.5,90
2,小李,女,168.0,91
3,小王,男,178.2,83
4,小张,男,181.3,95
5,小赵,女,161.7,94
6,小周,女,159.8,85


##### 增加列

In [33]:
df4["班级"] = ["一班", "三班", "二班", "三班", "一班", "二班"]
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,161.7,94,一班
6,小周,女,159.8,85,二班


##### 更新行

In [34]:
df4.loc["005"] = pd.Series(["小赵", "女", 162.7, 95, "一班"],
                           index=["姓名", "性别", "身高", 
                                  "成绩", "班级"])
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班


In [35]:
df4.loc["005"] = ["小赵", "女", 162.7, 95, "一班"]
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班


In [36]:
df4.iloc[4] = pd.Series(["小赵", "女", 162.7, 95, "一班"],
                           index=["姓名", "性别", "身高", 
                                  "成绩", "班级"])
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班


##### 添加行

In [37]:
df4.loc["007"] = pd.Series(["小孙", "难", 182.7, 71, "一班"],
                           index=["姓名", "性别", "身高", 
                                  "成绩", "班级"])
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班
7,小孙,难,182.7,71,一班


In [38]:
df4.loc["007"] = pd.Series(["小孙", "难", 182.7, 71, "一班"],
                           index=["姓名", "性别", "身高", 
                                  "成绩", "班级"])
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班
7,小孙,难,182.7,71,一班


注：添加行不能用iloc，不然会报错

#### （2）删除Dataframe中的列（行）

##### 删除列

In [39]:
df4.drop("身高", axis=1)

Unnamed: 0,姓名,性别,成绩,班级
1,小陈,男,90,一班
2,小李,女,91,三班
3,小王,男,83,二班
4,小张,男,95,三班
5,小赵,女,95,一班
6,小周,女,85,二班
7,小孙,难,71,一班


注：drop(list_or_line_name, axis=0/1)函数中，axis=0表示行，axis=1表示列

In [40]:
df4.drop(["身高", "性别"], axis=1)

Unnamed: 0,姓名,成绩,班级
1,小陈,90,一班
2,小李,91,三班
3,小王,83,二班
4,小张,95,三班
5,小赵,95,一班
6,小周,85,二班
7,小孙,71,一班


##### 删除行

In [41]:
df4.drop("005", axis=0)

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
6,小周,女,159.8,85,二班
7,小孙,难,182.7,71,一班


In [42]:
df4.drop(["003", "005"], axis=0)

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
4,小张,男,181.3,95,三班
6,小周,女,159.8,85,二班
7,小孙,难,182.7,71,一班


注：删除的Dataframe行（或列）之后返回新的Dataframe，并不会改变原始的Dataframe，可以进行赋值进行更改

#### （3）Dataframe之间的操作

Dataframe与Dataframe进行相加（减乘除）运算的时候，会跟进索引与列名进行对齐之后再操作  
只有行和列均相同，才会对其对应的值进行相加（减乘除）运算

In [43]:
df5 = pd.DataFrame({"a": {"001": 0, "003": 4, "005": 8, "007": 12},
                    "b": {"001": 1, "003": 5, "005": 9, "007": 13},
                    "c": {"001": 2, "003": 6, "005": 10, "007": 14},
                    "d": {"001": 3, "003": 7, "005": 11, "007": 15}})
df5

Unnamed: 0,a,b,c,d
1,0,1,2,3
3,4,5,6,7
5,8,9,10,11
7,12,13,14,15


In [44]:
df6 = pd.DataFrame({"a": {"001": 0, "003": 3, "006": 6},
                    "b": {"001": 1, "003": 4, "006": 7},
                    "c": {"001": 2, "003": 5, "006": 8}})
df6

Unnamed: 0,a,b,c
1,0,1,2
3,3,4,5
6,6,7,8


In [45]:
df5 + df6

Unnamed: 0,a,b,c,d
1,0.0,2.0,4.0,
3,7.0,9.0,11.0,
5,,,,
6,,,,
7,,,,


##### 运用函数对Dataframe进行运算

In [46]:
df5.add(df6, fill_value=0)

Unnamed: 0,a,b,c,d
1,0.0,2.0,4.0,3.0
3,7.0,9.0,11.0,7.0
5,8.0,9.0,10.0,11.0
6,6.0,7.0,8.0,
7,12.0,13.0,14.0,15.0


注：索引006，列为d的值还是NaN的原因是，df5中没有006索引，df6中没有d列  
索引和列都在两个Dataframe中都缺失，会导致相加后为NaN

In [47]:
df5.sub(df6, fill_value=0)

Unnamed: 0,a,b,c,d
1,0.0,0.0,0.0,3.0
3,1.0,1.0,1.0,7.0
5,8.0,9.0,10.0,11.0
6,-6.0,-7.0,-8.0,
7,12.0,13.0,14.0,15.0


In [48]:
df5.mul(df6, fill_value=0)

Unnamed: 0,a,b,c,d
1,0.0,1.0,4.0,0.0
3,12.0,20.0,30.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,
7,0.0,0.0,0.0,0.0


In [49]:
df5.div(df6, fill_value=0)

Unnamed: 0,a,b,c,d
1,,1.0,1.0,inf
3,1.333333,1.25,1.2,inf
5,inf,inf,inf,inf
6,0.0,0.0,0.0,
7,inf,inf,inf,inf


#### （4）Series与Dataframe之间的操作

Dataframe与Series进行相加（减乘除）运算的时候，  
此时会把Series中的索引和Dataframe列名进行对齐来相加（减乘除）

In [50]:
df5 = pd.DataFrame({"a": {"001": 0, "003": 4, "005": 8, "007": 12},
                    "b": {"001": 1, "003": 5, "005": 9, "007": 13},
                    "c": {"001": 2, "003": 6, "005": 10, "007": 14},
                    "d": {"001": 3, "003": 7, "005": 11, "007": 15}})
df5

Unnamed: 0,a,b,c,d
1,0,1,2,3
3,4,5,6,7
5,8,9,10,11
7,12,13,14,15


In [51]:
s1 = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5], index=["a", "b", "c", "d", "e"])
s1

a    0.1
b    0.2
c    0.3
d    0.4
e    0.5
dtype: float64

In [52]:
s1 + df5

Unnamed: 0,a,b,c,d,e
1,0.1,1.2,2.3,3.4,
3,4.1,5.2,6.3,7.4,
5,8.1,9.2,10.3,11.4,
7,12.1,13.2,14.3,15.4,


In [53]:
s1 * df5

Unnamed: 0,a,b,c,d,e
1,0.0,0.2,0.6,1.2,
3,0.4,1.0,1.8,2.8,
5,0.8,1.8,3.0,4.4,
7,1.2,2.6,4.2,6.0,


##### Dataframe中的广播机制

In [54]:
df4.loc["007"] = pd.Series(["小孙", "难", 182.7, 71, "一班"],
                           index=["姓名", "性别", "身高", 
                                  "成绩", "班级"])
df4

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈,男,172.5,90,一班
2,小李,女,168.0,91,三班
3,小王,男,178.2,83,二班
4,小张,男,181.3,95,三班
5,小赵,女,162.7,95,一班
6,小周,女,159.8,85,二班
7,小孙,难,182.7,71,一班


In [55]:
df4 * 5

Unnamed: 0,姓名,性别,身高,成绩,班级
1,小陈小陈小陈小陈小陈,男男男男男,862.5,450,一班一班一班一班一班
2,小李小李小李小李小李,女女女女女,840.0,455,三班三班三班三班三班
3,小王小王小王小王小王,男男男男男,891.0,415,二班二班二班二班二班
4,小张小张小张小张小张,男男男男男,906.5,475,三班三班三班三班三班
5,小赵小赵小赵小赵小赵,女女女女女,813.5,475,一班一班一班一班一班
6,小周小周小周小周小周,女女女女女,799.0,425,二班二班二班二班二班
7,小孙小孙小孙小孙小孙,难难难难难,913.5,355,一班一班一班一班一班


In [56]:
df4["成绩"] *0.5

001    45.0
002    45.5
003    41.5
004    47.5
005    47.5
006    42.5
007    35.5
Name: 成绩, dtype: float64

#### （5）Dataframe的统计运算

In [57]:
player1 = pd.Series([8.5, 7.9, 8.2, 7.6], index=["短跑", "跳高", "游泳", "自行车"])
player2 = pd.Series([9.0, 8.3, 8.6, 7.7], index=["短跑", "跳高", "游泳", "自行车"])
player3 = pd.Series([8.7, 8.1, 8.4, 7.9], index=["短跑", "跳高", "游泳", "自行车"])
df7 = pd.DataFrame({"选手1": player1, "选手2": player2, "选手3": player3})
df7

Unnamed: 0,选手1,选手2,选手3
短跑,8.5,9.0,8.7
跳高,7.9,8.3,8.1
游泳,8.2,8.6,8.4
自行车,7.6,7.7,7.9


##### 计算各列均值

In [58]:
df7.mean()

选手1    8.050
选手2    8.400
选手3    8.275
dtype: float64

In [59]:
df7.mean(axis=0)

选手1    8.050
选手2    8.400
选手3    8.275
dtype: float64

##### 计算各行均值

In [60]:
df7.mean(axis=1)

短跑     8.733333
跳高     8.100000
游泳     8.400000
自行车    7.733333
dtype: float64

##### 描述性统计

In [61]:
df7.describe()

Unnamed: 0,选手1,选手2,选手3
count,4.0,4.0,4.0
mean,8.05,8.4,8.275
std,0.387298,0.547723,0.35
min,7.6,7.7,7.9
25%,7.825,8.15,8.05
50%,8.05,8.45,8.25
75%,8.275,8.7,8.475
max,8.5,9.0,8.7


#### （6）对Dataframe使用函数

##### apply函数

定义一个去掉最高和最低分求均值的函数：

In [62]:
def trim_mean(data):
    data_len = len(data)
    data_sum = data.sum()
    max_num = data.max()
    min_num = data.min()
    final_mean = (data_sum - max_num - min_num)/(data_len - 2)
    return final_mean

In [63]:
df7.apply(trim_mean)

选手1    8.05
选手2    8.45
选手3    8.25
dtype: float64

In [64]:
df7.apply(trim_mean, axis=1)

短跑     8.7
跳高     8.1
游泳     8.4
自行车    7.7
dtype: float64

##### applymap函数

In [65]:
df7.applymap(lambda x: x + 5)

  df7.applymap(lambda x: x + 5)


Unnamed: 0,选手1,选手2,选手3
短跑,13.5,14.0,13.7
跳高,12.9,13.3,13.1
游泳,13.2,13.6,13.4
自行车,12.6,12.7,12.9


注:apply函数与applymap函数不同之处在于：apply只作用于行（或列），applymap可以作用于Dataframe中的所有元素

#### （7）对Dataframe某列进行排序

##### sort_values方法

In [66]:
df8 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [23, 45, 25, 37]
})
df8

Unnamed: 0,Name,Age
0,Alice,23
1,Bob,45
2,Charlie,25
3,David,37


In [67]:
df8_sorted = df8.sort_values(by="Age", ascending=True)
df8_sorted

Unnamed: 0,Name,Age
0,Alice,23
2,Charlie,25
3,David,37
1,Bob,45


注意：采用sort_value方法进行排序，设置by参数指定某列，设置ascending=True确定为升序。