In [1]:
import numpy as np
import pandas as pd

# 创建


## 自动索引

只指定 data、不指定 index 时，pandas 会自动生成从 0 开始递增的整数索引


In [48]:
# One-column frame built from a dict. No index is passed, so the rows get
# the default integer labels 0..4.
pd.DataFrame({"values": list(range(5, 30, 5))})

Unnamed: 0,values
0,5
1,10
2,15
3,20
4,25


## 通过二维数组


In [49]:
# Random exam scores in [60, 100] for ten students (rows) and three
# subjects (columns), labelled with student ids 1001..1010.
# NOTE: no seed is set, so the values differ on every run.
df1 = pd.DataFrame(
    np.random.randint(low=60, high=101, size=(10, 3)),
    index=list(range(1001, 1011)),
    columns=["语文", "数学", "英语"],
)
df1

Unnamed: 0,语文,数学,英语
1001,63,67,63
1002,67,98,72
1003,76,95,99
1004,73,82,90
1005,66,91,80
1006,99,85,69
1007,82,98,73
1008,100,75,91
1009,94,92,63
1010,90,83,96


In [50]:
# Print a summary of df1: index range, per-column non-null counts,
# column dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 1001 to 1010
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   语文      10 non-null     int32
 1   数学      10 non-null     int32
 2   英语      10 non-null     int32
dtypes: int32(3)
memory usage: 200.0 bytes


## 通过 dict


In [51]:
# Column-oriented input: each key is a subject (one column) and each list
# holds one score per student.
dict_scores_data = {
    "语文": [62, 72, 93, 88, 93],
    "数学": [95, 65, 86, 66, 87],
    "英语": [66, 75, 82, 69, 82],
}
# Row labels are the student ids 1011..1015.
df2 = pd.DataFrame(dict_scores_data, index=list(range(1011, 1016)))
df2

Unnamed: 0,语文,数学,英语
1011,62,95,66
1012,72,65,75
1013,93,86,82
1014,88,66,69
1015,93,87,82


# 设限

用 clip 把数据限制在给定的最小值和最大值之间：小于下限的值变成下限，大于上限的值变成上限（下面的例子只处理 values 这一列）


In [52]:
# Sample column 5, 10, ..., 25 used to demonstrate clipping.
limit_df = pd.DataFrame(data={"values": list(range(5, 30, 5))})
limit_df

Unnamed: 0,values
0,5
1,10
2,15
3,20
4,25


In [53]:
# Cap the column to the closed interval [10, 20]: anything below 10 becomes
# 10 and anything above 20 becomes 20. The clipped result overwrites the
# "values" column of limit_df.
limit_df["values"] = limit_df["values"].clip(10, 20)
limit_df

Unnamed: 0,values
0,10
1,10
2,15
3,20
4,20


# 拼接/合并 merge


In [54]:
# Two frames with the same column names, used by the concat/merge examples.
# Their "values" columns have no number in common.
# NOTE: the 语文 scores are unseeded random integers in [60, 89], so they
# change on every run.
d1 = pd.DataFrame(
    {
        "values": [6, 10, 15, 20, 25],
        "语文": np.random.randint(60, 90, size=5),
    }
)
d2 = pd.DataFrame(
    {
        "values": list(range(1, 6)),
        "语文": np.random.randint(60, 90, size=5),
    }
)
# Plain-text dump of both frames, one after the other.
print(d1, d2, sep="\n")

   values  语文
0       6  63
1      10  80
2      15  62
3      20  87
4      25  87
   values  语文
0       1  70
1       2  61
2       3  69
3       4  83
4       5  65


## concat

In [55]:
# Stack d2 underneath d1 (row-wise). Each frame keeps its own row labels,
# so the labels 0..4 appear twice in the result.
pd.concat(objs=[d1, d2], axis=0)

Unnamed: 0,values,语文
0,6,63
1,10,80
2,15,62
3,20,87
4,25,87
0,1,70
1,2,61
2,3,69
3,4,83
4,5,65


## merge

In [56]:
# Left join with no `on`: pandas joins on every column name the two frames
# share (here both "values" and "语文"). All rows of d1 are kept; the two
# "values" columns have nothing in common, so no row of d2 ever matches.
d1.merge(d2, how="left")

Unnamed: 0,values,语文
0,6,63
1,10,80
2,15,62
3,20,87
4,25,87


In [57]:
# Right join on all shared columns ("values" and "语文"): all rows of d2
# are kept, and no row of d1 matches them.
d1.merge(d2, how="right")

Unnamed: 0,values,语文
0,1,70
1,2,61
2,3,69
3,4,83
4,5,65


In [58]:
# Inner join on all shared columns ("values" and "语文"): only rows present
# in both frames survive. The "values" columns of d1 and d2 are disjoint,
# so the result is empty.
d1.merge(d2, how="inner")

Unnamed: 0,values,语文


In [59]:
# Left join keyed on "语文" only. The other shared column, "values", is no
# longer a key, so it appears twice with the suffixes _x (from d1) and _y
# (from d2). values_y is NaN on every row whose score has no match in d2.
d1.merge(d2, on="语文", how="left")

Unnamed: 0,values_x,语文,values_y
0,6,63,
1,10,80,
2,15,62,
3,20,87,
4,25,87,


# 结合 combine

## combine_first

a.combine_first(b)

把 a 中为空的元素替换为 b 中对应的元素；两者按索引对齐，只有索引一致的位置才会被替换

### 空值替换

In [6]:
# Column A has gaps (None) that column B can fill; both columns hold strings.
df = pd.DataFrame(
    {
        "A": ["001", None, "003", None, "005"],
        "B": [str(n) for n in range(1, 6)],
    }
)
df

Unnamed: 0,A,B
0,1.0,1
1,,2
2,3.0,3
3,,4
4,5.0,5


In [7]:
# Keep A where it has a value; where A is missing, take the value of B
# from the same row label.
df["A"].combine_first(df["B"])

0    001
1      2
2    003
3      4
4    005
Name: A, dtype: object

### 结合时，索引需要一致

In [15]:
# s1 has gaps at labels b and c; s2 only shares label b with s1 and adds
# a new label e.
s1 = pd.Series(["001", None, None], index=list("abc"))
s2 = pd.Series(["2", "4"], index=list("be"))

In [16]:
# Show s1 before combining.
s1

a     001
b    None
c    None
dtype: object

In [17]:
# Show s2 before combining.
s2

b    2
e    4
dtype: object

In [18]:
# The result is indexed by the union of both indexes (a, b, c, e).
# The gap at b is filled from s2; c stays missing because s2 has no label c.
s1.combine_first(s2)

a    001
b      2
c    NaN
e      4
dtype: object

## combine

combine 和 combine_first 类似，但需要传入一个合并函数，由它逐个元素地决定结果取哪个值

In [26]:
# Same sample data as the combine_first example: A has gaps, B is complete.
df = pd.DataFrame(
    {
        "A": ["001", None, "003", None, "005"],
        "B": [str(n) for n in range(1, 6)],
    }
)
df

Unnamed: 0,A,B
0,1.0,1
1,,2
2,3.0,3
3,,4
4,5.0,5


In [30]:
# Element-wise rule: where A is missing keep the missing value, otherwise
# replace A's value with B's value.
df["A"].combine(df["B"], lambda a, b: a if pd.isna(a) else b)

0       1
1    None
2       3
3    None
4       5
dtype: object

## update
a.update(b)

用b中不为空的元素替换a的元素

### 强制替换

In [34]:
s1 = pd.Series(data=[1, 2, 3, 4])
s2 = pd.Series(data=[11, 22, None, 44])

# update() changes s1 in place. The missing entry of s2 (label 2) is
# skipped, so s1 keeps its own value 3 there.
s1.update(s2)
s1

0    11
1    22
2     3
3    44
dtype: int64

### 更新时按索引对齐，只替换索引相同的数据（不是按位置替换）

In [38]:
# The two series overlap only on the labels c and d.
s1 = pd.Series([1, 2, 3, 4], index=list("abcd"))
s2 = pd.Series([11, 22, 33, 44], index=list("cdef"))

In [39]:
# Show s1 before the update.
s1

a    1
b    2
c    3
d    4
dtype: int64

In [40]:
# Show s2, the source of the new values.
s2

c    11
d    22
e    33
f    44
dtype: int64

In [41]:
# update() matches on index labels, not positions: only the labels present
# in both series (c and d) are overwritten. Labels a and b keep their
# values, and e and f from s2 are not added to s1.
s1.update(s2)
print(s1)

a     1
b     2
c    11
d    22
dtype: int64


# 迭代


In [60]:
# Random scores in [60, 100] for ten rows and three subjects.
# The row labels are random integers in [1, 100] as well, so a label can
# occur more than once. NOTE: nothing is seeded; values change per run.
iter_df = pd.DataFrame(
    np.random.randint(low=60, high=101, size=(10, 3)),
    index=np.random.randint(low=1, high=101, size=10),
    columns=["语文", "数学", "英语"],
)
iter_df

Unnamed: 0,语文,数学,英语
89,88,71,86
84,70,70,92
8,80,91,88
64,96,75,97
89,93,93,88
50,84,75,94
60,84,64,99
29,87,94,99
33,94,71,88
54,92,68,86


In [61]:
# iterrows() yields one (row label, row as Series) pair per row.
for index, row in iter_df.iterrows():
    # Look up this row's score by column label.
    chinese_score = row["语文"]
    print(f"行 {index} {type(index)} 的 ratio 语文: {chinese_score}")

行 89 <class 'int'> 的 ratio 语文: 88
行 84 <class 'int'> 的 ratio 语文: 70
行 8 <class 'int'> 的 ratio 语文: 80
行 64 <class 'int'> 的 ratio 语文: 96
行 89 <class 'int'> 的 ratio 语文: 93
行 50 <class 'int'> 的 ratio 语文: 84
行 60 <class 'int'> 的 ratio 语文: 84
行 29 <class 'int'> 的 ratio 语文: 87
行 33 <class 'int'> 的 ratio 语文: 94
行 54 <class 'int'> 的 ratio 语文: 92


# 重命名


In [62]:
# Single-column frame used by the renaming examples.
for_rename_df = pd.DataFrame({"value": [3, 10, 15, 20, 25]})
for_rename_df

Unnamed: 0,value
0,3
1,10
2,15
3,20
4,25


## 重命名 columns


In [63]:
# Map old column labels to new ones. rename() returns a renamed copy and
# the result is not assigned here, so for_rename_df itself is unchanged.
for_rename_df.rename({"value": "new_value"}, axis="columns")

Unnamed: 0,new_value
0,3
1,10
2,15
3,20
4,25


## 重命名列索引（columns）的名称


In [64]:
# Inspect the column index of for_rename_df.
for_rename_df.columns

Index(['value'], dtype='object')

In [65]:
# Give the columns axis itself the name "new". The column labels stay the
# same and a new frame is returned.
new_column_index = for_rename_df.rename_axis(columns="new")
new_column_index

new,value
0,3
1,10
2,15
3,20
4,25


In [66]:
# Inspect the column index of the frame returned by rename_axis.
new_column_index.columns

Index(['value'], dtype='object', name='new')

# Dataframe 属性


In [67]:
# Small single-column frame used to demonstrate DataFrame attributes.
df = pd.DataFrame({"value": [3, 10, 15, 20, 25]})

## 行数和列数


In [68]:
# (number of rows, number of columns)
df.shape

(5, 1)

## 索引，数据类型和内存信息


In [69]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   value   5 non-null      int64
dtypes: int64(1)
memory usage: 172.0 bytes


# 修改数据类型

- 某些数据类型不能直接进行数值计算，需要先转换成数字类型
  1.  比如 object（例如以字符串形式存储的数字）


In [70]:
# Integer column before conversion; info() reports its dtype.
df = pd.DataFrame({"value": [3, 10, 15, 20, 25]})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   value   5 non-null      int64
dtypes: int64(1)
memory usage: 172.0 bytes


In [71]:
# Convert every column to float64 and print the summary of the converted
# copy. astype() returns a new frame; df itself keeps its dtype.
df.astype("float64").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   value   5 non-null      float64
dtypes: float64(1)
memory usage: 172.0 bytes
