### 1、导入三方库

In [1]:
import numpy as np
import pandas as pd

### 2、通过Series函数创建对象

In [2]:
s = pd.Series([1, 3, 4, np.nan, 6, 8])
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 3、通过DataFrame创建对象（参数为Numpy数组）

In [3]:
dates = pd.date_range("20130101", periods=6) # 创建时间列表，参数1为起始日期，参数2为列表元素数量
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")) # randn()返回值为标准正态分布中的值，参数为生成的数组维度
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


### 4、通过DataFrame创建对象（参数为字典）

In [5]:
data_dict = {
    "A": 1.0,
    "B": pd.Timestamp("20210809"), 
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),  
    "D": np.array([3] * 4, dtype="int32"),                     
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "Foo",
}
df2 = pd.DataFrame(data_dict)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-08-09,1.0,3,test,Foo
1,1.0,2021-08-09,1.0,3,train,Foo
2,1.0,2021-08-09,1.0,3,test,Foo
3,1.0,2021-08-09,1.0,3,train,Foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### 注释

In [7]:
pd.Timestamp("20210809")   # 创建时间戳

Timestamp('2021-08-09 00:00:00')

In [8]:
pd.Series(1, index=list(range(4)), dtype="float32") # 创建一维数组

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [9]:
np.array([3] * 4, dtype="int32")  # 数组创建

array([3, 3, 3, 3])

In [10]:
pd.Categorical(["test", "train", "test", "train"]) # 创建category数据类型

['test', 'train', 'test', 'train']
Categories (2, object): ['test', 'train']

### 5、DataFrame对象对应方法

df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.columns
df2.align              df2.copy
df2.all                df2.count
df2.any                df2.combine
df2.append             df2.D
df2.apply              df2.describe
df2.applymap           df2.diff
df2.B                  df2.duplicated

### 6、查看DataFrame对象中的数据

In [11]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [12]:
df.head() # 默认前5行数据

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313


In [13]:
df.head(3) # 指定显示前行数

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018


In [14]:
df.tail() # 默认显示数据最后5行

Unnamed: 0,A,B,C,D
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [15]:
df.tail(3) # 指定显示后3行

Unnamed: 0,A,B,C,D
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [16]:
df.index # 查看数据索引

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
df.columns # 查看数据列名

Index(['A', 'B', 'C', 'D'], dtype='object')

### 7、将DataFrame数据类型转化为numpy array数组

In [18]:
df.to_numpy() # 纯浮点数类型转化速度较快，多数据类型转化速度慢

array([[ 0.34641994, -0.77343792, -1.6711566 , -0.40038629],
       [ 1.0435113 ,  1.09694385,  0.44193409, -0.44542772],
       [ 0.71710026,  0.04802185,  2.03837738,  0.50901776],
       [-1.24409866, -0.9843915 , -3.03093705, -0.75390885],
       [-0.16715573, -0.53130002, -0.18268267, -0.78231264],
       [ 1.86650911,  0.79808258, -0.45508848, -0.90868998]])

### 8、DataFrame快速统计数据

In [19]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.427048,-0.05768,-0.476592,-0.463618
std,1.066632,0.856981,1.743976,0.51658
min,-1.244099,-0.984392,-3.030937,-0.90869
25%,-0.038762,-0.712903,-1.36714,-0.775212
50%,0.53176,-0.241639,-0.318886,-0.599668
75%,0.961909,0.610567,0.28578,-0.411647
max,1.866509,1.096944,2.038377,0.509018


### 9、将索引与列名的位置互换

In [20]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [21]:
df.T # Transposing

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.34642,1.043511,0.7171,-1.244099,-0.167156,1.866509
B,-0.773438,1.096944,0.048022,-0.984392,-0.5313,0.798083
C,-1.671157,0.441934,2.038377,-3.030937,-0.182683,-0.455088
D,-0.400386,-0.445428,0.509018,-0.753909,-0.782313,-0.90869


### 10、数据根据规则排序

In [22]:
# 根据axis=1（列名）排序
df.sort_index(axis=1, ascending=False) 

Unnamed: 0,D,C,B,A
2013-01-01,-0.400386,-1.671157,-0.773438,0.34642
2013-01-02,-0.445428,0.441934,1.096944,1.043511
2013-01-03,0.509018,2.038377,0.048022,0.7171
2013-01-04,-0.753909,-3.030937,-0.984392,-1.244099
2013-01-05,-0.782313,-0.182683,-0.5313,-0.167156
2013-01-06,-0.90869,-0.455088,0.798083,1.866509


In [23]:
# 根据axis=0（索引）排序
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,1.866509,0.798083,-0.455088,-0.90869
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386


In [24]:
# 指定列的值进行排序，默认升序排列
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [25]:
# 指定列的值进行排序
df.sort_values(by="A", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,1.866509,0.798083,-0.455088,-0.90869
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909


### 11、数据的选择

### 12、获取指定列数据

In [26]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [27]:
df["A"]

2013-01-01    0.346420
2013-01-02    1.043511
2013-01-03    0.717100
2013-01-04   -1.244099
2013-01-05   -0.167156
2013-01-06    1.866509
Freq: D, Name: A, dtype: float64

In [28]:
df.A

2013-01-01    0.346420
2013-01-02    1.043511
2013-01-03    0.717100
2013-01-04   -1.244099
2013-01-05   -0.167156
2013-01-06    1.866509
Freq: D, Name: A, dtype: float64

### 13、通过切片方式获取指定行数据

In [29]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [30]:
df[1:3]

Unnamed: 0,A,B,C,D
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018


In [31]:
df["20130102":"20130103"]

Unnamed: 0,A,B,C,D
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018


### 14、根据标签选择数据

In [32]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [33]:
df.loc[dates[0]] # 索引

A    0.346420
B   -0.773438
C   -1.671157
D   -0.400386
Name: 2013-01-01 00:00:00, dtype: float64

In [34]:
df.loc[:, ["A", "C"]] # 参数1：索引范围，参数2：列名

Unnamed: 0,A,C
2013-01-01,0.34642,-1.671157
2013-01-02,1.043511,0.441934
2013-01-03,0.7171,2.038377
2013-01-04,-1.244099,-3.030937
2013-01-05,-0.167156,-0.182683
2013-01-06,1.866509,-0.455088


In [36]:
df.loc["20130102": "20130104", ["A", "C"]]

Unnamed: 0,A,C
2013-01-02,1.043511,0.441934
2013-01-03,0.7171,2.038377
2013-01-04,-1.244099,-3.030937


In [38]:
df.loc["20130101", ["A", "C"]] 

A    0.346420
C   -1.671157
Name: 2013-01-01 00:00:00, dtype: float64

### 15、获取指定值

In [39]:
df.loc[dates[0], "A"]

0.3464199409258815

In [40]:
df.at[dates[0], "A"]

0.3464199409258815

### 16、通过索引获取数据

In [41]:
df.iloc[0]

A    0.346420
B   -0.773438
C   -1.671157
D   -0.400386
Name: 2013-01-01 00:00:00, dtype: float64

### 17、通过切片获取数据

In [42]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.244099,-0.984392
2013-01-05,-0.167156,-0.5313


In [43]:
df.iloc[1:3]

Unnamed: 0,A,B,C,D
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018


In [45]:
df.iloc[[0, 1, 3], [2, 3]]

Unnamed: 0,C,D
2013-01-01,-1.671157,-0.400386
2013-01-02,0.441934,-0.445428
2013-01-04,-3.030937,-0.753909


### 18、通过索引获取指定值

In [46]:
df.iloc[0, 0]

0.3464199409258815

In [47]:
df.iat[0, 0]

0.3464199409258815

### 19、条件筛选数据

In [48]:
df[df["A"] > 0] # 指定列数据

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [49]:
df[df["A"] < 0]

Unnamed: 0,A,B,C,D
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313


In [50]:
df[df > 0]  # 筛选整个对象数据

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,,,
2013-01-02,1.043511,1.096944,0.441934,
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,,,,
2013-01-05,,,,
2013-01-06,1.866509,0.798083,,


### 20、通过 isin() 函数筛选数据

In [51]:
df2 = df.copy()  # 复制对象

In [52]:
df2["E"] = ["one", "one", "two", "three", "four", "three"] # 插入列数据

In [53]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386,one
2013-01-02,1.043511,1.096944,0.441934,-0.445428,one
2013-01-03,0.7171,0.048022,2.038377,0.509018,two
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909,three
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313,four
2013-01-06,1.866509,0.798083,-0.455088,-0.90869,three


In [54]:
df2[df2["E"].isin(["one"])]

Unnamed: 0,A,B,C,D,E
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386,one
2013-01-02,1.043511,1.096944,0.441934,-0.445428,one


### 21、插入数据会通过索引自动对齐

In [55]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))

In [56]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [57]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386
2013-01-02,1.043511,1.096944,0.441934,-0.445428
2013-01-03,0.7171,0.048022,2.038377,0.509018
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313
2013-01-06,1.866509,0.798083,-0.455088,-0.90869


In [58]:
df["F"] = s1  # 超出索引范围数据将删除

In [59]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.34642,-0.773438,-1.671157,-0.400386,
2013-01-02,1.043511,1.096944,0.441934,-0.445428,1.0
2013-01-03,0.7171,0.048022,2.038377,0.509018,2.0
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909,3.0
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313,4.0
2013-01-06,1.866509,0.798083,-0.455088,-0.90869,5.0


### 22、通过标签修改数据

In [60]:
df.at[dates[0], "A"] = 0

In [61]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.773438,-1.671157,-0.400386,
2013-01-02,1.043511,1.096944,0.441934,-0.445428,1.0
2013-01-03,0.7171,0.048022,2.038377,0.509018,2.0
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909,3.0
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313,4.0
2013-01-06,1.866509,0.798083,-0.455088,-0.90869,5.0


### 23、通过位置修改数据

In [62]:
df.iat[0, 1] = 0

In [63]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671157,-0.400386,
2013-01-02,1.043511,1.096944,0.441934,-0.445428,1.0
2013-01-03,0.7171,0.048022,2.038377,0.509018,2.0
2013-01-04,-1.244099,-0.984392,-3.030937,-0.753909,3.0
2013-01-05,-0.167156,-0.5313,-0.182683,-0.782313,4.0
2013-01-06,1.866509,0.798083,-0.455088,-0.90869,5.0


### 24、批量修改数据

In [65]:
df.loc[:, "D"] = np.array([5] * len(df))

In [66]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671157,5,
2013-01-02,1.043511,1.096944,0.441934,5,1.0
2013-01-03,0.7171,0.048022,2.038377,5,2.0
2013-01-04,-1.244099,-0.984392,-3.030937,5,3.0
2013-01-05,-0.167156,-0.5313,-0.182683,5,4.0
2013-01-06,1.866509,0.798083,-0.455088,5,5.0


In [70]:
df2 = df.copy()

In [72]:
df2 = -df2

In [73]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.0,-0.0,1.671157,5.0,
2013-01-02,1.043511,1.096944,0.441934,5.0,1.0
2013-01-03,0.7171,0.048022,2.038377,5.0,2.0
2013-01-04,1.244099,0.984392,3.030937,5.0,3.0
2013-01-05,0.167156,0.5313,0.182683,5.0,4.0
2013-01-06,1.866509,0.798083,0.455088,5.0,5.0


In [74]:
df2 = -df2

In [75]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671157,-5.0,
2013-01-02,-1.043511,-1.096944,-0.441934,-5.0,-1.0
2013-01-03,-0.7171,-0.048022,-2.038377,-5.0,-2.0
2013-01-04,-1.244099,-0.984392,-3.030937,-5.0,-3.0
2013-01-05,-0.167156,-0.5313,-0.182683,-5.0,-4.0
2013-01-06,-1.866509,-0.798083,-0.455088,-5.0,-5.0


In [76]:
df2 = -df2

In [77]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.0,-0.0,1.671157,5.0,
2013-01-02,1.043511,1.096944,0.441934,5.0,1.0
2013-01-03,0.7171,0.048022,2.038377,5.0,2.0
2013-01-04,1.244099,0.984392,3.030937,5.0,3.0
2013-01-05,0.167156,0.5313,0.182683,5.0,4.0
2013-01-06,1.866509,0.798083,0.455088,5.0,5.0


### 25、删除具有数据缺失的行

In [78]:
df2.dropna(how="any")   # 返回操作之后的数据，不改变原数据

Unnamed: 0,A,B,C,D,F
2013-01-02,1.043511,1.096944,0.441934,5.0,1.0
2013-01-03,0.7171,0.048022,2.038377,5.0,2.0
2013-01-04,1.244099,0.984392,3.030937,5.0,3.0
2013-01-05,0.167156,0.5313,0.182683,5.0,4.0
2013-01-06,1.866509,0.798083,0.455088,5.0,5.0


### 26、指定值填充数据缺失位置

In [82]:
df2.fillna(value="test") # 返回操作之后的数据，不改变原数据

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.0,-0.0,1.671157,5.0,test
2013-01-02,1.043511,1.096944,0.441934,5.0,1.0
2013-01-03,0.7171,0.048022,2.038377,5.0,2.0
2013-01-04,1.244099,0.984392,3.030937,5.0,3.0
2013-01-05,0.167156,0.5313,0.182683,5.0,4.0
2013-01-06,1.866509,0.798083,0.455088,5.0,5.0


### 27、判断整个数据对象的值是否为缺失值

In [84]:
pd.isna(df2)

Unnamed: 0,A,B,C,D,F
2013-01-01,False,False,False,False,True
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,False
2013-01-04,False,False,False,False,False
2013-01-05,False,False,False,False,False
2013-01-06,False,False,False,False,False


### 28、获取描述性统计数据

In [86]:
df2.mean() # 通过列进行计算

A    0.839729
B    0.576457
C    1.303363
D    5.000000
F    3.000000
dtype: float64

In [88]:
df2.mean(1) # 通过行进行计算

2013-01-01    1.667789
2013-01-02    1.716478
2013-01-03    1.960700
2013-01-04    2.651885
2013-01-05    1.976228
2013-01-06    2.623936
Freq: D, dtype: float64

### 29、通过指定数据格式格式化DataFrame数据

In [97]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(-3) # shift(): 在数据生成过程中的位移数

In [98]:
s

2013-01-01    NaN
2013-01-02    6.0
2013-01-03    8.0
2013-01-04    NaN
2013-01-05    NaN
2013-01-06    NaN
Freq: D, dtype: float64

In [100]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,-4.956489,-4.903056,-5.558066,-1.0,-5.0
2013-01-03,-7.2829,-7.951978,-5.961623,-3.0,-6.0
2013-01-04,,,,,
2013-01-05,,,,,
2013-01-06,,,,,


### 30、对数据进行处理

In [102]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671157,5,
2013-01-02,1.043511,1.096944,0.441934,5,1.0
2013-01-03,0.7171,0.048022,2.038377,5,2.0
2013-01-04,-1.244099,-0.984392,-3.030937,5,3.0
2013-01-05,-0.167156,-0.5313,-0.182683,5,4.0
2013-01-06,1.866509,0.798083,-0.455088,5,5.0


In [103]:
df.apply(np.cumsum)   # 数据逐条累加

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671157,5,
2013-01-02,1.043511,1.096944,-1.229223,10,1.0
2013-01-03,1.760612,1.144966,0.809155,15,3.0
2013-01-04,0.516513,0.160574,-2.221782,20,6.0
2013-01-05,0.349357,-0.370726,-2.404465,25,10.0
2013-01-06,2.215866,0.427357,-2.859553,30,15.0


In [104]:
df.apply(lambda x: x.max() - x.min())

A    3.110608
B    2.081335
C    5.069314
D    0.000000
F    4.000000
dtype: float64

### 31、统计元素值出现次数

In [105]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [106]:
s

0    5
1    4
2    3
3    2
4    1
5    4
6    1
7    4
8    0
9    6
dtype: int32

In [107]:
s.value_counts()

4    3
1    2
0    1
2    1
3    1
5    1
6    1
dtype: int64

### 32、操作数据对象中的字符串

In [108]:
s = pd.Series(["A", "B", "C", "Aba", np.nan, "cat"])

In [109]:
s.str.lower()

0      a
1      b
2      c
3    aba
4    NaN
5    cat
dtype: object

### 33、行之间数据连接

In [110]:
df = pd.DataFrame(np.random.randn(10, 4))

In [111]:
df

Unnamed: 0,0,1,2,3
0,0.937746,0.72847,-0.321864,0.368064
1,-1.236044,-0.310402,0.301394,-0.085649
2,-0.135228,-1.111247,0.040793,0.036359
3,-0.242592,-0.580969,-0.559883,-0.813214
4,0.692862,0.721485,0.647349,1.116522
5,-0.691467,-1.973678,-0.022896,1.540627
6,1.049544,0.405071,0.505944,-0.568906
7,-0.103845,0.813997,-2.207577,-1.070558
8,0.164952,1.183412,-1.713572,0.343722
9,0.310229,-0.759742,-0.409551,-1.294061


In [112]:
pieces = [df[3:7], df[:3], df[7:]]

In [113]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
3,-0.242592,-0.580969,-0.559883,-0.813214
4,0.692862,0.721485,0.647349,1.116522
5,-0.691467,-1.973678,-0.022896,1.540627
6,1.049544,0.405071,0.505944,-0.568906
0,0.937746,0.72847,-0.321864,0.368064
1,-1.236044,-0.310402,0.301394,-0.085649
2,-0.135228,-1.111247,0.040793,0.036359
7,-0.103845,0.813997,-2.207577,-1.070558
8,0.164952,1.183412,-1.713572,0.343722
9,0.310229,-0.759742,-0.409551,-1.294061


### 34、列之间数据连接

In [114]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})

In [115]:
right = pd.DataFrame({"key": ["foo", "foo"], "lval": [4, 5]})

In [116]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [117]:
right

Unnamed: 0,key,lval
0,foo,4
1,foo,5


In [118]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval_x,lval_y
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [119]:
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})

In [120]:
right = pd.DataFrame({"key": ["foo", "bar"], "lval": [4, 5]})

In [121]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval_x,lval_y
0,foo,1,4
1,bar,2,5
