### 1、导入第三方库

In [1]:
import numpy as np
import pandas as pd

### 2、通过Series函数创建对象

In [2]:
# A Series built from a plain list gets a default 0..n-1 integer index.
# np.nan marks a missing value in the fourth position.
s = pd.Series(
    data=[1, 3, 4, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 3、通过DataFrame创建对象（参数为Numpy数组）

In [3]:
# Six consecutive calendar days starting on 2013-01-01:
# `start` is the first date, `periods` is the number of entries to generate.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Draw from a seeded local generator so that Restart & Run All always yields the same frame.
# (Outputs recorded from an earlier, unseeded run will differ until the notebook is re-executed.)
rng = np.random.default_rng(42)
# 6 x 4 samples from the standard normal distribution; the tuple is the shape of the array.
# Rows are labelled by `dates`, columns by the letters A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


### 4、通过DataFrame创建对象（参数为字典）

In [5]:
# Column name -> column content. Scalars are repeated for every row;
# the Series index 0..3 gives the resulting frame its four rows.
data_dict = dict(
    A=1.0,                                                    # float scalar
    B=pd.Timestamp("20210809"),                               # timestamp scalar
    C=pd.Series(1, index=list(range(4)), dtype="float32"),    # float32 Series
    D=np.array([3] * 4, dtype="int32"),                       # int32 NumPy array
    E=pd.Categorical(["test", "train", "test", "train"]),     # categorical values
    F="Foo",                                                  # string scalar
)
df2 = pd.DataFrame(data=data_dict)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-08-09,1.0,3,test,Foo
1,1.0,2021-08-09,1.0,3,train,Foo
2,1.0,2021-08-09,1.0,3,test,Foo
3,1.0,2021-08-09,1.0,3,train,Foo


In [6]:
df2.dtypes  # dtype of each column; the columns of df2 do not share one dtype

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### 注释：上面字典中各个值的单独演示

In [7]:
pd.Timestamp("20210809")   # create a timestamp; a date-only string yields midnight of that day

Timestamp('2021-08-09 00:00:00')

In [8]:
pd.Series(1, index=list(range(4)), dtype="float32") # 1-D labelled array: the scalar 1 is repeated for each of the 4 index labels

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [9]:
np.array([3] * 4, dtype="int32")  # NumPy array holding four 3s, stored as int32

array([3, 3, 3, 3])

In [10]:
pd.Categorical(["test", "train", "test", "train"]) # categorical data: four values drawn from two categories

['test', 'train', 'test', 'train']
Categories (2, object): ['test', 'train']

### 5、DataFrame对象的属性与方法（Tab键自动补全的部分结果，列名A、B、C、D也会作为属性出现）

df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.columns
df2.align              df2.copy
df2.all                df2.count
df2.any                df2.combine
df2.append             df2.D
df2.apply              df2.describe
df2.applymap           df2.diff
df2.B                  df2.duplicated

### 6、查看DataFrame对象中的数据

In [11]:
df  # display the whole frame (6 rows) for reference

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [12]:
df.head() # first 5 rows by default

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425


In [13]:
df.head(3) # the argument sets how many leading rows to show (here the first 3)

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646


In [14]:
df.tail() # last 5 rows by default

Unnamed: 0,A,B,C,D
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [15]:
df.tail(3) # the argument sets how many trailing rows to show (here the last 3)

Unnamed: 0,A,B,C,D
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [16]:
df.index # row labels of the frame (here a DatetimeIndex)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
df.columns # column labels of the frame

Index(['A', 'B', 'C', 'D'], dtype='object')

### 7、将DataFrame转换为NumPy数组

In [18]:
df.to_numpy() # values as a NumPy array; conversion is fast for an all-float frame and slower when column dtypes are mixed

array([[-0.70011053,  0.87375485, -0.08456999,  1.68160558],
       [-0.57451328,  0.45428611,  0.01105858,  2.26669161],
       [-1.98897601,  0.3737144 , -0.94675313, -0.7826456 ],
       [ 1.3387779 , -0.68942583,  1.07372655,  1.70076828],
       [ 0.50348995, -0.47966407, -2.79534993,  0.32142462],
       [ 0.86243258, -1.27198756, -1.98177952,  0.91664347]])

### 8、DataFrame快速统计数据

In [19]:
df.describe()  # per-column summary: count, mean, std, min, quartiles and max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.09315,-0.12322,-0.787278,1.017415
std,1.225871,0.817524,1.419796,1.113875
min,-1.988976,-1.271988,-2.79535,-0.782646
25%,-0.668711,-0.636985,-1.723023,0.470229
50%,-0.035512,-0.052975,-0.515662,1.299125
75%,0.772697,0.434143,-0.012849,1.695978
max,1.338778,0.873755,1.073727,2.266692


### 9、将索引与列名的位置互换

In [20]:
df  # the frame in its original orientation: dates as rows, A-D as columns

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [21]:
df.T # transpose: the column labels become the row index and the dates become the columns

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.700111,-0.574513,-1.988976,1.338778,0.50349,0.862433
B,0.873755,0.454286,0.373714,-0.689426,-0.479664,-1.271988
C,-0.08457,0.011059,-0.946753,1.073727,-2.79535,-1.98178
D,1.681606,2.266692,-0.782646,1.700768,0.321425,0.916643


### 10、数据根据规则排序

In [30]:
# Sort by the column labels (axis=1), in descending order
df.sort_index(axis=1, ascending=False) 

Unnamed: 0,D,C,B,A
2013-01-01,1.681606,-0.08457,0.873755,-0.700111
2013-01-02,2.266692,0.011059,0.454286,-0.574513
2013-01-03,-0.782646,-0.946753,0.373714,-1.988976
2013-01-04,1.700768,1.073727,-0.689426,1.338778
2013-01-05,0.321425,-2.79535,-0.479664,0.50349
2013-01-06,0.916643,-1.98178,-1.271988,0.862433


In [32]:
# Sort by the row index (axis=0), in descending order
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.862433,-1.271988,-1.98178,0.916643
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-01,-0.700111,0.873755,-0.08457,1.681606


In [33]:
# Sort the rows by the values in column A; ascending order is the default
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643
2013-01-04,1.338778,-0.689426,1.073727,1.700768


In [35]:
# Sort the rows by the values in column A, largest first (ascending=False)
df.sort_values(by="A", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-06,0.862433,-1.271988,-1.98178,0.916643
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646


### 11、数据的选择

以下两小节分别演示如何按列名选取列，以及如何通过切片选取行。

### 12、获取指定列数据

In [38]:
df  # the full frame, shown for reference

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [36]:
df["A"]  # select a single column by its label; the result is a Series named A

2013-01-01   -0.700111
2013-01-02   -0.574513
2013-01-03   -1.988976
2013-01-04    1.338778
2013-01-05    0.503490
2013-01-06    0.862433
Freq: D, Name: A, dtype: float64

In [37]:
df.A  # attribute-style access to column A

2013-01-01   -0.700111
2013-01-02   -0.574513
2013-01-03   -1.988976
2013-01-04    1.338778
2013-01-05    0.503490
2013-01-06    0.862433
Freq: D, Name: A, dtype: float64

### 13、通过切片方式获取指定行数据

In [42]:
df  # the full frame, shown for reference

Unnamed: 0,A,B,C,D
2013-01-01,-0.700111,0.873755,-0.08457,1.681606
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
2013-01-04,1.338778,-0.689426,1.073727,1.700768
2013-01-05,0.50349,-0.479664,-2.79535,0.321425
2013-01-06,0.862433,-1.271988,-1.98178,0.916643


In [40]:
df[1:3]  # slice rows by position: rows 1 and 2 (the end position 3 is excluded)

Unnamed: 0,A,B,C,D
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646


In [41]:
df["20130102":"20130103"]  # slice rows by index label: both end labels are included

Unnamed: 0,A,B,C,D
2013-01-02,-0.574513,0.454286,0.011059,2.266692
2013-01-03,-1.988976,0.373714,-0.946753,-0.782646
