In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 透過list建立一個Series

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 透過一個numpy array 建立DataFrame

In [4]:
dates = pd.date_range('20130101', periods = 6)

In [5]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.018571,-0.788273,-0.419688,0.144916
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-03,-1.120791,0.777143,0.763824,-0.254394
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267
2013-01-06,1.083818,-0.966648,1.494735,1.140283


### 通過傳遞一個能夠被轉換成類似序列結構的字典對象來建立一個DataFrame

In [8]:
df2 = pd.DataFrame({"A" : 1.,\
                    "B" : pd.Timestamp('20130102'),\
                    "C" : pd.Series(1, index = list(range(4)), dtype = "float32"),\
                    "D" : np.array([3] * 4, dtype = "int32"),\
                    "E" : pd.Categorical(["test", "train", "test", "train"]),\
                    "F" : "foo"})

In [9]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 查看不同列的數據類型

In [10]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### 查看DataFrame的頭部和尾部的行

In [11]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.018571,-0.788273,-0.419688,0.144916
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-03,-1.120791,0.777143,0.763824,-0.254394
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267


In [12]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267
2013-01-06,1.083818,-0.966648,1.494735,1.140283


### 顯示索引、列和底層的numpy數據

In [13]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
df.values

array([[ 0.01857078, -0.78827326, -0.41968777,  0.14491557],
       [-0.40697626, -0.12529126,  0.81015886, -0.98937542],
       [-1.1207908 ,  0.77714289,  0.76382357, -0.2543942 ],
       [-0.15295733,  0.66695412, -0.06516829, -0.97147733],
       [ 0.31247671, -2.77564445, -2.13728622, -0.8872668 ],
       [ 1.0838179 , -0.96664816,  1.4947349 ,  1.14028295]])

### describe() 函數對於數據的快速統計匯總

In [16]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.04431,-0.535293,0.074429,-0.302886
std,0.735672,1.311886,1.279778,0.842069
min,-1.120791,-2.775644,-2.137286,-0.989375
25%,-0.343472,-0.922054,-0.331058,-0.950425
50%,-0.067193,-0.456782,0.349328,-0.570831
75%,0.239,0.468893,0.798575,0.045088
max,1.083818,0.777143,1.494735,1.140283


### 對數據的轉置

In [17]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.018571,-0.406976,-1.120791,-0.152957,0.312477,1.083818
B,-0.788273,-0.125291,0.777143,0.666954,-2.775644,-0.966648
C,-0.419688,0.810159,0.763824,-0.065168,-2.137286,1.494735
D,0.144916,-0.989375,-0.254394,-0.971477,-0.887267,1.140283


### 按軸進行排序

In [18]:
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,0.144916,-0.419688,-0.788273,0.018571
2013-01-02,-0.989375,0.810159,-0.125291,-0.406976
2013-01-03,-0.254394,0.763824,0.777143,-1.120791
2013-01-04,-0.971477,-0.065168,0.666954,-0.152957
2013-01-05,-0.887267,-2.137286,-2.775644,0.312477
2013-01-06,1.140283,1.494735,-0.966648,1.083818


### 按值進行排序

In [19]:
df.sort_values(by = "B")

Unnamed: 0,A,B,C,D
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267
2013-01-06,1.083818,-0.966648,1.494735,1.140283
2013-01-01,0.018571,-0.788273,-0.419688,0.144916
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477
2013-01-03,-1.120791,0.777143,0.763824,-0.254394


# 選擇

### 獲取
### 選擇一個單獨的列，將會返回Series，等同於df.A

In [20]:
df["A"]

2013-01-01    0.018571
2013-01-02   -0.406976
2013-01-03   -1.120791
2013-01-04   -0.152957
2013-01-05    0.312477
2013-01-06    1.083818
Freq: D, Name: A, dtype: float64

In [21]:
df.A

2013-01-01    0.018571
2013-01-02   -0.406976
2013-01-03   -1.120791
2013-01-04   -0.152957
2013-01-05    0.312477
2013-01-06    1.083818
Freq: D, Name: A, dtype: float64

### 通過[ ] 進行選擇，將會對行進行切片

In [22]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.018571,-0.788273,-0.419688,0.144916
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-03,-1.120791,0.777143,0.763824,-0.254394


In [23]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-03,-1.120791,0.777143,0.763824,-0.254394
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477


# 通過標籤選擇

### 使用標籤來獲取一個交叉的區域

In [24]:
df.loc[dates[0]]

A    0.018571
B   -0.788273
C   -0.419688
D    0.144916
Name: 2013-01-01 00:00:00, dtype: float64

### 通過標籤來在多個軸上進行選擇

In [25]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,0.018571,-0.788273
2013-01-02,-0.406976,-0.125291
2013-01-03,-1.120791,0.777143
2013-01-04,-0.152957,0.666954
2013-01-05,0.312477,-2.775644
2013-01-06,1.083818,-0.966648


### 標籤切片

In [26]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,-0.406976,-0.125291
2013-01-03,-1.120791,0.777143
2013-01-04,-0.152957,0.666954


### 對於返回的對象進行維度縮減

In [27]:
df.loc["20130102", ["A", "B"]]

A   -0.406976
B   -0.125291
Name: 2013-01-02 00:00:00, dtype: float64

## 獲取一個標量

In [28]:
df.loc[dates[0], "A"]

0.018570775219812482

### 快速訪問一個標量 等同上

In [29]:
df.at[dates[0], "A"]

0.018570775219812482

# 通過位置選擇

### 通過傳遞數值進行位置選擇(選擇的是行)

In [30]:
df.iloc[3]

A   -0.152957
B    0.666954
C   -0.065168
D   -0.971477
Name: 2013-01-04 00:00:00, dtype: float64

### 通過數值進行切片

In [31]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.152957,0.666954
2013-01-05,0.312477,-2.775644


### 通過指定一個位置的列表

In [32]:
df.iloc[[1, 2, 4],[0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.406976,0.810159
2013-01-03,-1.120791,0.763824
2013-01-05,0.312477,-2.137286


### 對行進行切片

In [33]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375
2013-01-03,-1.120791,0.777143,0.763824,-0.254394


### 對列進行切片

In [34]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.788273,-0.419688
2013-01-02,-0.125291,0.810159
2013-01-03,0.777143,0.763824
2013-01-04,0.666954,-0.065168
2013-01-05,-2.775644,-2.137286
2013-01-06,-0.966648,1.494735


### 獲取特定的值

In [35]:
df.iloc[1, 1]

-0.12529126314371331

In [36]:
df.iat[1, 1]

-0.12529126314371331

# Boolean索引

### 使用一個單獨列來選擇數據

In [37]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.018571,-0.788273,-0.419688,0.144916
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267
2013-01-06,1.083818,-0.966648,1.494735,1.140283


### 使用 where操作來選擇數據

In [38]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.018571,,,0.144916
2013-01-02,,,0.810159,
2013-01-03,,0.777143,0.763824,
2013-01-04,,0.666954,,
2013-01-05,0.312477,,,
2013-01-06,1.083818,,1.494735,1.140283


### 使用isin() 方法來過濾

In [39]:
df2 = df.copy()

In [40]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]

In [41]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.018571,-0.788273,-0.419688,0.144916,one
2013-01-02,-0.406976,-0.125291,0.810159,-0.989375,one
2013-01-03,-1.120791,0.777143,0.763824,-0.254394,two
2013-01-04,-0.152957,0.666954,-0.065168,-0.971477,three
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267,four
2013-01-06,1.083818,-0.966648,1.494735,1.140283,three


In [42]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.120791,0.777143,0.763824,-0.254394,two
2013-01-05,0.312477,-2.775644,-2.137286,-0.887267,four


# 設置

### 設置一個新的列

In [43]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index = pd.date_range("20130102", periods = 6))

In [44]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [45]:
df["F"] = s1

### 通過標籤設置新的值

In [46]:
df.at[dates[0], "A"] = 0

### 通過位置設置新的值

In [47]:
df.iat[0, 1] = 0

### 通過一個numpy數組設置一組新值

In [48]:
df.loc[:, "D"] = np.array([5] * len(df))

In [49]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.419688,5,
2013-01-02,-0.406976,-0.125291,0.810159,5,1.0
2013-01-03,-1.120791,0.777143,0.763824,5,2.0
2013-01-04,-0.152957,0.666954,-0.065168,5,3.0
2013-01-05,0.312477,-2.775644,-2.137286,5,4.0
2013-01-06,1.083818,-0.966648,1.494735,5,5.0


### 通過where操作來設置新的值

In [50]:
df2 =df.copy()

In [51]:
df2[df2 > 0] = -df2

In [52]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.419688,-5,
2013-01-02,-0.406976,-0.125291,-0.810159,-5,-1.0
2013-01-03,-1.120791,-0.777143,-0.763824,-5,-2.0
2013-01-04,-0.152957,-0.666954,-0.065168,-5,-3.0
2013-01-05,-0.312477,-2.775644,-2.137286,-5,-4.0
2013-01-06,-1.083818,-0.966648,-1.494735,-5,-5.0


# 缺失值處理

### reindex()方法可以對指定軸上的索引進行改變/增加/刪除操作，這將返回原始數據的一個拷貝

In [53]:
df1 = df.reindex(index = dates[0:4], columns = list(df.columns) + ["E"])

In [54]:
df1.loc[dates[0]:dates[1], "E"] = 1

In [55]:
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.419688,5,,1.0
2013-01-02,-0.406976,-0.125291,0.810159,5,1.0,1.0
2013-01-03,-1.120791,0.777143,0.763824,5,2.0,
2013-01-04,-0.152957,0.666954,-0.065168,5,3.0,


### 去掉包函缺失值的行

In [56]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.406976,-0.125291,0.810159,5,1.0,1.0


### 對缺失值進行填充

In [57]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.419688,5,5.0,1.0
2013-01-02,-0.406976,-0.125291,0.810159,5,1.0,1.0
2013-01-03,-1.120791,0.777143,0.763824,5,2.0,5.0
2013-01-04,-0.152957,0.666954,-0.065168,5,3.0,5.0


### 對數據進行Boolean填充 

In [58]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 統計 

### 執行描述性統計 

In [59]:
df.mean()

A   -0.047405
B   -0.403914
C    0.074429
D    5.000000
F    3.000000
dtype: float64

### 在其他軸上進行相同的操作 

In [60]:
df.mean(1)

2013-01-01    1.145078
2013-01-02    1.255578
2013-01-03    1.484035
2013-01-04    1.689766
2013-01-05    0.879909
2013-01-06    2.322381
Freq: D, dtype: float64

### 對於擁有不同維度，需要對齊的對象進行操作。Pandas會自動延著指定的維度進行廣播

In [61]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [62]:
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [63]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-2.120791,-0.222857,-0.236176,4.0,1.0
2013-01-04,-3.152957,-2.333046,-3.065168,2.0,0.0
2013-01-05,-4.687523,-7.775644,-7.137286,0.0,-1.0
2013-01-06,,,,,


# Apply 

###  對數據應用函數

In [64]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.419688,5,
2013-01-02,-0.406976,-0.125291,0.810159,5,1.0
2013-01-03,-1.120791,0.777143,0.763824,5,2.0
2013-01-04,-0.152957,0.666954,-0.065168,5,3.0
2013-01-05,0.312477,-2.775644,-2.137286,5,4.0
2013-01-06,1.083818,-0.966648,1.494735,5,5.0


In [65]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.419688,5,
2013-01-02,-0.406976,-0.125291,0.390471,10,1.0
2013-01-03,-1.527767,0.651852,1.154295,15,3.0
2013-01-04,-1.680724,1.318806,1.089126,20,6.0
2013-01-05,-1.368248,-1.456839,-1.04816,25,10.0
2013-01-06,-0.28443,-2.423487,0.446575,30,15.0


In [66]:
df.apply(lambda x: x.max() - x.min())

A    2.204609
B    3.552787
C    3.632021
D    0.000000
F    4.000000
dtype: float64

# 直方圖

In [67]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [68]:
s

0    6
1    4
2    0
3    5
4    0
5    4
6    5
7    0
8    5
9    4
dtype: int32

In [69]:
s.value_counts()

5    3
4    3
0    3
6    1
dtype: int64

# 字符串方法

### Series對象在其Str屬性中配備了一組字符串處理方法，可以很容易的應用到數組中的每個元素

In [70]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])

In [71]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 合併 

### Concat

In [72]:
df = pd.DataFrame(np.random.randn(10, 4))

In [73]:
df

Unnamed: 0,0,1,2,3
0,1.163973,-1.410596,-1.92424,0.292076
1,0.344099,1.255498,-0.464713,-0.512682
2,0.229545,-1.204389,-1.631278,1.977439
3,-0.42866,0.04388,0.491335,-1.455277
4,0.179959,0.420172,1.399882,-0.334848
5,0.248183,-0.599854,0.759377,1.518735
6,0.251451,0.629913,-0.634209,0.621715
7,-0.308455,-0.584602,-0.372981,1.393799
8,-1.458815,1.098556,-1.735934,-0.33249
9,-1.550808,0.576572,-0.433748,0.460702


In [74]:
pieces = [df[:3], df[3:7], df[7:]]

In [75]:
pieces

[          0         1         2         3
 0  1.163973 -1.410596 -1.924240  0.292076
 1  0.344099  1.255498 -0.464713 -0.512682
 2  0.229545 -1.204389 -1.631278  1.977439,
           0         1         2         3
 3 -0.428660  0.043880  0.491335 -1.455277
 4  0.179959  0.420172  1.399882 -0.334848
 5  0.248183 -0.599854  0.759377  1.518735
 6  0.251451  0.629913 -0.634209  0.621715,
           0         1         2         3
 7 -0.308455 -0.584602 -0.372981  1.393799
 8 -1.458815  1.098556 -1.735934 -0.332490
 9 -1.550808  0.576572 -0.433748  0.460702]

In [76]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.163973,-1.410596,-1.92424,0.292076
1,0.344099,1.255498,-0.464713,-0.512682
2,0.229545,-1.204389,-1.631278,1.977439
3,-0.42866,0.04388,0.491335,-1.455277
4,0.179959,0.420172,1.399882,-0.334848
5,0.248183,-0.599854,0.759377,1.518735
6,0.251451,0.629913,-0.634209,0.621715
7,-0.308455,-0.584602,-0.372981,1.393799
8,-1.458815,1.098556,-1.735934,-0.33249
9,-1.550808,0.576572,-0.433748,0.460702
