In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 透過list建立一個Series

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 透過一個numpy array 建立DataFrame

In [4]:
dates = pd.date_range('20130101', periods = 6)

In [5]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.71596,-0.469924,-0.582762,1.019294
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475
2013-01-06,-0.37081,-0.236092,-1.516659,-1.205928


### 通過傳遞一個能夠被轉換成類似序列結構的字典對象來建立一個DataFrame

In [8]:
df2 = pd.DataFrame({"A" : 1.,\
                    "B" : pd.Timestamp('20130102'),\
                    "C" : pd.Series(1, index = list(range(4)), dtype = "float32"),\
                    "D" : np.array([3] * 4, dtype = "int32"),\
                    "E" : pd.Categorical(["test", "train", "test", "train"]),\
                    "F" : "foo"})

In [9]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 查看不同列的數據類型

In [10]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### 查看DataFrame的頭部和尾部的行

In [11]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.71596,-0.469924,-0.582762,1.019294
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475


In [12]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475
2013-01-06,-0.37081,-0.236092,-1.516659,-1.205928


### 顯示索引、列和底層的numpy數據

In [13]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
df.values

array([[  7.15959632e-01,  -4.69923517e-01,  -5.82762038e-01,
          1.01929414e+00],
       [  1.02201294e-03,  -1.06274110e+00,   7.93602985e-01,
         -1.98195214e-01],
       [  5.50795886e-01,   9.34702185e-02,   6.19557918e-01,
          1.35205497e+00],
       [ -7.63574986e-01,  -9.89297542e-01,   1.84182572e+00,
         -1.17421381e+00],
       [ -2.76902339e-01,  -2.49846025e-01,   1.10650700e-01,
         -6.14748575e-02],
       [ -3.70810117e-01,  -2.36092475e-01,  -1.51665945e+00,
         -1.20592849e+00]])

### describe() 函數對於數據的快速統計匯總

In [16]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.023918,-0.485738,0.211036,-0.044744
std,0.567409,0.456115,1.165408,1.070617
min,-0.763575,-1.062741,-1.516659,-1.205928
25%,-0.347333,-0.859454,-0.409409,-0.930209
50%,-0.13794,-0.359885,0.365104,-0.129835
75%,0.413352,-0.239531,0.750092,0.749102
max,0.71596,0.09347,1.841826,1.352055


### 對數據的轉置

In [17]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.71596,0.001022,0.550796,-0.763575,-0.276902,-0.37081
B,-0.469924,-1.062741,0.09347,-0.989298,-0.249846,-0.236092
C,-0.582762,0.793603,0.619558,1.841826,0.110651,-1.516659
D,1.019294,-0.198195,1.352055,-1.174214,-0.061475,-1.205928


### 按軸進行排序

In [18]:
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,1.019294,-0.582762,-0.469924,0.71596
2013-01-02,-0.198195,0.793603,-1.062741,0.001022
2013-01-03,1.352055,0.619558,0.09347,0.550796
2013-01-04,-1.174214,1.841826,-0.989298,-0.763575
2013-01-05,-0.061475,0.110651,-0.249846,-0.276902
2013-01-06,-1.205928,-1.516659,-0.236092,-0.37081


### 按值進行排序

In [19]:
df.sort_values(by = "B")

Unnamed: 0,A,B,C,D
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214
2013-01-01,0.71596,-0.469924,-0.582762,1.019294
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475
2013-01-06,-0.37081,-0.236092,-1.516659,-1.205928
2013-01-03,0.550796,0.09347,0.619558,1.352055


# 選擇

### 獲取
### 選擇一個單獨的列，將會返回Series，等同於df.A

In [20]:
df["A"]

2013-01-01    0.715960
2013-01-02    0.001022
2013-01-03    0.550796
2013-01-04   -0.763575
2013-01-05   -0.276902
2013-01-06   -0.370810
Freq: D, Name: A, dtype: float64

In [21]:
df.A

2013-01-01    0.715960
2013-01-02    0.001022
2013-01-03    0.550796
2013-01-04   -0.763575
2013-01-05   -0.276902
2013-01-06   -0.370810
Freq: D, Name: A, dtype: float64

### 通過[ ] 進行選擇，將會對行進行切片

In [22]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.71596,-0.469924,-0.582762,1.019294
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055


In [23]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214


# 通過標籤選擇

### 使用標籤來獲取一個交叉的區域

In [24]:
df.loc[dates[0]]

A    0.715960
B   -0.469924
C   -0.582762
D    1.019294
Name: 2013-01-01 00:00:00, dtype: float64

### 通過標籤來在多個軸上進行選擇

In [25]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,0.71596,-0.469924
2013-01-02,0.001022,-1.062741
2013-01-03,0.550796,0.09347
2013-01-04,-0.763575,-0.989298
2013-01-05,-0.276902,-0.249846
2013-01-06,-0.37081,-0.236092


### 標籤切片

In [26]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,0.001022,-1.062741
2013-01-03,0.550796,0.09347
2013-01-04,-0.763575,-0.989298


### 對於返回的對象進行維度縮減

In [27]:
df.loc["20130102", ["A", "B"]]

A    0.001022
B   -1.062741
Name: 2013-01-02 00:00:00, dtype: float64

## 獲取一個標量

In [28]:
df.loc[dates[0], "A"]

0.71595963209597357

### 快速訪問一個標量 等同上

In [29]:
df.at[dates[0], "A"]

0.71595963209597357

# 通過位置選擇

### 通過傳遞數值進行位置選擇(選擇的是行)

In [30]:
df.iloc[3]

A   -0.763575
B   -0.989298
C    1.841826
D   -1.174214
Name: 2013-01-04 00:00:00, dtype: float64

### 通過數值進行切片

In [31]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.763575,-0.989298
2013-01-05,-0.276902,-0.249846


### 通過指定一個位置的列表

In [32]:
df.iloc[[1, 2, 4],[0, 2]]

Unnamed: 0,A,C
2013-01-02,0.001022,0.793603
2013-01-03,0.550796,0.619558
2013-01-05,-0.276902,0.110651


### 對行進行切片

In [33]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055


### 對列進行切片

In [34]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.469924,-0.582762
2013-01-02,-1.062741,0.793603
2013-01-03,0.09347,0.619558
2013-01-04,-0.989298,1.841826
2013-01-05,-0.249846,0.110651
2013-01-06,-0.236092,-1.516659


### 獲取特定的值

In [35]:
df.iloc[1, 1]

-1.0627411042878336

In [36]:
df.iat[1, 1]

-1.0627411042878336

# Boolean索引

### 使用一個單獨列來選擇數據

In [37]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.71596,-0.469924,-0.582762,1.019294
2013-01-02,0.001022,-1.062741,0.793603,-0.198195
2013-01-03,0.550796,0.09347,0.619558,1.352055


### 使用 where操作來選擇數據

In [38]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.71596,,,1.019294
2013-01-02,0.001022,,0.793603,
2013-01-03,0.550796,0.09347,0.619558,1.352055
2013-01-04,,,1.841826,
2013-01-05,,,0.110651,
2013-01-06,,,,


### 使用isin() 方法來過濾

In [39]:
df2 = df.copy()

In [40]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]

In [41]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.71596,-0.469924,-0.582762,1.019294,one
2013-01-02,0.001022,-1.062741,0.793603,-0.198195,one
2013-01-03,0.550796,0.09347,0.619558,1.352055,two
2013-01-04,-0.763575,-0.989298,1.841826,-1.174214,three
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475,four
2013-01-06,-0.37081,-0.236092,-1.516659,-1.205928,three


In [42]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.550796,0.09347,0.619558,1.352055,two
2013-01-05,-0.276902,-0.249846,0.110651,-0.061475,four


# 設置

### 設置一個新的列

In [43]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index = pd.date_range("20130102", periods = 6))

In [44]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [45]:
df["F"] = s1

### 通過標籤設置新的值

In [46]:
df.at[dates[0], "A"] = 0

### 通過位置設置新的值

In [47]:
df.iat[0, 1] = 0

### 通過一個numpy數組設置一組新值

In [48]:
df.loc[:, "D"] = np.array([5] * len(df))

In [49]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.582762,5,
2013-01-02,0.001022,-1.062741,0.793603,5,1.0
2013-01-03,0.550796,0.09347,0.619558,5,2.0
2013-01-04,-0.763575,-0.989298,1.841826,5,3.0
2013-01-05,-0.276902,-0.249846,0.110651,5,4.0
2013-01-06,-0.37081,-0.236092,-1.516659,5,5.0


### 通過where操作來設置新的值

In [50]:
df2 =df.copy()

In [51]:
df2[df2 > 0] = -df2

In [52]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.582762,-5,
2013-01-02,-0.001022,-1.062741,-0.793603,-5,-1.0
2013-01-03,-0.550796,-0.09347,-0.619558,-5,-2.0
2013-01-04,-0.763575,-0.989298,-1.841826,-5,-3.0
2013-01-05,-0.276902,-0.249846,-0.110651,-5,-4.0
2013-01-06,-0.37081,-0.236092,-1.516659,-5,-5.0


# 缺失值處理

### reindex()方法可以對指定軸上的索引進行改變/增加/刪除操作，這將返回原始數據的一個拷貝

In [53]:
df1 = df.reindex(index = dates[0:4], columns = list(df.columns) + ["E"])

In [54]:
df1.loc[dates[0]:dates[1], "E"] = 1

In [55]:
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.582762,5,,1.0
2013-01-02,0.001022,-1.062741,0.793603,5,1.0,1.0
2013-01-03,0.550796,0.09347,0.619558,5,2.0,
2013-01-04,-0.763575,-0.989298,1.841826,5,3.0,


### 去掉包函缺失值的行

In [56]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.001022,-1.062741,0.793603,5,1.0,1.0


### 對缺失值進行填充

In [57]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.582762,5,5.0,1.0
2013-01-02,0.001022,-1.062741,0.793603,5,1.0,1.0
2013-01-03,0.550796,0.09347,0.619558,5,2.0,5.0
2013-01-04,-0.763575,-0.989298,1.841826,5,3.0,5.0


### 對數據進行Boolean填充 

In [58]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 統計 

### 執行描述性統計 

In [59]:
df.mean()

A   -0.143245
B   -0.407418
C    0.211036
D    5.000000
F    3.000000
dtype: float64

### 在其他軸上進行相同的操作 

In [60]:
df.mean(1)

2013-01-01    1.104309
2013-01-02    1.146377
2013-01-03    1.652765
2013-01-04    1.617791
2013-01-05    1.716780
2013-01-06    1.575288
Freq: D, dtype: float64

### 對於擁有不同維度，需要對齊的對象進行操作。Pandas會自動延著指定的維度進行廣播

In [65]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [66]:
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [67]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.449204,-0.90653,-0.380442,4.0,1.0
2013-01-04,-3.763575,-3.989298,-1.158174,2.0,0.0
2013-01-05,-5.276902,-5.249846,-4.889349,0.0,-1.0
2013-01-06,,,,,


# Apply 

###  對數據應用函數

In [70]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.582762,5,
2013-01-02,0.001022,-1.062741,0.793603,5,1.0
2013-01-03,0.550796,0.09347,0.619558,5,2.0
2013-01-04,-0.763575,-0.989298,1.841826,5,3.0
2013-01-05,-0.276902,-0.249846,0.110651,5,4.0
2013-01-06,-0.37081,-0.236092,-1.516659,5,5.0


In [68]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.582762,5,
2013-01-02,0.001022,-1.062741,0.210841,10,1.0
2013-01-03,0.551818,-0.969271,0.830399,15,3.0
2013-01-04,-0.211757,-1.958568,2.672225,20,6.0
2013-01-05,-0.488659,-2.208414,2.782875,25,10.0
2013-01-06,-0.85947,-2.444507,1.266216,30,15.0


In [71]:
df.apply(lambda x: x.max() - x.min())

A    1.314371
B    1.156211
C    3.358485
D    0.000000
F    4.000000
dtype: float64

# 直方圖

In [72]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [73]:
s

0    3
1    3
2    4
3    1
4    2
5    6
6    3
7    6
8    6
9    6
dtype: int32

In [74]:
s.value_counts()

6    4
3    3
4    1
2    1
1    1
dtype: int64

# 字符串方法

### Series對象在其Str屬性中配備了一組字符串處理方法，可以很容易的應用到數組中的每個元素

In [76]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])

In [77]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 合併 

### Concat

In [94]:
df = pd.DataFrame(np.random.randn(10, 4))

In [95]:
df

Unnamed: 0,0,1,2,3
0,1.057124,0.684928,-1.481475,0.909619
1,1.044514,0.499031,0.304684,-1.473411
2,-0.542838,0.080242,-0.404989,-0.175256
3,1.994826,0.427723,-1.102787,0.259132
4,-0.259828,-2.186903,0.597967,0.278468
5,-0.796133,-0.023112,0.463933,-0.519694
6,-0.350947,-1.016919,-0.994948,-1.065776
7,-0.645898,-0.798907,-0.683551,1.099481
8,1.115511,1.909215,-0.707337,-0.763885
9,1.026925,0.741567,0.055754,1.998923


In [98]:
pieces = [df[:3], df[3:7], df[7:]]

In [100]:
pieces

[          0         1         2         3
 0  1.057124  0.684928 -1.481475  0.909619
 1  1.044514  0.499031  0.304684 -1.473411
 2 -0.542838  0.080242 -0.404989 -0.175256,
           0         1         2         3
 3  1.994826  0.427723 -1.102787  0.259132
 4 -0.259828 -2.186903  0.597967  0.278468
 5 -0.796133 -0.023112  0.463933 -0.519694
 6 -0.350947 -1.016919 -0.994948 -1.065776,
           0         1         2         3
 7 -0.645898 -0.798907 -0.683551  1.099481
 8  1.115511  1.909215 -0.707337 -0.763885
 9  1.026925  0.741567  0.055754  1.998923]

In [99]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.057124,0.684928,-1.481475,0.909619
1,1.044514,0.499031,0.304684,-1.473411
2,-0.542838,0.080242,-0.404989,-0.175256
3,1.994826,0.427723,-1.102787,0.259132
4,-0.259828,-2.186903,0.597967,0.278468
5,-0.796133,-0.023112,0.463933,-0.519694
6,-0.350947,-1.016919,-0.994948,-1.065776
7,-0.645898,-0.798907,-0.683551,1.099481
8,1.115511,1.909215,-0.707337,-0.763885
9,1.026925,0.741567,0.055754,1.998923


In [103]:
!pip install --upgrade jupyterthemes

Requirement already up-to-date: jupyterthemes in c:\users\java\anaconda3\lib\site-packages
Requirement already up-to-date: lesscpy>=0.12.0 in c:\users\java\anaconda3\lib\site-packages (from jupyterthemes)
Requirement already up-to-date: matplotlib>=1.4.3 in c:\users\java\anaconda3\lib\site-packages (from jupyterthemes)
Requirement already up-to-date: ipython>=5.4.1 in c:\users\java\anaconda3\lib\site-packages (from jupyterthemes)
Requirement already up-to-date: jupyter-core in c:\users\java\anaconda3\lib\site-packages (from jupyterthemes)
Requirement already up-to-date: six in c:\users\java\anaconda3\lib\site-packages (from lesscpy>=0.12.0->jupyterthemes)
Requirement already up-to-date: ply in c:\users\java\anaconda3\lib\site-packages (from lesscpy>=0.12.0->jupyterthemes)
Requirement already up-to-date: numpy>=1.7.1 in c:\users\java\anaconda3\lib\site-packages (from matplotlib>=1.4.3->jupyterthemes)
Requirement already up-to-date: python-dateutil in c:\users\java\anaconda3\lib\site-pac

In [106]:
usename$ jt -l

SyntaxError: invalid syntax (<ipython-input-106-23b247b6cc13>, line 1)

In [108]:
!jt -l

Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [109]:
!jt -t chesterish -T -N

Traceback (most recent call last):
  File "c:\users\java\anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\java\anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Java\Anaconda3\Scripts\jt.exe\__main__.py", line 9, in <module>
  File "c:\users\java\anaconda3\lib\site-packages\jupyterthemes\__init__.py", line 305, in main
    dfonts=args.defaultfonts)
  File "c:\users\java\anaconda3\lib\site-packages\jupyterthemes\__init__.py", line 101, in install_theme
    style_css = stylefx.less_to_css(style_less)
  File "c:\users\java\anaconda3\lib\site-packages\jupyterthemes\stylefx.py", line 63, in less_to_css
    style_css = lesscpy.compile(tempfile)
  File "c:\users\java\anaconda3\lib\site-packages\lesscpy\__init__.py", line 18, in compile
    p.parse(file=file)
  File "c:\users\java\anaconda3\lib\site-packages\lesscpy\lessc\parser.py", line 157, in parse
    self.register.close()
  File "c:\users\java\a