In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 建立物件

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20180316',periods=6)
dates

DatetimeIndex(['2018-03-16', '2018-03-17', '2018-03-18', '2018-03-19',
               '2018-03-20', '2018-03-21'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,-0.219285,-1.973833,-1.102269
2018-03-17,0.573745,-0.564269,-0.557552,0.450222
2018-03-18,0.895687,-0.193416,0.207576,-0.386827
2018-03-19,-0.913398,0.55534,-1.970991,-0.802203
2018-03-20,0.075461,-0.029017,1.720056,-0.107856
2018-03-21,0.934498,-0.508168,1.138692,1.457972


In [6]:
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20180316'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3]*4, dtype='int32'),
    'E': pd.Categorical(["test","train", "test", "train"]),
    'F': 'foo'
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-03-16,1.0,3,test,foo
1,1.0,2018-03-16,1.0,3,train,foo
2,1.0,2018-03-16,1.0,3,test,foo
3,1.0,2018-03-16,1.0,3,train,foo


## 檢視資料

## 選取資料

In [7]:
df['A'] #選取行

2018-03-16    0.819897
2018-03-17    0.573745
2018-03-18    0.895687
2018-03-19   -0.913398
2018-03-20    0.075461
2018-03-21    0.934498
Freq: D, Name: A, dtype: float64

In [8]:
df[0:3] #選取列

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,-0.219285,-1.973833,-1.102269
2018-03-17,0.573745,-0.564269,-0.557552,0.450222
2018-03-18,0.895687,-0.193416,0.207576,-0.386827


In [12]:
df['20180317':'20180320']

Unnamed: 0,A,B,C,D
2018-03-17,0.573745,-0.564269,-0.557552,0.450222
2018-03-18,0.895687,-0.193416,0.207576,-0.386827
2018-03-19,-0.913398,0.55534,-1.970991,-0.802203
2018-03-20,0.075461,-0.029017,1.720056,-0.107856


### 以標籤選取資料

In [13]:
df.loc[dates[1]]

A    0.573745
B   -0.564269
C   -0.557552
D    0.450222
Name: 2018-03-17 00:00:00, dtype: float64

In [14]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2018-03-16,0.819897,-0.219285
2018-03-17,0.573745,-0.564269
2018-03-18,0.895687,-0.193416
2018-03-19,-0.913398,0.55534
2018-03-20,0.075461,-0.029017
2018-03-21,0.934498,-0.508168


In [15]:
df.loc['20180319':'20180322', ['A','B']]

Unnamed: 0,A,B
2018-03-19,-0.913398,0.55534
2018-03-20,0.075461,-0.029017
2018-03-21,0.934498,-0.508168


In [16]:
df.loc[dates[2], 'B']

-0.19341612752482215

In [17]:
df.at[dates[2],'B']

-0.19341612752482215

### 以位置選取資料

In [18]:
df.iloc[3]

A   -0.913398
B    0.555340
C   -1.970991
D   -0.802203
Name: 2018-03-19 00:00:00, dtype: float64

In [19]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-03-19,-0.913398,0.55534
2018-03-20,0.075461,-0.029017


In [21]:
df.iloc[[1,2,4],[0,2,3]]

Unnamed: 0,A,C,D
2018-03-17,0.573745,-0.557552,0.450222
2018-03-18,0.895687,0.207576,-0.386827
2018-03-20,0.075461,1.720056,-0.107856


In [22]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2018-03-17,0.573745,-0.564269,-0.557552,0.450222
2018-03-18,0.895687,-0.193416,0.207576,-0.386827


In [23]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2018-03-16,-0.219285,-1.973833
2018-03-17,-0.564269,-0.557552
2018-03-18,-0.193416,0.207576
2018-03-19,0.55534,-1.970991
2018-03-20,-0.029017,1.720056
2018-03-21,-0.508168,1.138692


In [24]:
df.iloc[1,1]

-0.56426934127961548

In [25]:
df.iat[1,1]

-0.56426934127961548

### 以邏輯值索引

In [26]:
df[df.A>0]

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,-0.219285,-1.973833,-1.102269
2018-03-17,0.573745,-0.564269,-0.557552,0.450222
2018-03-18,0.895687,-0.193416,0.207576,-0.386827
2018-03-20,0.075461,-0.029017,1.720056,-0.107856
2018-03-21,0.934498,-0.508168,1.138692,1.457972


In [27]:
df[df>0]

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,,,
2018-03-17,0.573745,,,0.450222
2018-03-18,0.895687,,0.207576,
2018-03-19,,0.55534,,
2018-03-20,0.075461,,1.720056,
2018-03-21,0.934498,,1.138692,1.457972


In [28]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2018-03-16,0.819897,-0.219285,-1.973833,-1.102269,one
2018-03-17,0.573745,-0.564269,-0.557552,0.450222,one
2018-03-18,0.895687,-0.193416,0.207576,-0.386827,two
2018-03-19,-0.913398,0.55534,-1.970991,-0.802203,three
2018-03-20,0.075461,-0.029017,1.720056,-0.107856,four
2018-03-21,0.934498,-0.508168,1.138692,1.457972,three


In [29]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2018-03-18,0.895687,-0.193416,0.207576,-0.386827,two
2018-03-20,0.075461,-0.029017,1.720056,-0.107856,four


### 資料內容設定

In [30]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180327', periods=6))
s1

2018-03-27    1
2018-03-28    2
2018-03-29    3
2018-03-30    4
2018-03-31    5
2018-04-01    6
Freq: D, dtype: int64

In [31]:
df.at[dates[1], 'A'] = 0

In [33]:
df.iat[0,1] = 0

In [34]:
df.loc[:,'D'] = np.array([5]*len(df))

In [35]:
df

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,0.0,-1.973833,5
2018-03-17,0.0,-0.564269,-0.557552,5
2018-03-18,0.895687,-0.193416,0.207576,5
2018-03-19,-0.913398,0.55534,-1.970991,5
2018-03-20,0.075461,-0.029017,1.720056,5
2018-03-21,0.934498,-0.508168,1.138692,5


In [36]:
df2 = df.copy()
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D
2018-03-16,-0.819897,0.0,-1.973833,-5
2018-03-17,0.0,-0.564269,-0.557552,-5
2018-03-18,-0.895687,-0.193416,-0.207576,-5
2018-03-19,-0.913398,-0.55534,-1.970991,-5
2018-03-20,-0.075461,-0.029017,-1.720056,-5
2018-03-21,-0.934498,-0.508168,-1.138692,-5


### 遺漏值

In [38]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])

In [39]:
df1.loc[dates[0]:dates[1], 'E']=1

In [40]:
df1

Unnamed: 0,A,B,C,D,E
2018-03-16,0.819897,0.0,-1.973833,5,1.0
2018-03-17,0.0,-0.564269,-0.557552,5,1.0
2018-03-18,0.895687,-0.193416,0.207576,5,
2018-03-19,-0.913398,0.55534,-1.970991,5,


In [41]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2018-03-16,0.819897,0.0,-1.973833,5,1.0
2018-03-17,0.0,-0.564269,-0.557552,5,1.0


In [42]:
df1.fillna(value=6)

Unnamed: 0,A,B,C,D,E
2018-03-16,0.819897,0.0,-1.973833,5,1.0
2018-03-17,0.0,-0.564269,-0.557552,5,1.0
2018-03-18,0.895687,-0.193416,0.207576,5,6.0
2018-03-19,-0.913398,0.55534,-1.970991,5,6.0


In [43]:
pd.isna(df1)

AttributeError: module 'pandas' has no attribute 'isna'

### 資料操作

* 統計操作

In [44]:
df

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,0.0,-1.973833,5
2018-03-17,0.0,-0.564269,-0.557552,5
2018-03-18,0.895687,-0.193416,0.207576,5
2018-03-19,-0.913398,0.55534,-1.970991,5
2018-03-20,0.075461,-0.029017,1.720056,5
2018-03-21,0.934498,-0.508168,1.138692,5


In [45]:
df.mean()

A    0.302024
B   -0.123255
C   -0.239342
D    5.000000
dtype: float64

In [46]:
df.mean(1)

2018-03-16    0.961516
2018-03-17    0.969545
2018-03-18    1.477462
2018-03-19    0.667738
2018-03-20    1.691625
2018-03-21    1.641255
Freq: D, dtype: float64

In [48]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2018-03-16    NaN
2018-03-17    NaN
2018-03-18    1.0
2018-03-19    3.0
2018-03-20    5.0
2018-03-21    NaN
Freq: D, dtype: float64

In [49]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2018-03-16,,,,
2018-03-17,,,,
2018-03-18,-0.104313,-1.193416,-0.792424,4.0
2018-03-19,-3.913398,-2.44466,-4.970991,2.0
2018-03-20,-4.924539,-5.029017,-3.279944,0.0
2018-03-21,,,,


### Apply

In [50]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2018-03-16,0.819897,0.0,-1.973833,5
2018-03-17,0.819897,-0.564269,-2.531385,10
2018-03-18,1.715584,-0.757685,-2.323809,15
2018-03-19,0.802186,-0.202345,-4.2948,20
2018-03-20,0.877647,-0.231363,-2.574744,25
2018-03-21,1.812145,-0.739531,-1.436052,30


In [51]:
df.apply(lambda x: x.max() - x.min())

A    1.847896
B    1.119609
C    3.693889
D    0.000000
dtype: float64

### Histogramming

In [52]:
s = pd.Series(np.random.randint(0,7, size=10))
s

0    0
1    6
2    0
3    2
4    6
5    4
6    2
7    1
8    3
9    0
dtype: int64

In [54]:
s.value_counts()

0    3
6    2
2    2
4    1
3    1
1    1
dtype: int64

### 字串操作

In [57]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### 合併

In [58]:
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-1.752928,0.342852,1.207582,-1.115408
1,-0.656684,0.52522,-0.445097,0.259065
2,-0.324382,-0.338013,-2.622972,0.508987
3,-1.238847,-0.949052,-0.047481,0.218864
4,2.092769,-1.671446,1.261836,-0.695452
5,-2.015402,0.041071,1.482395,-2.29754
6,-0.366034,0.013939,-0.093535,0.857304
7,-1.714036,-0.929991,-0.039159,-0.461276
8,-1.48912,-0.684111,0.066006,-0.113061
9,0.781438,1.309366,0.342542,-0.177199


In [61]:
piece = [df[0:3], df[2:5], df[7:]]

In [62]:
pd.concat(piece)

Unnamed: 0,0,1,2,3
0,-1.752928,0.342852,1.207582,-1.115408
1,-0.656684,0.52522,-0.445097,0.259065
2,-0.324382,-0.338013,-2.622972,0.508987
2,-0.324382,-0.338013,-2.622972,0.508987
3,-1.238847,-0.949052,-0.047481,0.218864
4,2.092769,-1.671446,1.261836,-0.695452
7,-1.714036,-0.929991,-0.039159,-0.461276
8,-1.48912,-0.684111,0.066006,-0.113061
9,0.781438,1.309366,0.342542,-0.177199


### 結合

In [65]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval':[4,5]})

In [66]:
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [67]:
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [68]:
pd.merge(left, right, on = 'key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5
