# pandas/numpy 学习

In [3]:
import pandas as pd
import numpy as np

## numpy的基本类型为ndarray，可由list、tuple来创建

### ndarray的创建

In [62]:
np.array([0, 2, 4, 8])  # 1-D integer array built from a Python list

array([0, 2, 4, 8])

In [68]:
# A tuple works as input too; dtype forces the elements to floating point.
np.array((0, 2, 4, 8), dtype=np.float64)

array([0., 2., 4., 8.])

In [66]:
# A dict is not a sequence, so it is wrapped whole as a single object element.
np.array(dict(name='haha', sex='male'))

array({'name': 'haha', 'sex': 'male'}, dtype=object)

In [69]:
# Rows may be given as tuples or lists; equal lengths produce a 2-D array.
np.array([
    (0, 2, 4, 8),
    [0, 2, 4, 8],
])

array([[0, 2, 4, 8],
       [0, 2, 4, 8]])

In [71]:
# Mixing a list with a dict is "ragged" input. Recent NumPy releases refuse to
# guess and raise ValueError unless dtype=object is requested explicitly; with
# it, the result is a 1-D object array holding the list and the dict as-is.
np.array([[0, 2, 4, 8], {'name': 'haha', 'sex': 'male'}], dtype=object)

array([list([0, 2, 4, 8]), {'name': 'haha', 'sex': 'male'}], dtype=object)

In [14]:
np.arange(1, 11, 1)  # integers 1..10; the stop value 11 is excluded

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

##### 下面函数中的 shape 参数指数组各维度的大小，比如 (2,3) 表示2行3列

In [83]:
np.ones(shape=10, dtype=int)  # ten integer ones in a 1-D array

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [107]:
np.zeros(shape=(2, 4), dtype=int)  # 2 rows x 4 columns of integer zeros

array([[0, 0, 0, 0],
       [0, 0, 0, 0]])

In [109]:
# Positional order of np.full is (shape, fill_value, dtype, order).
np.full(shape=(2, 3, 4), fill_value=10, dtype=float)

array([[[10., 10., 10., 10.],
        [10., 10., 10., 10.],
        [10., 10., 10., 10.]],

       [[10., 10., 10., 10.],
        [10., 10., 10., 10.],
        [10., 10., 10., 10.]]])

In [106]:
np.eye(N=3)  # 3x3 identity matrix (float by default)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### ndarray维度变换

In [140]:
# 24 consecutive integers arranged as 2 blocks x 3 rows x 4 columns.
test1 = np.arange(24).reshape(2, 3, 4)
test1

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [141]:
test1.reshape(4, 3, 2)  # result has the new shape; test1 keeps its own shape

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15],
        [16, 17]],

       [[18, 19],
        [20, 21],
        [22, 23]]])

In [142]:
# resize works in place: test1 itself takes the new shape.
test1.resize(4, 3, 2)
test1

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15],
        [16, 17]],

       [[18, 19],
        [20, 21],
        [22, 23]]])

In [143]:
np.swapaxes(test1, 1, 2)  # exchange axis 1 and axis 2

array([[[ 0,  2,  4],
        [ 1,  3,  5]],

       [[ 6,  8, 10],
        [ 7,  9, 11]],

       [[12, 14, 16],
        [13, 15, 17]],

       [[18, 20, 22],
        [19, 21, 23]]])

In [144]:
test1.flatten(order='C')  # collapse to a 1-D array in row-major order

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

数组元素类型转换

In [145]:
# `np.float` was only an alias of the builtin `float`; the alias has been
# removed from recent NumPy releases, so the builtin is used directly.
test1.astype(float)

array([[[ 0.,  1.],
        [ 2.,  3.],
        [ 4.,  5.]],

       [[ 6.,  7.],
        [ 8.,  9.],
        [10., 11.]],

       [[12., 13.],
        [14., 15.],
        [16., 17.]],

       [[18., 19.],
        [20., 21.],
        [22., 23.]]])

数组转列表

In [146]:
test1.tolist()  # convert the array to nested Python lists

[[[0, 1], [2, 3], [4, 5]],
 [[6, 7], [8, 9], [10, 11]],
 [[12, 13], [14, 15], [16, 17]],
 [[18, 19], [20, 21], [22, 23]]]

### numpy中CSV文件操作

In [176]:
np.arange(100).reshape((5, 20))  # integers 0..99 as 5 rows x 20 columns

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
        56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
        76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99]])

In [185]:
# Write to CSV: one header row followed by two rows of values. Mixing strings
# with numbers makes NumPy store every cell as a string, hence fmt='%s'.
stock = np.array([
    ['open_price', 'close_price', 'turnover_rate'],
    [10.5, 11.5, 1.5],
    [11.5, 12, 1.3],
])
np.savetxt('price.csv', stock, fmt='%s', delimiter=',')

In [187]:
# Read the CSV back; dtype=str keeps every cell (including the header) as text.
np.loadtxt(fname='price.csv', dtype=str, delimiter=',')

array([['open_price', 'close_price', 'turnover_rate'],
       ['10.5', '11.5', '1.5'],
       ['11.5', '12', '1.3']], dtype='<U13')

## 创建pandas DataFrame对象 

In [4]:
# Build a demo price DataFrame: 5 daily rows x 4 price columns.
dates = pd.date_range('20080101', periods=5)  # five consecutive days from 2008-01-01
array = np.arange(20).reshape(5, 4)  # integers 0..19 laid out as 5 rows x 4 columns
df = pd.DataFrame(
    array,
    index=dates,
    columns=['open_price', 'close_price', 'highest_price', 'lowest_price'],
)
df

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15
2008-01-05,16,17,18,19


## 获取某列

In [18]:
df['open_price']  # bracket access: one column, returned as a Series

2008-01-01     0
2008-01-02     4
2008-01-03     8
2008-01-04    12
2008-01-05    16
Freq: D, Name: open_price, dtype: int64

In [19]:
df.open_price  # attribute access to the same column

2008-01-01     0
2008-01-02     4
2008-01-03     8
2008-01-04    12
2008-01-05    16
Freq: D, Name: open_price, dtype: int64

#### 获取多列（用 loc / iloc 同时按行、按列选取）

In [20]:
# Label-based row slice (both end dates are included), all columns.
df.loc['2008-01-03':'2008-01-05', :]

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15
2008-01-05,16,17,18,19


In [41]:
# Row range by label plus an explicit list of columns.
df.loc[
    '2008-01-03':'2008-01-05',
    ['open_price', 'highest_price'],
]

Unnamed: 0,open_price,highest_price
2008-01-03,8,10
2008-01-04,12,14
2008-01-05,16,18


In [53]:
# All rows, only the listed columns.
df.loc[
    :,
    ['open_price', 'highest_price'],
]

Unnamed: 0,open_price,highest_price
2008-01-01,0,2
2008-01-02,4,6
2008-01-03,8,10
2008-01-04,12,14
2008-01-05,16,18


In [54]:
# All rows; a label slice over columns includes both end columns.
df.loc[
    :,
    'open_price':'highest_price',
]

Unnamed: 0,open_price,close_price,highest_price
2008-01-01,0,1,2
2008-01-02,4,5,6
2008-01-03,8,9,10
2008-01-04,12,13,14
2008-01-05,16,17,18


In [55]:
# Positional row slice; a stop past the last row is simply clipped.
df.iloc[2:6, :]

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15
2008-01-05,16,17,18,19


In [59]:
df.iloc[1:4, :3]  # rows at positions 1-3, columns at positions 0-2

Unnamed: 0,open_price,close_price,highest_price
2008-01-02,4,5,6
2008-01-03,8,9,10
2008-01-04,12,13,14


## 获取某行

In [19]:
df[2:3]  # returns a DataFrame

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-03,8,9,10,11


In [18]:
df['2008-01-03':'2008-01-03']  # a slice inside [] selects rows, not columns

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-03,8,9,10,11


In [20]:
df.loc['2008-01-03']  # returns a Series

open_price        8
close_price       9
highest_price    10
lowest_price     11
Name: 2008-01-03 00:00:00, dtype: int64

In [21]:
df.iloc[0:1]  # same rows as df[:1]

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3


#### 获取多行

In [22]:
df[:4]  # first four rows by position

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15


In [23]:
df.head(n=4)  # first four rows

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15


In [24]:
df['2008-01-02':'2008-01-04']  # row range by date label; both ends are included

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15


## 获取某一元素

In [25]:
# One .loc lookup (row label first, then column label) instead of chained
# df[col][row] indexing, which performs two separate lookups and is the
# pattern that causes SettingWithCopyWarning when used for assignment.
df.loc['2008-01-03', 'open_price']

8

#### 获取元素个数

In [26]:
df.size  # total number of cells (rows x columns)

20

## 条件过滤

In [27]:
# Keep rows strictly between the two dates whose close_price exceeds 8.
df[
    (df.index > '2008-01-01')
    & (df.index < '2008-01-05')
    & (df.close_price > 8)
]

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15


## 添加一行

## 添加一列

In [244]:
df.index  # the row labels

DatetimeIndex(['2008-01-01', '2008-01-02', '2008-01-03', '2008-01-04',
               '2008-01-05'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# New column values, aligned to df's row index.
# NOTE(review): dtype=str stores the rates as strings (object dtype) -- confirm
# this is intended; arithmetic on this column would need a numeric dtype.
new_col = pd.Series(
    data=[1.5, 2.5, 3.5, 4.5, 5.5],
    index=df.index,
    dtype=str,
)
new_col

2008-01-01    1.5
2008-01-02    2.5
2008-01-03    3.5
2008-01-04    4.5
2008-01-05    5.5
Freq: D, dtype: object

In [23]:
# Assigning to a new column label adds the column to df in place.
df['turnover_rate'] = new_col
df

Unnamed: 0,open_price,close_price,highest_price,lowest_price,turnover_rate
2008-01-01,0,1,2,3,1.5
2008-01-02,4,5,6,7,2.5
2008-01-03,8,9,10,11,3.5
2008-01-04,12,13,14,15,4.5
2008-01-05,16,17,18,19,5.5


## 删除一行

In [245]:
df.axes  # list of [row index, column index]

[DatetimeIndex(['2008-01-01', '2008-01-02', '2008-01-03', '2008-01-04',
                '2008-01-05'],
               dtype='datetime64[ns]', freq='D'),
 Index(['open_price', 'close_price', 'highest_price', 'lowest_price',
        'turnover_rate'],
       dtype='object')]

In [5]:
# The row labels are datetimes, so the label to drop is given as a datetime64.
# The result is displayed only; it is not assigned back to df.
df.drop(labels=np.datetime64('2008-01-04'), axis=0)

Unnamed: 0,open_price,close_price,highest_price,lowest_price,turnover_rate
2008-01-01,0,1,2,3,1.5
2008-01-02,4,5,6,7,2.5
2008-01-03,8,9,10,11,3.5
2008-01-05,16,17,18,19,5.5


In [28]:
# Demonstration: `del df[...]` does not accept a row slice. The error is
# caught and printed so that "Restart & Run All" can continue past this cell.
# Use df.drop(...) to remove rows instead.
try:
    del df['2008-01-04':'2008-01-04']
except Exception as err:  # the exact exception class may differ between pandas versions
    print('{}: {}'.format(type(err).__name__, err))

TypeError: 'slice('2008-01-04', '2008-01-04', None)' is an invalid key

In [29]:
# Demonstration: a positional row slice is rejected by `del df[...]` as well.
# The error is caught and printed so that "Restart & Run All" can continue
# past this cell. Use df.drop(...) to remove rows instead.
try:
    del df[3:4]
except Exception as err:  # the exact exception class may differ between pandas versions
    print('{}: {}'.format(type(err).__name__, err))

TypeError: 'slice(3, 4, None)' is an invalid key

## 删除一列

In [247]:
df.columns  # the column labels

Index(['open_price', 'close_price', 'highest_price', 'lowest_price',
       'turnover_rate'],
      dtype='object')

In [248]:
# Drop along the column axis; the result is displayed, df itself is not reassigned.
df.drop(labels='turnover_rate', axis='columns')

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15
2008-01-05,16,17,18,19


In [250]:
df.drop(columns=['turnover_rate'])  # same result via the `columns` keyword

Unnamed: 0,open_price,close_price,highest_price,lowest_price
2008-01-01,0,1,2,3
2008-01-02,4,5,6,7
2008-01-03,8,9,10,11
2008-01-04,12,13,14,15
2008-01-05,16,17,18,19


## dataFrame的运算

##### 获取行列的长度，其实就是索引的长度

In [44]:
df.shape  # (number of rows, number of columns)

(5, 5)