# Numpy

## Numpy 基本属性

In [100]:
import numpy as np

array = np.array([[[1,2,3], # 定义数组
                 [2,3,4]]])
print(array)
print('dimension:', array.ndim) # 维数 
print('shape:', array.shape)
print('size:', array.size)
print('dtype:', array.dtype) # 默认整形

[[[1 2 3]
  [2 3 4]]]
dimension: 3
shape: (1, 2, 3)
size: 6
dtype: int32


## Numpy 创建 array

### 使用列表创建 向量 & 矩阵

In [25]:
import numpy as np

a = np.array([2,3,4], dtype=np.int) 
print(a) # 输出无逗号
print(a.dtype)

[2 3 4]
int32


In [18]:
matrix = np.array([[1,2,3],
             [2,3,4]])
print(matrix)

[[1 2 3]
 [2 3 4]]


### 创建 0，1，空 矩阵

In [23]:
zeros = np.zeros((3,4)) # shape use()
print(zeros) # default dtype = float64

ones = np.ones((3,4), dtype=int)
print(ones)

empty = np.empty((3,4))
print(empty)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### 创建 有序矩阵

In [33]:
import numpy as np

a = np.arange(12).reshape((3,4))
print(a)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [35]:
a = np.arange(20,50,4) # [20,50) step=4
print(a)

[20 24 28 32 36 40 44 48]


In [32]:
a = np.linspace(1,10,20).reshape((2,2,5)) # [1,10) slip as 20 pieces
print(a)

[[[ 1.          1.47368421  1.94736842  2.42105263  2.89473684]
  [ 3.36842105  3.84210526  4.31578947  4.78947368  5.26315789]]

 [[ 5.73684211  6.21052632  6.68421053  7.15789474  7.63157895]
  [ 8.10526316  8.57894737  9.05263158  9.52631579 10.        ]]]


<div class="mark">
### 创建随机矩阵</div><i class="fa fa-lightbulb-o "></i>

In [54]:
a = np.random.random((3,4)) # [0,1) random number
print(a)

[[0.38977955 0.26217317 0.13385397 0.65687882]
 [0.60388027 0.13614177 0.89659863 0.94698431]
 [0.40266388 0.31268458 0.02294242 0.08113657]]


## Numpy 基础运算

### 算数运算

In [50]:
import numpy as np

a = np.array([10,20,30,40])
b = np.arange(4)

print(a)
print(b)

[10 20 30 40]
[0 1 2 3]


In [46]:
c = a-b # must be the same shape, then each one do it
print(c)

[10 19 28 37]


In [47]:
c = b**2 # square
print(c)

[0 1 4 9]


In [48]:
c = 10*np.sin(a) # Arc system
print(c)

[-5.44021111  9.12945251 -9.88031624  7.4511316 ]


In [49]:
c = b<3
print(c)

[ True  True  True False]


### 矩阵运算

In [52]:
import numpy as np

a = np.array([10,20,30,40,50,60]).reshape(3,2)
b = np.arange(10).reshape(2,5)

print(a)
print(b)

[[10 20]
 [30 40]
 [50 60]]
[[0 1 2 3 4]
 [5 6 7 8 9]]


In [53]:
c = np.dot(a,b) # first's column == second's row
print(c) # result first's row * second's column

[[100 130 160 190 220]
 [200 270 340 410 480]
 [300 410 520 630 740]]


### sum min max mean median

1. astype(int) 改变数据类型
2. axis=0 对于列计算 垂直操作；axis=1 对于行计算 水平操作

In [89]:
import numpy as np

a = (10*np.sin(np.random.random((3,4)))).astype(int) 

print(a)
print('sum:', np.sum(a))

[[6 8 6 7]
 [5 3 6 7]
 [5 1 6 1]]
sum: 61


In [90]:
print('sum in column:', np.sum(a, axis=0))
print('sum in row:', np.sum(a, axis=1))

sum in column: [16 12 18 15]
sum in row: [27 21 13]


In [27]:
print('min:', np.min(a))
print('max:', np.max(a))

min: 1
max: 8


In [32]:
print('mean:',np.mean(a)) # 平均值
print('median:',np.median(a)) # 中位数

mean: 5.25
median: 5.0


### cumsum diff clip

In [40]:
print('cumsum:',np.cumsum(a)) # 累加数列

cumsum: [ 5  8 15 20 28 33 41 45 48 49 55 63]


In [39]:
print('diff:\n',np.diff(a)) # 邻差数列（后-前）

diff:
 [[-2  4 -2]
 [-3  3 -4]
 [-2  5  2]]


In [48]:
print('clip 5-9 :\n',np.clip(a,5,9)) # 门槛同化

clip 5-9 :
 [[5 5 7 5]
 [8 5 8 5]
 [5 5 6 8]]


### sort transpose

In [45]:
print('sort:\n',np.sort(a)) # 排序(逐行升序)

sort:
 [[3 5 5 7]
 [4 5 8 8]
 [1 3 6 8]]


In [46]:
print('transpose:\n',np.transpose(a)) # 转置

transpose:
 [[5 8 3]
 [3 5 1]
 [7 8 6]
 [5 4 8]]


### 指定数索引

In [58]:
import numpy as np

a = (10*np.sin(np.random.random((3,4)))).astype(int)
print(a)
print('a[1][2]:',a[1][2]) # 索引从0开始
print('a[1]:',a[1,:]) # 第2行所有数 ：表示所有

[[8 5 4 6]
 [7 4 0 2]
 [1 4 4 3]]
a[1][2]: 0
a[1]: [7 4 0 2]
a[1]: [7 4 0 2]


In [56]:
print('argmin:',np.argmin(a))
print('argmax:',np.argmax(a))

argmin: 5
argmax: 0


### 迭代行，列，所有元素

In [62]:
import numpy as np

a = (10*np.sin(np.random.random((3,4)))).astype(int)
for row in a: # 迭代行
    print(row)
    
for column in a.T: # 转置即迭代列
    print(column)
    
for item in a.flat: # 把a拍平
    print(item,end=' ')

[8 5 4 6]
[7 4 0 2]
[1 4 4 3]
[8 7 1]
[5 4 4]
[4 0 4]
[6 2 3]
8 5 4 6 7 4 0 2 1 4 4 3 

## 合并，拆分ndarray

In [83]:
import numpy as np

a = np.array([1,1,1])
b = np.array([2,2,2])

print('vastack:\n',np.vstack((a,b))) # 垂直合并 vertical
print('hastack:\n',np.hstack((a,b))) # 水平合并 horizontal

vastack:
 [[1 1 1]
 [2 2 2]]
hastack:
 [1 1 1 2 2 2]


In [97]:
import numpy as np

a = np.arange(12).reshape(3,4)

print(np.split(a,3,axis=0)) # 垂直分割 （分割线垂直移动）
print(np.vsplit(a,3))

print(np.split(a,4,axis=1)) # 水平分割
print(np.hsplit(a,4))

print(np.array_split(a,3,axis=1)) # 不等量分割 （按列求和）

[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
[array([[0],
       [4],
       [8]]), array([[1],
       [5],
       [9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]
[array([[0],
       [4],
       [8]]), array([[1],
       [5],
       [9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]


## Numpy 复制

1. python面向对象，赋值->同对象，变化关联
2. 复制->只赋值不关联对象

In [101]:
import numpy as np

a = np.arange(4)
b = a.copy()
b[2] = 1

print(a)

[0 1 2 3]


# Pandas

## Pandas 基本介绍

### Series

In [120]:
import pandas as pd
import numpy as np

s = pd.Series([1,3,6,np.nan,44,1]) # pandas序列 类似字典
print(s)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64


## DataFrame

### 生成方式

In [121]:
dates = pd.date_range('20190623',periods=6) # 日期
print(dates)

DatetimeIndex(['2019-06-23', '2019-06-24', '2019-06-25', '2019-06-26',
               '2019-06-27', '2019-06-28'],
              dtype='datetime64[ns]', freq='D')


In [142]:
df = pd.DataFrame(np.arange(24).reshape((6,4)))
print(df)

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
5  20  21  22  23


In [144]:
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['a','b','c','d']) 
# 默认序列0开始                        行索引      列索引
print(df)

             a   b   c   d
2019-06-23   0   1   2   3
2019-06-24   4   5   6   7
2019-06-25   8   9  10  11
2019-06-26  12  13  14  15
2019-06-27  16  17  18  19
2019-06-28  20  21  22  23


### 数据分析

In [145]:
print(df.dtypes) # 数据类型
print(df.values) # 数据

print(df.index) # 行名
print(df.columns) # 列名

a    int32
b    int32
c    int32
d    int32
dtype: object
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]
DatetimeIndex(['2019-06-23', '2019-06-24', '2019-06-25', '2019-06-26',
               '2019-06-27', '2019-06-28'],
              dtype='datetime64[ns]', freq='D')
Index(['a', 'b', 'c', 'd'], dtype='object')


In [146]:
print(df.describe()) # 数据按列分析

               a          b          c          d
count   6.000000   6.000000   6.000000   6.000000
mean   10.000000  11.000000  12.000000  13.000000
std     7.483315   7.483315   7.483315   7.483315
min     0.000000   1.000000   2.000000   3.000000
25%     5.000000   6.000000   7.000000   8.000000
50%    10.000000  11.000000  12.000000  13.000000
75%    15.000000  16.000000  17.000000  18.000000
max    20.000000  21.000000  22.000000  23.000000


### 排序

In [147]:
print(df.sort_index(axis=0,ascending=False)) # 按行名降序

             a   b   c   d
2019-06-28  20  21  22  23
2019-06-27  16  17  18  19
2019-06-26  12  13  14  15
2019-06-25   8   9  10  11
2019-06-24   4   5   6   7
2019-06-23   0   1   2   3


In [148]:
print(df.sort_index(axis=1,ascending=False)) # 按列名降序

             d   c   b   a
2019-06-23   3   2   1   0
2019-06-24   7   6   5   4
2019-06-25  11  10   9   8
2019-06-26  15  14  13  12
2019-06-27  19  18  17  16
2019-06-28  23  22  21  20


In [150]:
print(df.sort_values(by='c',ascending=False)) # 按'c'列降序

             a   b   c   d
2019-06-28  20  21  22  23
2019-06-27  16  17  18  19
2019-06-26  12  13  14  15
2019-06-25   8   9  10  11
2019-06-24   4   5   6   7
2019-06-23   0   1   2   3


### 选择数据

In [164]:
print(df['a']) # 选择'a'列
print(df[1:4]) # 按行切片
print(df[df.a>8]) # 布尔提取

2019-06-23     0
2019-06-24     4
2019-06-25     8
2019-06-26    12
2019-06-27    16
2019-06-28    20
Freq: D, Name: a, dtype: int32
             a   b   c   d
2019-06-24   4   5   6   7
2019-06-25   8   9  10  11
2019-06-26  12  13  14  15
             a   b   c   d
2019-06-26  12  13  14  15
2019-06-27  16  17  18  19
2019-06-28  20  21  22  23


In [160]:
print(df.loc['2019-06-26']) # 按标签索引
print(df.loc[:,'a'])
print(df.loc['2019-06-26',['b','c']])

a    12
b    13
c    14
d    15
Name: 2019-06-26 00:00:00, dtype: int32
2019-06-23     0
2019-06-24     4
2019-06-25     8
2019-06-26    12
2019-06-27    16
2019-06-28    20
Freq: D, Name: a, dtype: int32
b    13
c    14
Name: 2019-06-26 00:00:00, dtype: int32


In [161]:
print(df.iloc[3,1]) # 按位置索引提取

13


### 重新赋值 & 添加数据

In [167]:
df.iloc[2,2]=66
print(df)

             a   b   c   d
2019-06-23   0   1   2   3
2019-06-24   4   5   6   7
2019-06-25   8   9  66  11
2019-06-26  12  13  14  15
2019-06-27  16  17  18  19
2019-06-28  20  21  22  23


In [168]:
df.a[df.a>8]=0
print(df)

            a   b   c   d
2019-06-23  0   1   2   3
2019-06-24  4   5   6   7
2019-06-25  8   9  66  11
2019-06-26  0  13  14  15
2019-06-27  0  17  18  19
2019-06-28  0  21  22  23


In [176]:
df['e'] = np.arange(6)
df['f'] = pd.Series([1,3,6,np.nan,44,1], index = pd.date_range('20190624',periods=6)) 
# 添加的序列行名不一致则为 NaN
print(df)

            a   b   c   d  e     f
2019-06-23  0   1   2   3  0   NaN
2019-06-24  4   5   6   7  1   1.0
2019-06-25  8   9  66  11  2   3.0
2019-06-26  0  13  14  15  3   6.0
2019-06-27  0  17  18  19  4   NaN
2019-06-28  0  21  22  23  5  44.0


### 数据丢失

In [179]:
print(df.dropna(axis=0)) # 垂直操作 有任何一行含NAN 则丢弃
print(df.dropna(axis=0, how='all')) # 垂直操作 有任何一行全为NAN 则丢弃

            a   b   c   d  e     f
2019-06-24  4   5   6   7  1   1.0
2019-06-25  8   9  66  11  2   3.0
2019-06-26  0  13  14  15  3   6.0
2019-06-28  0  21  22  23  5  44.0
            a   b   c   d  e     f
2019-06-23  0   1   2   3  0   NaN
2019-06-24  4   5   6   7  1   1.0
2019-06-25  8   9  66  11  2   3.0
2019-06-26  0  13  14  15  3   6.0
2019-06-27  0  17  18  19  4   NaN
2019-06-28  0  21  22  23  5  44.0


In [180]:
print(df.fillna(value=0)) # NAN -> 0

            a   b   c   d  e     f
2019-06-23  0   1   2   3  0   0.0
2019-06-24  4   5   6   7  1   1.0
2019-06-25  8   9  66  11  2   3.0
2019-06-26  0  13  14  15  3   6.0
2019-06-27  0  17  18  19  4   0.0
2019-06-28  0  21  22  23  5  44.0


In [183]:
print(df.isnull()) 
print(np.any(df.isnull())==True) # 检查是否丢失数据

                a      b      c      d      e      f
2019-06-23  False  False  False  False  False   True
2019-06-24  False  False  False  False  False  False
2019-06-25  False  False  False  False  False  False
2019-06-26  False  False  False  False  False  False
2019-06-27  False  False  False  False  False   True
2019-06-28  False  False  False  False  False  False
True


## Pandas 导入导出

In [188]:
import pandas as pd
import numpy as np

data = pd.read_csv('files/student.csv') # 自动加行索引
print(data)

data.to_pickle('files/student.pickle')

    Student ID   name  age  gender
0         1100  Kelly   22  Female
1         1101    Clo   21  Female
2         1102  Tilly   23  Female
3         1103   Tony   24    Male
4         1104  David   25    Male
5         1105   Jack   22    Male
6         1106  Catty   23  Female
7         1107  Milly   22  Female
8         1108   Jone   22    Male
9         1109  Jason   19    Male
10        1110    Tom   25    Male
11        1111  Lilly   25  Female


## Pandas 合并

### concat

In [209]:
import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']) # 同列索引同数据
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])

res = pd.concat([df1,df2,df3], axis=0, ignore_index=True) # 垂直合并 行索引重新排序
print(res)

     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0


### join

In [210]:
import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d']) # 不同索引
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])

res = pd.concat([df1,df2], join='outer', ignore_index=True) # 取并集 无则NaN
print(res)

res = pd.concat([df1,df2], join='inner', ignore_index=True) # 取交集 无则减去
print(res)

     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


### join_axes

In [211]:
res = pd.concat([df1,df2],axis=1) # 按df1 的行索引合并，无则NAN
print(res)

res = pd.concat([df1,df2],axis=1, join_axes=[df1.index]) # 按df1 的行索引合并，无则NAN
print(res)

     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0


### append

In [213]:
res = df1.append(df2, ignore_index=True)
print(res)

     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0  1.0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
