# pandas概要
pandas是一种简洁高效的数据分析工具，具备对一维或者多维数据进行索引、读取、整理、组合、数据分片等能力。
毫无疑问，pandas是python程序员的强大工具：
+ 具有相同数据类型数据集的高性能数组以及数据结构，比如Series。具有不同数据类型的数据集的数据结构，比如DataFrame对象
+ 对表数据中的行和列进行插入、删除操作
+ 基于层次的多维索引、层次索引
+ 对缺失数据处理：定位、插值
+ 强大的分组统计功能
+ 数据分片和过滤
+ 类似sql的merge和join功能
+ 时间序列数据的处理（包括 date range generation, moving window statistics, time shifting, lagging）
+ 读取和存储excel、csv、txt、数据库（MongoDB、MySQL等）、HDF5、json等格式文件
+ 读取web文件，以及各种内置数据源包括Yahoo、google finance、world bank

pandas与Scipy、Numpy、scikit-learn、matplotlib共同构成了数据科学工具栈。




# Numpy概要
Numpy是一个开源的python扩展程序库，主要用于科学计算。它主要为python提供了高性能的数组与矩阵运算处理能力。Numpy为python提供了真正的多维数组处理能力，并内置了非常丰富的、支持向量化运算的函数，这些函数能够直接对整个数组进行运算；那些原本需要在python中通过循环来完成的运算，被放到编译好的C代码中执行，性能因此得到极大提升。
pandas大量运用了Numpy的数组来实现Series和DataFrame对象。Numpy同时也支持分片（slice）和向量化操作。因此，先了解Numpy中的数组和一些函数：
+ 任意维数的数组（ndarray， n-dimensional array object）
+ 通用函数对象（ufunc、universal function object）

## 导入numpy库

In [2]:
import numpy as np

## python中的List与numpy中的数组在遍历性能上根本不在一个数量级上

In [7]:
def squares_inputs(inputs):
    """Return a new list holding the square of every item in ``inputs``.

    Written in pure Python (no NumPy) so that its run time can be compared
    with the vectorized ``array_to_squares ** 2`` timing further down.
    """
    return [value * value for value in inputs]
s_inputs = range(10000)
%timeit squares_inputs(s_inputs)

1000 loops, best of 3: 1.34 ms per loop


In [5]:
s_inputs

range(0, 10000)

In [8]:
array_to_squares = np.arange(0,10000)
%timeit array_to_squares **2

The slowest run took 1764.54 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 8.74 µs per loop


## np.array()与np.arange()的用法

In [6]:
a = np.arange(5)
a.dtype


dtype('int32')

In [7]:
np.array?  #别忘记？的用法

In [8]:
b = np.array([1, 2, 0.3])

In [9]:
b

array([ 1. ,  2. ,  0.3])

In [10]:
len(b)

3

In [11]:
type(b)

numpy.ndarray

In [12]:
b.ndim

1

In [13]:
b.shape

(3,)

In [14]:
np.shape?

In [15]:
np.shape(np.eye(3))

(3, 3)

In [16]:
np.eye(3)

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [17]:
np.eye(10)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [18]:
np.shape([1,12])

(2,)

In [19]:
np.shape([[1,12]])

(1, 2)

In [20]:
a = [2,3,0.5]
b = [[2,3,0.5]]

In [51]:
a = np.array([2,3,0.5])  #创建一个1维的数组，元素是64位浮点数
b = np.array([[2,3,0.5]])  #创建一个1*3矩阵
c = np.array([[1,2,3.5,2,3.8],  #创建一个3*5矩阵
              [3,4,6.1,5,7],
              [0,3,4.3,9,8]])

In [57]:
# Show the type, contents and basic attributes of the array a.
print("a is:",type(a))
print(a)
print("a.dtype=",a.dtype)  # data type of the array's elements
print("a.ndim=",a.ndim)  # number of dimensions (axes) of the array
print("a.shape=",a.shape)  # tuple giving the length of the array along each axis
print("a.size=",a.size)  # total number of elements in the array
print("")
print("b is:",type(b))
print(b)
print("b.dtype=",b.dtype)  
print("b.ndim=",b.ndim)
print("b.shape=",b.shape)
print("b.size=",b.size)
print("")
print("c is:",type(c))
print(c)
print("c.dtype=",c.dtype)  
print("c.ndim=",c.ndim)
print("c.shape=",c.shape)
print("c.size=",c.size)

a is: <class 'numpy.ndarray'>
[ 2.   3.   0.5]
a.dtype= float64
a.ndim= 1
a.shape= (3,)
a.size= 3

b is: <class 'numpy.ndarray'>
[[ 2.   3.   0.5]]
b.dtype= float64
b.ndim= 2
b.shape= (1, 3)
b.size= 3

c is: <class 'numpy.ndarray'>
[[ 1.   2.   3.5  2.   3.8]
 [ 3.   4.   6.1  5.   7. ]
 [ 0.   3.   4.3  9.   8. ]]
c.dtype= float64
c.ndim= 2
c.shape= (3, 5)
c.size= 15


In [26]:
np.arange?

In [47]:
np.array?

In [33]:
np.ndim?

In [48]:
np.array([1,2,3])

array([1, 2, 3])

In [55]:
np.array([[1,2,3],[3,2,1]])

array([[1, 2, 3],
       [3, 2, 1]])

In [58]:
np.array([1,2,3],ndmin=2)

array([[1, 2, 3]])

In [59]:
np.array([1,2,3],ndmin=1)

array([1, 2, 3])

In [60]:
np.array([1,2,3],ndmin=3)

array([[[1, 2, 3]]])

In [62]:
m = np.array([np.arange(3),np.arange(3)])

In [63]:
m

array([[0, 1, 2],
       [0, 1, 2]])

In [65]:
m[1,1]

1

In [69]:
m[1,0]

0

## 链式调用函数的用法（arange 后接 reshape）

In [70]:
x = np.arange(12).reshape(2,6) 

In [71]:
x

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [72]:
print(x)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]


In [73]:
x = np.arange(12).reshape(4,3)

In [74]:
print(x)

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


## 自定义数据类型的ndarray：

In [83]:
t = np.dtype([("name",np.str_,100),("numitems",np.int32),("price",np.float32)])  #定义一个新的数据类型t，用于保存name、numitems和price三个字段

In [84]:
t

dtype([('name', '<U100'), ('numitems', '<i4'), ('price', '<f4')])

In [86]:
shopping_list = np.array([("bread", 50, 1.5),("water", 102,1),("food",39,5.6)],dtype = t)

In [87]:
shopping_list

array([('bread', 50, 1.5), ('water', 102, 1.0),
       ('food', 39, 5.599999904632568)], 
      dtype=[('name', '<U100'), ('numitems', '<i4'), ('price', '<f4')])

In [88]:
print(shopping_list)

[('bread', 50, 1.5) ('water', 102, 1.0) ('food', 39, 5.599999904632568)]


In [94]:
shopping_list[0]

('bread', 50, 1.5)

In [95]:
for i in shopping_list:
    print(i)

('bread', 50, 1.5)
('water', 102, 1.0)
('food', 39, 5.599999904632568)


In [109]:
for i in [0,1,2]:
    print(shopping_list[i])

('bread', 50, 1.5)
('water', 102, 1.0)
('food', 39, 5.599999904632568)


In [101]:
len(shopping_list)-1

2

In [113]:
print(shopping_list[0:3])

[('bread', 50, 1.5) ('water', 102, 1.0) ('food', 39, 5.599999904632568)]


In [89]:
shopping_list2 = np.array([[("bread", 50, 1.5),("water", 102,1),("food",39,5.6)],
                          [("bread", 21, 1.5),("water", 14,1),("food",20,5.6)],
                          [("bread", 50, 1.5),("water", 102,1),("food",39,5.6)]])

In [90]:
print(shopping_list2)

[[['bread' '50' '1.5']
  ['water' '102' '1']
  ['food' '39' '5.6']]

 [['bread' '21' '1.5']
  ['water' '14' '1']
  ['food' '20' '5.6']]

 [['bread' '50' '1.5']
  ['water' '102' '1']
  ['food' '39' '5.6']]]


In [114]:
for o in shopping_list2:
    print(o)

[['bread' '50' '1.5']
 ['water' '102' '1']
 ['food' '39' '5.6']]
[['bread' '21' '1.5']
 ['water' '14' '1']
 ['food' '20' '5.6']]
[['bread' '50' '1.5']
 ['water' '102' '1']
 ['food' '39' '5.6']]


In [117]:
print(shopping_list2[1,1])

['water' '14' '1']


In [120]:
print(shopping_list2[-2,-2])

['water' '14' '1']


In [122]:
print(shopping_list2[1,1] == shopping_list2[-2,-2])

[ True  True  True]


In [169]:

"""
shopping_list2 = np.array([["2013-01-01",("bread", 50, 1.5),("water", 102,1),("food",39,5.6)],
                          ["2013-01-02",("bread", 21, 1.5),("water", 14,1),("food",20,5.6)],
                          ["2013-01-03",("bread", 50, 1.5),("water", 102,1),("food",39,5.6)]])
                          
"""

'\nshopping_list2 = np.array([["2013-01-01",("bread", 50, 1.5),("water", 102,1),("food",39,5.6)],\n                          ["2013-01-02",("bread", 21, 1.5),("water", 14,1),("food",20,5.6)],\n                          ["2013-01-03",("bread", 50, 1.5),("water", 102,1),("food",39,5.6)]])\n                          \n'

In [125]:
a = np.arange(9)

In [126]:
print(a)

[0 1 2 3 4 5 6 7 8]


In [133]:
print(a[3:7])

[3 4 5 6]


## 多维数组的切片操作

In [135]:
n = np.arange(24).reshape(2,3,4)

In [136]:
print(n)

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]


In [224]:
n.ndim

3

In [167]:
print(n[-1,::])

[[12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]


In [168]:
print(n[::-1])

[[[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]

 [[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]]


In [151]:
print(n[:,1,1])

[ 5 17]


In [150]:
for i in n[:,1,1]:
    print(i)

5
17


In [152]:
print(n[0,:,:])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [153]:
print(n[0,...])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [154]:
print(n[0,1,:])

[4 5 6 7]


In [156]:
print(n[0,1])

[4 5 6 7]


In [137]:
l = np.arange(48).reshape(2,2,3,4)

In [157]:
print(n[0,1,::2])

[4 6]


In [158]:
print(n[:,:,1])

[[ 1  5  9]
 [13 17 21]]


In [159]:
print(n[...,1])

[[ 1  5  9]
 [13 17 21]]


In [160]:
print(n[:,1,:])

[[ 4  5  6  7]
 [16 17 18 19]]


In [161]:
print(n[:,1])

[[ 4  5  6  7]
 [16 17 18 19]]


In [163]:
print(n[0,:,1])

[1 5 9]


In [164]:
print(n[0,:,-1])

[ 3  7 11]


In [165]:
print(n[0,::-1,-1])

[11  7  3]


In [166]:
print(n[0,::2,-1])

[ 3 11]


In [144]:
n.shape

(2, 3, 4)

In [138]:
print(l)

[[[[ 0  1  2  3]
   [ 4  5  6  7]
   [ 8  9 10 11]]

  [[12 13 14 15]
   [16 17 18 19]
   [20 21 22 23]]]


 [[[24 25 26 27]
   [28 29 30 31]
   [32 33 34 35]]

  [[36 37 38 39]
   [40 41 42 43]
   [44 45 46 47]]]]


In [139]:
k = np.arange(96).reshape(2,2,2,3,4)

In [140]:
print(k)

[[[[[ 0  1  2  3]
    [ 4  5  6  7]
    [ 8  9 10 11]]

   [[12 13 14 15]
    [16 17 18 19]
    [20 21 22 23]]]


  [[[24 25 26 27]
    [28 29 30 31]
    [32 33 34 35]]

   [[36 37 38 39]
    [40 41 42 43]
    [44 45 46 47]]]]



 [[[[48 49 50 51]
    [52 53 54 55]
    [56 57 58 59]]

   [[60 61 62 63]
    [64 65 66 67]
    [68 69 70 71]]]


  [[[72 73 74 75]
    [76 77 78 79]
    [80 81 82 83]]

   [[84 85 86 87]
    [88 89 90 91]
    [92 93 94 95]]]]]


## 改变数组的维度

In [170]:
n

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [171]:
n.ravel()  # flatten to 1-D; returns a view of n when possible, a copy only if one is needed

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [173]:
n.flatten()  # flatten to 1-D like ravel(), but always allocates memory and returns a copy

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [176]:
n.shape = (6,4)  #使用元组对数组进行reshape

In [177]:
n

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [178]:
n.transpose()  #行列转置

array([[ 0,  4,  8, 12, 16, 20],
       [ 1,  5,  9, 13, 17, 21],
       [ 2,  6, 10, 14, 18, 22],
       [ 3,  7, 11, 15, 19, 23]])

In [179]:
n.resize(2,12)  # changes the shape like reshape(), but resize() modifies n itself in place instead of returning a new array

In [180]:
n

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]])

## 数组的组合和分割

In [181]:
a = np.arange(9).reshape(3,3)

In [182]:
b = 2*a

In [183]:
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [225]:
len(a)

3

In [184]:
b

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

In [189]:
np.hstack((a,b))  #水平组合

array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16]])

In [186]:
np.hstack?

In [190]:
np.vstack((a,b))  #垂直组合

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

In [192]:
np.concatenate((a,b),axis=1)  #水平组合

array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16]])

In [193]:
np.concatenate((a,b),axis=0)  #垂直组合

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

In [194]:
np.dstack((a,b))  #深度组合，对应位置的要素两两组合

array([[[ 0,  0],
        [ 1,  2],
        [ 2,  4]],

       [[ 3,  6],
        [ 4,  8],
        [ 5, 10]],

       [[ 6, 12],
        [ 7, 14],
        [ 8, 16]]])

In [200]:
oned = np.arange(2) #一维数组 [0,1]

In [201]:
oned

array([0, 1])

In [203]:
twice_oned = 2*oned  #一维数组  [0,2]

In [204]:
twice_oned

array([0, 2])

In [205]:
np.column_stack((oned, twice_oned))  #一维数组列组合,等于把列组合起来

array([[0, 0],
       [1, 2]])

In [206]:
np.row_stack((oned, twice_oned))  #一维数组行组合，等于把行组合起来

array([[0, 1],
       [0, 2]])

In [207]:
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [208]:
b

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

In [209]:
np.column_stack((a,b))  #二维数组列组合等于hstack()

array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16]])

In [210]:
np.column_stack((a,b)) == np.hstack((a,b))

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]], dtype=bool)

In [211]:
np.row_stack((a,b)) # 二维数组行组合等于vstack()

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])

In [212]:
np.row_stack((a,b)) == np.vstack((a,b))

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

In [213]:
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [216]:
np.hsplit(a,3)  #水平分割

[array([[0],
        [3],
        [6]]), array([[1],
        [4],
        [7]]), array([[2],
        [5],
        [8]])]

In [217]:
np.vsplit(a,3)  #垂直分割

[array([[0, 1, 2]]), array([[3, 4, 5]]), array([[6, 7, 8]])]

In [218]:
np.split(a,3,axis=1)  # equivalent to the horizontal split np.hsplit(a,3)

[array([[0],
        [3],
        [6]]), array([[1],
        [4],
        [7]]), array([[2],
        [5],
        [8]])]

In [219]:
np.split(a,3,axis=0)  #等同于垂直分割

[array([[0, 1, 2]]), array([[3, 4, 5]]), array([[6, 7, 8]])]

In [221]:
c = np.arange(27).reshape(3,3,3)

In [222]:
c

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [227]:
len(c)

3

In [228]:
c.ndim

3

In [229]:
c.shape

(3, 3, 3)

In [230]:
c.size

27

In [231]:
d = np.arange(64).reshape(4,8,2)

In [232]:
d

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15]],

       [[16, 17],
        [18, 19],
        [20, 21],
        [22, 23],
        [24, 25],
        [26, 27],
        [28, 29],
        [30, 31]],

       [[32, 33],
        [34, 35],
        [36, 37],
        [38, 39],
        [40, 41],
        [42, 43],
        [44, 45],
        [46, 47]],

       [[48, 49],
        [50, 51],
        [52, 53],
        [54, 55],
        [56, 57],
        [58, 59],
        [60, 61],
        [62, 63]]])

In [233]:
len(d)

4

In [234]:
d.ndim

3

In [235]:
e = np.arange(60).reshape(3,2,2,5)

In [236]:
e

array([[[[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9]],

        [[10, 11, 12, 13, 14],
         [15, 16, 17, 18, 19]]],


       [[[20, 21, 22, 23, 24],
         [25, 26, 27, 28, 29]],

        [[30, 31, 32, 33, 34],
         [35, 36, 37, 38, 39]]],


       [[[40, 41, 42, 43, 44],
         [45, 46, 47, 48, 49]],

        [[50, 51, 52, 53, 54],
         [55, 56, 57, 58, 59]]]])

In [237]:
e.ndim

4

In [238]:
len(e)

3

In [223]:
np.dsplit(c,3)

[array([[[ 0],
         [ 3],
         [ 6]],
 
        [[ 9],
         [12],
         [15]],
 
        [[18],
         [21],
         [24]]]), array([[[ 1],
         [ 4],
         [ 7]],
 
        [[10],
         [13],
         [16]],
 
        [[19],
         [22],
         [25]]]), array([[[ 2],
         [ 5],
         [ 8]],
 
        [[11],
         [14],
         [17]],
 
        [[20],
         [23],
         [26]]])]