## pandas是基于Numpy构建的。目的是以numpy未中心的应用变得更加的简单。

In [5]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## Series类型
#### 这个类型就类似于一维数组对象。它是由一组数据以及一组与之相关的数据标签（索引）组成的。

In [3]:
obj = Series([1, 2, 3, 4, 5, 6])
print(obj)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [5]:
print(obj.values)

[1 2 3 4 5 6]


In [6]:
obj.index

RangeIndex(start=0, stop=6, step=1)

## 自定义索引

In [14]:
obj = Series(['a', 'b','c','d','e'], index=[1, 2, 3, 4, 5])
print(obj)

1    a
2    b
3    c
4    d
5    e
dtype: object


In [15]:
obj[1]

'a'

In [16]:
data ={'a': 1000, 'b': 2000, 'c': 30000}
obj = Series(data)
print(obj)

a     1000
b     2000
c    30000
dtype: int64


In [18]:
obj['a']

1000

In [19]:
print(obj.values)

[ 1000  2000 30000]


In [20]:
keys = ['a', 'c']
obj_1 = Series(data, index = keys)
print(obj_1)

a     1000
c    30000
dtype: int64


## 缺失数据处理

In [21]:
data ={'a': 1000, 'b': 2000, 'c': 30000}
obj = Series(data)
pd.isnull(obj)

a    False
b    False
c    False
dtype: bool

In [22]:
pd.notnull(obj)

a    True
b    True
c    True
dtype: bool

In [23]:
data = {'LiLei':None, 'HanMeimei':25, 'Tony':None, 'Jack': 50}
obj = Series(data)
print(obj)

LiLei         NaN
HanMeimei    25.0
Tony          NaN
Jack         50.0
dtype: float64


In [25]:
obj.name = "NameAndAge"
obj

LiLei         NaN
HanMeimei    25.0
Tony          NaN
Jack         50.0
Name: NameAndAge, dtype: float64

In [29]:
obj.index.name = "姓名"
# obj.values.name = "年龄"
obj

姓名
LiLei         NaN
HanMeimei    25.0
Tony          NaN
Jack         50.0
Name: NameAndAge, dtype: float64

### DataFram 类型
#### DataFram 是一个表格型的数据结构，它含有一组有序的列，每列可以是不同值的类型，数值、字符串、布尔值都可以
#### DataFram 本身有行索引，也有列索引
####  DataFram可以理解成是由Series组成的字典。

In [30]:
data = {
    '60年代': ['狗子', '嘎子', '二妞'],
    '70年代': ['卫国', '爱国', '建国'],
    '80年代': ['李雷', '韩梅梅', '张伟'],
}
frame_data = DataFrame(data)
frame_data

Unnamed: 0,60年代,70年代,80年代
0,狗子,卫国,李雷
1,嘎子,爱国,韩梅梅
2,二妞,建国,张伟


In [31]:
frame_data["70年代"]

0    卫国
1    爱国
2    建国
Name: 70年代, dtype: object

In [32]:
frame_data["70年代"][1]

'爱国'

In [37]:
import numpy as np
dates = pd.date_range('20200416', periods=6)
dates

DatetimeIndex(['2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19',
               '2020-04-20', '2020-04-21'],
              dtype='datetime64[ns]', freq='D')

In [39]:
df = pd.DataFrame(np.random.rand(6,4), index = dates, columns=list('ABCD'))

df


Unnamed: 0,A,B,C,D
2020-04-16,0.810615,0.612985,0.151667,0.110414
2020-04-17,0.158104,0.476214,0.947933,0.978418
2020-04-18,0.216514,0.563663,0.520706,0.907799
2020-04-19,0.847482,0.464116,0.131685,0.146822
2020-04-20,0.112272,0.950125,0.321698,0.924828
2020-04-21,0.909353,0.488854,0.121793,0.748937


In [40]:
df.transpose()

Unnamed: 0,2020-04-16,2020-04-17,2020-04-18,2020-04-19,2020-04-20,2020-04-21
A,0.810615,0.158104,0.216514,0.847482,0.112272,0.909353
B,0.612985,0.476214,0.563663,0.464116,0.950125,0.488854
C,0.151667,0.947933,0.520706,0.131685,0.321698,0.121793
D,0.110414,0.978418,0.907799,0.146822,0.924828,0.748937


In [41]:
df['20200416':'20200420']

Unnamed: 0,A,B,C,D
2020-04-16,0.810615,0.612985,0.151667,0.110414
2020-04-17,0.158104,0.476214,0.947933,0.978418
2020-04-18,0.216514,0.563663,0.520706,0.907799
2020-04-19,0.847482,0.464116,0.131685,0.146822
2020-04-20,0.112272,0.950125,0.321698,0.924828


In [43]:
df.loc['20200416':'20200420', ['A', 'B']]

Unnamed: 0,A,B
2020-04-16,0.810615,0.612985
2020-04-17,0.158104,0.476214
2020-04-18,0.216514,0.563663
2020-04-19,0.847482,0.464116
2020-04-20,0.112272,0.950125


In [44]:
df.head(2)

Unnamed: 0,A,B,C,D
2020-04-16,0.810615,0.612985,0.151667,0.110414
2020-04-17,0.158104,0.476214,0.947933,0.978418


In [45]:
df.tail(3)

Unnamed: 0,A,B,C,D
2020-04-19,0.847482,0.464116,0.131685,0.146822
2020-04-20,0.112272,0.950125,0.321698,0.924828
2020-04-21,0.909353,0.488854,0.121793,0.748937


### pandas的重新索引

In [46]:
obj = Series([3.4, 2.4, -2.3], index = ['a', 'b', 'c'])
obj

a    3.4
b    2.4
c   -2.3
dtype: float64

In [49]:
obj_1 = obj.reindex(['a', 'b','c','d','e'])
obj_1

a    3.4
b    2.4
c   -2.3
d    NaN
e    NaN
dtype: float64

In [51]:
obj_2 = obj.reindex(['a', 'b','c','d','e'], fill_value=0)
obj_2

a    3.4
b    2.4
c   -2.3
d    0.0
e    0.0
dtype: float64

In [52]:
obj = Series([3.4, 2.4, -2.3], index = [0, 2, 4])
obj

0    3.4
2    2.4
4   -2.3
dtype: float64

In [53]:
# 向前填充
obj.reindex(range(6), method='ffill')

0    3.4
1    3.4
2    2.4
3    2.4
4   -2.3
5   -2.3
dtype: float64

In [54]:
# 向后填充
obj.reindex(range(6), method='bfill')

0    3.4
1    2.4
2    2.4
3   -2.3
4   -2.3
5    NaN
dtype: float64

## 算数运算和数据对齐

In [55]:
d1 = Series([1.3,2.3,3.3, -4.5], index=['a', 'b','c','d'])
d1

a    1.3
b    2.3
c    3.3
d   -4.5
dtype: float64

In [57]:
d2= Series([-1.3,-2.3,-3.3, 4.9, 9.4], index=['a', 'b','c','d', 'e'])
d2

a   -1.3
b   -2.3
c   -3.3
d    4.9
e    9.4
dtype: float64

In [58]:
d1 +d2

a    0.0
b    0.0
c    0.0
d    0.4
e    NaN
dtype: float64

In [61]:
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list('abc'), index=[1, 2, 3])
df1

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8


In [63]:
df2 = DataFrame(np.arange(12).reshape(4,3), columns=list('cde'), index=[1, 2, 3,4])
df2

Unnamed: 0,c,d,e
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11


In [64]:
df1 + df2 # 在行和列上都发生了操作，没有重叠的部分都是空

Unnamed: 0,a,b,c,d,e
1,,,2.0,,
2,,,8.0,,
3,,,14.0,,
4,,,,,


In [66]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
1,0.0,1.0,2.0,1.0,2.0
2,3.0,4.0,8.0,4.0,5.0
3,6.0,7.0,14.0,7.0,8.0
4,,,9.0,10.0,11.0


In [67]:
### 算数方法  add 加法    sub  减法   div  除法   mul  乘法

## 排序

In [68]:
obj = Series(range(4), index=['a', 'd', 'c', 'e'])
obj

a    0
d    1
c    2
e    3
dtype: int64

In [69]:
obj.sort_index()

a    0
c    2
d    1
e    3
dtype: int64

In [70]:
obj.sort_values()

a    0
d    1
c    2
e    3
dtype: int64

## 针对DataFrame的排序

In [72]:
frame = DataFrame(np.arange(8).reshape(2,4), index=['two', 'one'], columns=['c', 'a', 'b', 'd'])
frame

Unnamed: 0,c,a,b,d
two,0,1,2,3
one,4,5,6,7


In [73]:
frame.sort_index()

Unnamed: 0,c,a,b,d
one,4,5,6,7
two,0,1,2,3


In [74]:
frame.sort_index(axis=1)  # axis=1  按列轴排序，默认为0

Unnamed: 0,a,b,c,d
two,1,2,0,3
one,5,6,4,7


In [75]:
frame = DataFrame({
    'b':[1, 3 ,5 ,2 ,8],
    'a':[0, 31,2,1, 9 ]
})
frame

Unnamed: 0,b,a
0,1,0
1,3,31
2,5,2
3,2,1
4,8,9


In [77]:
frame.sort_values(by="b")

Unnamed: 0,b,a
0,1,0
3,2,1
1,3,31
2,5,2
4,8,9


In [6]:
data = Series(np.random.randn(9), index=[['a','a','a','b','b','b','c','c','c'],[1, 2, 3, 1,2,3,1,2,3]])
data



a  1    0.103819
   2   -1.154700
   3   -0.210779
b  1    2.030430
   2    0.182221
   3   -0.782418
c  1   -1.328695
   2    0.779805
   3   -1.486001
dtype: float64

In [79]:
data['a']

1   -0.144172
2   -1.233142
3   -1.069355
dtype: float64

In [80]:
data['a'][1]

-0.14417156317111446

In [81]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('c', 3)],
           )

In [7]:
data[:, 2]

a   -1.154700
b    0.182221
c    0.779805
dtype: float64

## pandas 文本格式数据处理