# <font face = "微软雅黑" color = blue size = 6>实验名称</font>

## pandas的基本操作（一）

# <font face = "微软雅黑" color = blue size = 6>实验目的</font>

## 通过该实验的实践，要求学生可以掌握Pandas的数据结构Series、DataFrame，能够使用Pandas进行数据的存取、处理、运算、统计等基本操作

# <font face = "微软雅黑" color = blue size = 6>实验背景</font>

## Pandas是一个强大的分析结构化数据的工具集；它的使用基础是Numpy（提供高性能的矩阵运算）；用于数据挖掘和数据分析，同时也提供数据清洗功能。

# <font face = "微软雅黑" color = blue size = 6>实验原理</font>

## Python Data Analysis Library 或 pandas 是基于 NumPy 的一种工具，该工具是为了解决数据分析任务而创建的，通过数据结构Series、DataFrame实现功能：

## 1、Series是一种类似于一维数组的对象，是由一组数据(各种NumPy数据类型)以及一组与之相关的数据标签(即索引)组成。仅由一组数据也可产生简单的Series对象

## 2、DataFrame是Pandas中的一个表格型的数据结构，包含有一组有序的列，每列可以是不同的值类型(数值、字符串、布尔型等)，DataFrame既有行索引也有列索引，可以被看做是由Series组成的字典

# <font face = "微软雅黑" color = blue size = 6>实验环境</font>

## Windows操作系统 64bits
## python 3.6以上
## numpy1.17以上
## pandas1.0.2

# <font face = "微软雅黑" color = blue size = 6>实验步骤</font>

## 2 pandas数据结构之Series

## 2.1 创建Series

In [1]:
# 导入pandas和numpy
!pip install  numpy
!pip install  pandas
import pandas as pd
import numpy as np



### 2.1.1 从ndarray创建Series

In [2]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0.312709
b   -1.539938
c    0.755102
d   -0.361479
e   -1.987013
dtype: float64

In [3]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# 不传入索引
t = pd.Series(np.random.randn(5))
t

0    1.306700
1    0.098516
2    0.888152
3   -0.244478
4    0.544185
dtype: float64

In [5]:
t.index

RangeIndex(start=0, stop=5, step=1)

### 2.1.2 从字典或列表创建Series

In [6]:
d = {'a' : 1, 'b' : 3, 'c' :5}
# 不传入索引
pd.Series(d)

a    1
b    3
c    5
dtype: int64

In [7]:
# 传入索引，以索引为准
pd.Series(d,index=['a','b','c','d','e'])

a    1.0
b    3.0
c    5.0
d    NaN
e    NaN
dtype: float64

In [8]:
# 列表
s = pd.Series([x for x in range(1,5)],list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

In [9]:
t = pd.Series([x for x in range(1,5)])
t

0    1
1    2
2    3
3    4
dtype: int64

### 2.1.3 从标量创建

In [10]:
pd.Series(999)

0    999
dtype: int64

In [11]:
pd.Series(999,['a','b','c','d'])

a    999
b    999
c    999
d    999
dtype: int64

## 2.2 对Series的操作

### 2.2.1 Series和ndarray相似的操作

In [12]:
s = pd.Series(np.random.randn(3), index=['a', 'b', 'c'])
s

a    0.671225
b   -0.765733
c    1.416107
dtype: float64

In [13]:
# Positional access: use .iloc so the integer is never interpreted as a label
# (plain s[0] on a string-labelled Series is deprecated in newer pandas).
s.iloc[0]

0.6712250924990213

In [14]:
s['b']

-0.7657329456110621

In [15]:
s['c'] = 999
s

a      0.671225
b     -0.765733
c    999.000000
dtype: float64

In [16]:
'a' in s

True

In [17]:
'f' in s

False

In [18]:
# 预期报错：索引中不存在标签'f'，直接取值会抛出KeyError
s['f']

KeyError: 'f'

In [19]:
s.get('b')

-0.7657329456110621

In [20]:
s.get('f')

In [21]:
s.get('f','f不在索引列表中')

'f不在索引列表中'

In [22]:
s

a      0.671225
b     -0.765733
c    999.000000
dtype: float64

In [23]:
s[:3]

a      0.671225
b     -0.765733
c    999.000000
dtype: float64

In [24]:
s[:'c']

a      0.671225
b     -0.765733
c    999.000000
dtype: float64

In [25]:
s[1:3]

b     -0.765733
c    999.000000
dtype: float64

In [26]:
# Select several elements by position. Use .iloc explicitly: plain s[[1, 2]]
# only falls back to positions on older pandas versions when the index holds
# no integers, and that fallback is deprecated.
s.iloc[[1, 2]]

b     -0.765733
c    999.000000
dtype: float64

In [27]:
s[s>s.median()]

c    999.0
dtype: float64

### 2.2.2 向量化运算

In [28]:
s = pd.Series([1,2,3,4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [29]:
s + s

0    2
1    4
2    6
3    8
dtype: int64

In [30]:
s * 3

0     3
1     6
2     9
3    12
dtype: int64

In [31]:
np.sqrt(s)

0    1.000000
1    1.414214
2    1.732051
3    2.000000
dtype: float64

In [32]:
np.square(s)

0     1
1     4
2     9
3    16
dtype: int64

In [33]:
s[s>s.median()]

2    3
3    4
dtype: int64

### 2.2.3 类似字典的操作

In [34]:
s = pd.Series([x for x in range(1,10)],list('abcdefghi'))
s

a    1
b    2
c    3
d    4
e    5
f    6
g    7
h    8
i    9
dtype: int64

In [35]:
# 列表解析
[x for x in range(1,10)]

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [36]:
# list函数把字符串变成列表
list('abcdefghi')

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

In [37]:
s['a']

1

In [38]:
s['a'] = 999
s

a    999
b      2
c      3
d      4
e      5
f      6
g      7
h      8
i      9
dtype: int64

In [39]:
'i' in s

True

In [40]:
'j' in s

False

In [41]:
# 预期报错：索引中不存在标签'j'，直接取值会抛出KeyError
s['j']

KeyError: 'j'

In [42]:
s.get('j')

In [43]:
s.get('j','索引中不存在j')

'索引中不存在j'

### 2.2.4 时间序列操作

In [44]:
# 生成从2018年9月1日开始，10天的时间序列索引，频率为10分钟
rng = pd.date_range('9/1/2018', periods=1440, freq='10Min')
rng[:5]

DatetimeIndex(['2018-09-01 00:00:00', '2018-09-01 00:10:00',
               '2018-09-01 00:20:00', '2018-09-01 00:30:00',
               '2018-09-01 00:40:00'],
              dtype='datetime64[ns]', freq='10T')

In [45]:
ts = pd.Series(np.random.randn(1440),index = rng)
ts.head()

2018-09-01 00:00:00   -0.019110
2018-09-01 00:10:00   -0.542676
2018-09-01 00:20:00   -0.110572
2018-09-01 00:30:00    0.676665
2018-09-01 00:40:00    1.990581
Freq: 10T, dtype: float64

In [46]:
ts.count()

1440

In [47]:
ts[:5]

2018-09-01 00:00:00   -0.019110
2018-09-01 00:10:00   -0.542676
2018-09-01 00:20:00   -0.110572
2018-09-01 00:30:00    0.676665
2018-09-01 00:40:00    1.990581
Freq: 10T, dtype: float64

In [48]:
ts[:10:2]

2018-09-01 00:00:00   -0.019110
2018-09-01 00:20:00   -0.110572
2018-09-01 00:40:00    1.990581
2018-09-01 01:00:00   -1.293651
2018-09-01 01:20:00    0.513392
Freq: 20T, dtype: float64

In [49]:
# 改变时间频率
converted = ts.asfreq('30Min')
converted.head()

2018-09-01 00:00:00   -0.019110
2018-09-01 00:30:00    0.676665
2018-09-01 01:00:00   -1.293651
2018-09-01 01:30:00    0.492857
2018-09-01 02:00:00   -1.487883
Freq: 30T, dtype: float64

In [50]:
ss = ts.resample('D')
ss

<pandas.core.resample.DatetimeIndexResampler object at 0x0000020C4AD65CD0>

In [51]:
# 按天粒度汇总
resampled = ts.resample('D').count()
resampled

2018-09-01    144
2018-09-02    144
2018-09-03    144
2018-09-04    144
2018-09-05    144
2018-09-06    144
2018-09-07    144
2018-09-08    144
2018-09-09    144
2018-09-10    144
Freq: D, dtype: int64

In [52]:
# 对时间序列数据进行索引
resampled[:5]

2018-09-01    144
2018-09-02    144
2018-09-03    144
2018-09-04    144
2018-09-05    144
Freq: D, dtype: int64

In [53]:
resampled[::2]

2018-09-01    144
2018-09-03    144
2018-09-05    144
2018-09-07    144
2018-09-09    144
Freq: 2D, dtype: int64

In [54]:
# 时间字符串索引
ts['9/6/2018'][140:]

2018-09-06 23:20:00    0.225587
2018-09-06 23:30:00   -0.252573
2018-09-06 23:40:00   -0.790751
2018-09-06 23:50:00   -3.544603
Freq: 10T, dtype: float64

In [55]:
# datetime类型的索引
from datetime import datetime
ts[datetime(2018,9,9):][::60]

2018-09-09 00:00:00   -0.314764
2018-09-09 10:00:00   -0.754708
2018-09-09 20:00:00   -0.148387
2018-09-10 06:00:00    0.540702
2018-09-10 16:00:00    0.817701
Freq: 600T, dtype: float64

## 3 pandas数据结构之DataFrame

## 3.1 DataFrame的创建

### 3.1.1 从Series or dicts创建

In [56]:
# 创建一个字典对象，字典的值为一个一维序列
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
pd.DataFrame(d)

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [57]:
# 将字典转换成DataFrame,字典的键将会成为DataFrame的column's name
pd.DataFrame(d,index =['a','b','c'])

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


In [58]:
# 指定索引名称，生成DataFrame
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [59]:
# 指定索引名称，生成DataFrame
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


### 3.1.2 从ndarrays或lists的字典创建

In [60]:
# list的长度需要一致,组成字典
d = {'one':[1,2,3,4,5,6,7],
    'two':['a','b','c','d','e','f','g'],
    'three':[False,True,True,True,False,False,True]}
pd.DataFrame(d)

Unnamed: 0,one,two,three
0,1,a,False
1,2,b,True
2,3,c,True
3,4,d,True
4,5,e,False
5,6,f,False
6,7,g,True


In [61]:
pd.DataFrame(d,columns=['one','two','three'])

Unnamed: 0,one,two,three
0,1,a,False
1,2,b,True
2,3,c,True
3,4,d,True
4,5,e,False
5,6,f,False
6,7,g,True


In [62]:
# 如果传入索引，则索引的长度必须和ndarray数组或list的长度一致 
pd.DataFrame(d,columns=['one','three'],index = list('abcdefg'))

Unnamed: 0,one,three
a,1,False
b,2,True
c,3,True
d,4,True
e,5,False
f,6,False
g,7,True


### 3.1.3 从结构化或成对的array/list创建

In [63]:
# from ndarray
array = np.array([[1,2,3,4],[4,5,6,7]])
array

array([[1, 2, 3, 4],
       [4, 5, 6, 7]])

In [64]:
pd.DataFrame(array,columns=list('abcd'))

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,4,5,6,7


In [65]:
array = np.arange(1,13).reshape(3,4)
array

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [66]:
a = [[[1,2],[3,4]],[[5,6],[7,8]]]
pd.DataFrame(a)

Unnamed: 0,0,1
0,"[1, 2]","[3, 4]"
1,"[5, 6]","[7, 8]"


In [67]:
array = np.array(a)
array

array([[[1, 2],
        [3, 4]],

       [[5, 6],
        [7, 8]]])

In [68]:
# 预期报错：DataFrame只接受二维输入，传入三维数组会抛出ValueError
pd.DataFrame(np.array(a))

ValueError: Must pass 2-d input. shape=(2, 2, 2)

In [69]:
# from list
c = [[1,2,3,4],[4,5,6,7]]

In [70]:
pd.DataFrame(c,index=list('xy'),columns=list('abcd'))

Unnamed: 0,a,b,c,d
x,1,2,3,4
y,4,5,6,7


In [71]:
pd.DataFrame(c)

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,4,5,6,7


### 3.1.4 从字典的列表创建

In [72]:
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [73]:
pd.DataFrame(data,columns=['a','b'],index=['first','second'])

Unnamed: 0,a,b
first,1,2
second,5,10


## 3.2 变量选择、添加和删除

In [74]:
df = pd.DataFrame({'one':[1,2,3,4,5,6],
                 'two':[2,3,4,5,6,7]})
df

Unnamed: 0,one,two
0,1,2
1,2,3
2,3,4
3,4,5
4,5,6
5,6,7


In [75]:
# 查看某列
df['one']

0    1
1    2
2    3
3    4
4    5
5    6
Name: one, dtype: int64

In [76]:
# 选择满足一定条件的记录
df[df['one'] > 3]

Unnamed: 0,one,two
3,4,5
4,5,6
5,6,7


In [77]:
# 基于已有列添加新列
df['three'] = df['one'] + df['two']
df['four'] = df['one'] > 2
df

Unnamed: 0,one,two,three,four
0,1,2,3,False
1,2,3,5,False
2,3,4,7,True
3,4,5,9,True
4,5,6,11,True
5,6,7,13,True


In [78]:
# 添加一个新的列，并赋值一个标量，
#则会自动为整个列赋值为该标量值
df['five'] = 9999
df

Unnamed: 0,one,two,three,four,five
0,1,2,3,False,9999
1,2,3,5,False,9999
2,3,4,7,True,9999
3,4,5,9,True,9999
4,5,6,11,True,9999
5,6,7,13,True,9999


In [79]:
df['six'] = pd.Series(['b','c','d'],index = [1,2,3])
df

Unnamed: 0,one,two,three,four,five,six
0,1,2,3,False,9999,
1,2,3,5,False,9999,b
2,3,4,7,True,9999,c
3,4,5,9,True,9999,d
4,5,6,11,True,9999,
5,6,7,13,True,9999,


In [80]:
df.insert(2,'insert_col2',list('abcdef'))
df

Unnamed: 0,one,two,insert_col2,three,four,five,six
0,1,2,a,3,False,9999,
1,2,3,b,5,False,9999,b
2,3,4,c,7,True,9999,c
3,4,5,d,9,True,9999,d
4,5,6,e,11,True,9999,
5,6,7,f,13,True,9999,


In [81]:
# 选择满足一定条件的记录
df[df['one'] > 3]

Unnamed: 0,one,two,insert_col2,three,four,five,six
3,4,5,d,9,True,9999,d
4,5,6,e,11,True,9999,
5,6,7,f,13,True,9999,


In [82]:
three = df.pop('three')
three

0     3
1     5
2     7
3     9
4    11
5    13
Name: three, dtype: int64

In [83]:
del df['two']
df

Unnamed: 0,one,insert_col2,four,five,six
0,1,a,False,9999,
1,2,b,False,9999,b
2,3,c,True,9999,c
3,4,d,True,9999,d
4,5,e,True,9999,
5,6,f,True,9999,


In [84]:
# 当为新的列赋值为Series对象时，
# 将会根据索引来进行匹配，没有匹配到索引的，将会填充为NaN
df['series'] = pd.Series(['b','c','d'],index = [1,2,3])
df

Unnamed: 0,one,insert_col2,four,five,six,series
0,1,a,False,9999,,
1,2,b,False,9999,b,b
2,3,c,True,9999,c,c
3,4,d,True,9999,d,d
4,5,e,True,9999,,
5,6,f,True,9999,,


In [85]:
# 向当前数据框的指定位置插入一列
df.insert(2,'insert_col',list('abcdef'))
df

Unnamed: 0,one,insert_col2,insert_col,four,five,six,series
0,1,a,a,False,9999,,
1,2,b,b,False,9999,b,b
2,3,c,c,True,9999,c,c
3,4,d,d,True,9999,d,d
4,5,e,e,True,9999,,
5,6,f,f,True,9999,,
