<a href="https://colab.research.google.com/github/jackqk/pandas-note/blob/master/Pandas_Series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Series Data Structure

In [0]:
import pandas as pd
# Uncomment the next line to view the Series docstring:
# pd.Series?

# 一、创建Series

## 从list创建
自动帮你生成从0开始的整数索引

In [0]:
# A list of strings becomes a Series with dtype object
# and an automatic 0-based integer index.
animals = [
    'tiger',
    'bear',
    'moose',
]
pd.Series(data=animals)

0    tiger
1     bear
2    moose
dtype: object

In [0]:
# A list of integers becomes a Series with an integer dtype.
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [0]:
# Supply explicit index labels instead of the default 0..n-1 integers.
s = pd.Series(
    data=['tiger', 'bear', 'moose'],
    index=['India', 'America', 'Cannad'],
)
s

India      tiger
America     bear
Cannad     moose
dtype: object

## 从dictionary创建
可以给索引命名

In [0]:
# Build a Series from a dict: the keys become the index labels
# and the values become the data.
sports = {
    'Archery': 'Bhutan',
    'Golf': 'Scotland',
    'Sumo': 'Japan',
    'Taekwondo': 'South Koear',
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Koear
dtype: object

In [0]:
# Inspect the index labels of the Series.
s.index

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

## pandas对python的None处理

In [0]:
# With string data, None is kept as-is and the dtype is object.
animals = [
    'tiger',
    'bear',
    None,
]
pd.Series(data=animals)

0    tiger
1     bear
2     None
dtype: object

In [0]:
# With numeric data, None is converted to NaN and the dtype becomes float64.
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [0]:
# NaN is a peculiar value: it compares unequal to everything, including itself.
import numpy as np

print(np.nan == None)    # False: NaN is not None
print(np.nan == np.nan)  # False: NaN never equals itself
print(np.isnan(np.nan))  # True: np.isnan() is the way to test for NaN

False
False
True


# 二、查询/选择

## 像list，dict那样查询

In [0]:
# Sample Series (sport -> country) used by the lookup examples that follow.
sports = {
    'Archery': 'Bhutan',
    'Golf': 'Scotland',
    'Sumo': 'Japan',
    'Taekwondo': 'South Koear',
}
s = pd.Series(data=sports)

In [0]:
# Positional lookup (like indexing a list).
# Caveat: when the index itself holds integers, this style of lookup stops working.
# NOTE(review): recent pandas releases deprecate integer keys being treated as
# positions on a non-integer index; s.iloc[0] is the explicit form - confirm
# against the pandas version in use.
s[0]

'Bhutan'

In [0]:
# Look up by index label (like a dict key).
s['Golf']

'Scotland'

## iloc、loc查询
通过loc赋值时，如果传入的标签不存在，则会新增该标签对应的一项

In [0]:
# Look up by integer position (0-based).
s.iloc[1]

'Scotland'

In [0]:
# Look up by index label.
s.loc['Golf']

'Scotland'

## 当索引是整数时出现的问题

In [0]:
# A Series whose index labels are themselves integers (99..102).
sports = {
    99: 'Bhutan',
    100: 'Scotland',
    101: 'Japan',
    102: 'South Korea',
}
s = pd.Series(data=sports)

In [0]:
# With an integer index, s[0] is looked up as a label rather than a position,
# so list-style access fails. The KeyError is the point of this cell; catch and
# print it so the notebook still survives "Restart & Run All".
try:
    s[0]
except KeyError as err:
    print('KeyError:', err)

In [0]:
# Positional access through .iloc still works when the index holds integers.
s.iloc[0]

'Bhutan'

## 选择多个index

In [0]:
# Two Series to combine: one with unique labels, and one in which every
# entry carries the same label 'Cricket'.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# join Series and, like append, returns a new object without touching its inputs.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [0]:
# Several entries share the label 'Cricket', so a label lookup returns all of them.
all_countries['Cricket']

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [0]:
# Slicing selects several entries at once (here the first three, by position).
all_countries[0:3]

Archery      Bhutan
Golf       Scotland
Sumo          Japan
dtype: object

In [0]:
# Select several specific labels at once by passing a list of labels to .loc.
all_countries.loc[['Golf', 'Sumo']]


# 三、Operation

## 遍历
在遍历前，先想想能否使用向量化运算，因为向量化通常更快

**方法一：**<br>

In [0]:
# Iterating over a Series yields its values by default.
# s.keys() can be used instead to iterate over the index labels.
s = pd.Series([100.0, 110.0, 130.0,22.0])
for item in s:
    print(item)

100.0
110.0
130.0
22.0


**方法二：**

In [0]:
# Iterate over (label, value) pairs and add 20 to each entry.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent and is also available in older pandas versions.
for key, value in s.items():
    s.at[key] = value + 20
s

0    120.0
1    130.0
2    150.0
3     42.0
dtype: float64

## 数学运算

In [0]:
# Fresh float Series for the arithmetic examples.
s = pd.Series(data=[100.0, 110.0, 130.0, 22.0])
s

0    100.0
1    110.0
2    130.0
3     22.0
dtype: float64

In [0]:
# NumPy reductions accept a Series directly.
np.sum(s)

362.0

In [0]:
# Adding a scalar is applied to every element (in place here).
s += 2
s

0    102.0
1    112.0
2    132.0
3     24.0
dtype: float64

In [0]:
# Element-wise power: square every value.
s = s ** 2
s

0    10404.0
1    12544.0
2    17424.0
3      576.0
dtype: float64

## 关于速度

In [0]:
# 10,000 random integers in [0, 1000) for the timing comparison.
# Seed NumPy's global RNG first so the sample is reproducible on every run.
np.random.seed(42)
s = pd.Series(np.random.randint(0, 1000, 10000))
s.head()

0    567
1    553
2    820
3    342
4    420
dtype: int64

In [0]:
%%timeit -n 100
# Time a plain Python loop: the body runs 100 times per repeat and the best time is reported.
summary = 0
for item in s:
  summary += item

100 loops, best of 3: 1.08 ms per loop


In [0]:
%%timeit -n 100
# Time the vectorized sum of the same Series for comparison with the Python loop.
summary = np.sum(s)

100 loops, best of 3: 130 µs per loop


## append


In [0]:
# Two Series to combine: one with unique labels, and one in which every
# entry carries the same label 'Cricket'.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# Series.append was removed in pandas 2.0. pd.concat appends one Series to
# another in the same way: it returns a new Series and leaves both inputs unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [0]:
# Combining produced a new Series; original_sports itself was not modified.
original_sports

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [0]:
# The second input Series, displayed after the combine step.
cricket_loving_countries

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [0]:
# The combined result: entries of original_sports followed by the 'Cricket' entries.
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object