## 5.2.6 Function Application and Mapping 函数应用和映射
#### NumPy的通用函数（ufunc，逐元素的数组方法）也可以直接作用于pandas对象

In [1]:
import numpy as np
import pandas as pd  ## 默认导入模式
from pandas import Series,DataFrame  ## 常用的两个工具 
frame=pd.DataFrame(np.random.randn(4,3),columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.453511,1.382404,-0.974482
Ohio,0.137393,-1.454968,0.166101
Texas,1.027346,1.180105,-0.030648
Oregon,1.029182,0.471175,1.507136


In [2]:
np.abs(frame)  # 取绝对值

Unnamed: 0,b,d,e
Utah,1.453511,1.382404,0.974482
Ohio,0.137393,1.454968,0.166101
Texas,1.027346,1.180105,0.030648
Oregon,1.029182,0.471175,1.507136


#### 如果想计算每一列的“最大值-最小值”（极差）
#### 也就是要对每一列执行同一个函数操作
### 用apply

In [3]:
def f(x):
    """Return the spread of x, i.e. its maximum minus its minimum.

    x is any object exposing .max() and .min(); in this notebook it is
    passed to DataFrame.apply, which supplies one column or row at a time.
    """
    return x.max() - x.min()
frame.apply(f)

b    2.482693
d    2.837372
e    2.481618
dtype: float64

In [4]:
frame.apply(f,axis='columns')
#  axis='columns'：沿着列的方向（横向）计算
#  也就是对每一行调用一次f，得到每行的最大值-最小值

Utah      2.835915
Ohio      1.621069
Texas     1.210752
Oregon    1.035961
dtype: float64

#### 传给apply的函数不一定返回标量，也可以返回带有多个值的Series，比如同时留下最小值和最大值

In [6]:
def f(x):
    """Summarize x as a two-entry Series labelled 'min' and 'max'.

    Meant to be handed to DataFrame.apply, which calls it once per column.
    """
    smallest = x.min()
    largest = x.max()
    return pd.Series([smallest, largest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.453511,-1.454968,-0.974482
max,1.029182,1.382404,1.507136


#### frame里面都是浮点数
#### 这个浮点数保留两位，变成字符串
### 用applymap

In [8]:
# Formatter that renders a number as a string with two decimal places.
# NOTE(review): the name `format` shadows the Python builtin; it is kept
# unchanged because a later cell calls frame['e'].map(format).
format=lambda x:'%.2f' % x
# applymap applies the function element-wise, i.e. to every single value
# in the DataFrame (not once per column as apply does).
# NOTE(review): pandas 2.1+ deprecates applymap in favour of DataFrame.map;
# confirm against the pandas version in use before switching.
frame.applymap(format)
# By contrast, Series.map applies a function element-wise to one column,
# i.e. to a single Series.

Unnamed: 0,b,d,e
Utah,-1.45,1.38,-0.97
Ohio,0.14,-1.45,0.17
Texas,1.03,1.18,-0.03
Oregon,1.03,0.47,1.51


In [10]:
frame['e'].map(format)
#  map方法

Utah      -0.97
Ohio       0.17
Texas     -0.03
Oregon     1.51
Name: e, dtype: object

## 5.2.7 Sorting and Ranking 排序和排名
### 按索引排序，用sort_index
### 按值排序，用sort_values

In [11]:
obj=pd.Series(range(4),index=['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [16]:
# 默认按axis=0排序
frame=pd.DataFrame(np.arange(8).reshape((2,4)),
                  index=['three','one'],
                  columns=['d','a','b','c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [17]:
# 给axis=1排序
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [19]:
# 默认升序，降序为ascending=False
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [20]:
obj=pd.Series([4,7,-3,2])
# 默认index为0123...
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

#### 缺失值会被排序至Series的尾部

In [22]:
obj=pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

#### 给DataFrame,就是表格排序
#### sort_values(by=排序的根据)

In [23]:
frame=pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [24]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [25]:
frame.sort_values(by=['a','b'])
# 排序优先次序，a然后b

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [29]:
obj=pd.Series([7,-5,7,4,2,0,4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [32]:
obj.rank()
# 按值从小到大
# 排名，1：-5排在第一位------1.0
#  5：0排在第二位------2.0
# 4：2排在第三位-----3.0
#  6：4，3：4相等，并列第四，也就是第四和第五，所以都是4.5位
# 以此类推

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [33]:
obj.rank(method='first')
# 3和6对应的值都是4
# method=first 就是先看到的优先
# 也就是3：4排在第四位
# 6：4排在第五位

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [34]:
obj.rank(ascending=False,method='max')
# 首先是反序，由大到小
# [7,-5,7,4,2,0,4]
# 0:7和2：7，最大，在第一位和第二位
# method=max就是取最大的值
# 也就是两个排名都是2

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [35]:
frame=pd.DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [37]:
frame.rank(axis='columns')
# 明显是abc对应的三个数的排名

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


## 5.2.8 Axis Indexes with Duplicate Labels 含有重复标签的轴索引

In [38]:
# 如果索引有重复的，比如两个a，两个b
obj=pd.Series(range(5),index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [39]:
# 判断索引元素是否唯一
obj.index.is_unique

False

In [40]:
obj['a']

a    0
a    1
dtype: int32

In [41]:
obj['c']  # 结果为数

4

In [43]:
# DataFrame同理：构造一个4×3、行索引含有重复标签的表
df=pd.DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0.153929,0.312893,1.02574
a,1.202744,-1.136753,-0.202847
b,-0.948271,1.407901,-1.747372
b,-0.681919,-2.010403,0.672949


In [44]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.948271,1.407901,-1.747372
b,-0.681919,-2.010403,0.672949


## 5.3 Summarizing and Computing Descriptive Statistics 描述性统计的概述与计算
#### 主要是进行统计运算时，对缺失值的处理

In [45]:
df=pd.DataFrame([[1.4,np.nan],[7.1,-4.5],
                [np.nan,np.nan],[0.75,-1.3]],
                index=['a','b','c','d'],
                columns=['one','two'])
#  建立一个有缺失值的表
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [46]:
df.sum()
# 默认skipna=True：求和时自动跳过NaN（效果上相当于把NaN当作0）
# 默认沿axis=0方向计算，竖着加，每一列得到一个和

one    9.25
two   -5.80
dtype: float64

In [48]:
df.sum(axis='columns')
# 沿axis=1方向计算，横着加；c行全是NaN，跳过之后和为0.00，印证了NaN默认被跳过

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [49]:
# 如果不想自动跳过缺失值，则加入参数skipna=False，此时只要含有NaN，结果就是NaN
df.mean(axis='columns',skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [50]:
# 最大值的索引值，one这一列，最大值为7.10，对应索引为b
df.idxmax()

one    b
two    d
dtype: object

In [51]:
df.cumsum()
# 积累型计算
# 1.4
# 1.4+7.1=8.5
# NaN
# 1.4+7.1+0.75=9.25

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [52]:
# 一次性产生多个汇总统计
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [53]:
# 对于非数值型数据，产生另一种汇总统计
obj=pd.Series(['a','a','b','c']*4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [55]:
obj.describe()
#  共16个元素
#  元素abc三个
#  top最多的是a
#  frequency,重复的次数是8

count     16
unique     3
top        a
freq       8
dtype: object

## 5.3.1 Correlation and Covariance 相关性和协方差

In [3]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame  ## 常用的两个工具 
import pandas_datareader.data as web
all_data={ticker:web.get_data_yahoo(ticker)
         for ticker in ['AAPL','IBM','MSFT','GOOG']}
# 下载股票代号ticker为这几个的数据

#### 取出四只股票代码和价格，通过字典生成一个DataFrame
#### 其中字典的key值来自all_data里面key的值
#### value值是all_data里面DataFrame的一列
#### 这里用的是Adj Close

In [13]:
price=pd.DataFrame({ticker:data['Adj Close']
                   for ticker,data in all_data.items()})
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-10-12,25.782785,121.033043,42.713379,646.669983
2015-10-13,25.826679,119.815773,42.613407,652.299988
2015-10-14,25.461657,120.128113,42.422577,651.159973
2015-10-15,25.842854,120.192162,42.722469,661.739990
2015-10-16,25.653408,120.432404,43.176865,662.200012
2015-10-19,25.812820,119.495476,43.276833,666.099976
2015-10-20,26.284117,112.624611,43.413151,650.280029
2015-10-21,26.281809,112.848816,42.895134,642.609985
2015-10-22,26.683798,115.387383,43.649437,651.789978
2015-10-23,27.510878,115.859840,48.048008,702.000000


In [14]:
## 取出四只股票当日成交量
volume=pd.DataFrame({ticker:data['Volume']
                   for ticker,data in all_data.items()})
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-10-12,121868800.0,3227600.0,19769100.0,1275200
2015-10-13,132197200.0,3915700.0,19987800.0,1807700
2015-10-14,177849600.0,3353000.0,24697800.0,1415500
2015-10-15,150694000.0,3476200.0,27189400.0,1885700
2015-10-16,156930400.0,3483700.0,26450300.0,1611100
2015-10-19,119036800.0,7954700.0,29387600.0,1477300
2015-10-20,195871200.0,16025600.0,30802200.0,2498200
2015-10-21,167180800.0,6990600.0,25144300.0,1791100
2015-10-22,166616400.0,5583200.0,56637100.0,4071000
2015-10-23,237467600.0,5370400.0,135227100.0,6653900


#### 计算股票变动百分比

In [6]:
returns=price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-10-05,0.030791,0.011943,0.020321,0.018925
2020-10-06,-0.028669,-0.000328,-0.021247,-0.021924
2020-10-07,0.016967,0.017217,0.019037,0.004713
2020-10-08,-0.000956,0.059805,0.003574,0.017558
2020-10-09,0.017396,-0.028139,0.024836,0.019712


#### 两列的相关性

In [15]:
returns['MSFT'].corr(returns['IBM'])
# 或者returns.MSFT.corr(returns.IBM)

0.5678029695586666

#### 两列的协方差

In [16]:
returns['MSFT'].cov(returns['IBM'])

0.00016064729517552268

#### 返回四只股票价格变动之间的相关性矩阵

In [18]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.493234,0.71047,0.65728
IBM,0.493234,1.0,0.567803,0.531809
MSFT,0.71047,0.567803,1.0,0.784208
GOOG,0.65728,0.531809,0.784208,1.0


#### 返回四只股票的协方差矩阵

In [19]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000352,0.000149,0.000234,0.000204
IBM,0.000149,0.000259,0.000161,0.000142
MSFT,0.000234,0.000161,0.000308,0.000228
GOOG,0.000204,0.000142,0.000228,0.000274


#### 想要每只股票与IBM股价的相关性

In [20]:
returns.corrwith(returns.IBM)

AAPL    0.493234
IBM     1.000000
MSFT    0.567803
GOOG    0.531809
dtype: float64

#### 成交量与价格变化百分比的相关性

In [21]:
returns.corrwith(volume)

AAPL   -0.100957
IBM    -0.076054
MSFT   -0.056606
GOOG   -0.152096
dtype: float64

#### High当日最高价
#### Low当日最低价
#### Open当日开盘价
#### Close当日收盘价
#### Volume当日成交量
#### Adj Close 前复权收盘价，为了更准确地反映股票的价值，针对分红、拆股等影响进行的调整

In [12]:
all_data

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2015-10-12   28.187500   27.860001   28.182501   27.900000  121868800.0   
 2015-10-13   28.112499   27.670000   27.705000   27.947500  132197200.0   
 2015-10-14   27.879999   27.389999   27.822500   27.552500  177849600.0   
 2015-10-15   28.025000   27.622499   27.732500   27.965000  150694000.0   
 2015-10-16   28.000000   27.632500   27.945000   27.760000  156930400.0   
 2015-10-19   27.937500   27.527500   27.700001   27.932501  119036800.0   
 2015-10-20   28.542500   27.705000   27.834999   28.442499  195871200.0   
 2015-10-21   28.895000   28.424999   28.500000   28.440001  167180800.0   
 2015-10-22   28.875000   28.525000   28.582500   28.875000  166616400.0   
 2015-10-23   29.807501   29.082500   29.174999   29.770000  237467600.0   
 2015-10-26   29.532499   28.730000   29.520000   28.820000  265335200.0   
 201

#### 这个数据本身是一个大字典

In [9]:
print(type(all_data))

<class 'dict'>


#### 字典的key值作为股票代号，value值是一个DataFrame

In [10]:
print(type(all_data['AAPL']))

<class 'pandas.core.frame.DataFrame'>


In [11]:
# 取出DataFrame里面的一个列Series
all_data['AAPL']['Adj Close']

Date
2015-10-12     25.782785
2015-10-13     25.826679
2015-10-14     25.461657
2015-10-15     25.842854
2015-10-16     25.653408
2015-10-19     25.812820
2015-10-20     26.284117
2015-10-21     26.281809
2015-10-22     26.683798
2015-10-23     27.510878
2015-10-26     26.632969
2015-10-27     26.464319
2015-10-28     27.554773
2015-10-29     27.845867
2015-10-30     27.607912
2015-11-02     27.996040
2015-11-03     28.317165
2015-11-04     28.185480
2015-11-05     28.055548
2015-11-06     28.088028
2015-11-09     27.974346
2015-11-10     27.092672
2015-11-11     26.939545
2015-11-12     26.849056
2015-11-13     26.064838
2015-11-16     26.491756
2015-11-17     26.378065
2015-11-18     27.213324
2015-11-19     27.559031
2015-11-20     27.679680
                 ...    
2020-08-28    124.807503
2020-08-31    129.039993
2020-09-01    134.179993
2020-09-02    131.399994
2020-09-03    120.879997
2020-09-04    120.959999
2020-09-08    112.820000
2020-09-09    117.320000
2020-09-10    113.48

## 5.3.2 Unique Values,Value Counts,and Membership 唯一值，计数和成员属性

In [23]:
obj=pd.Series(['c','a','d','a','a','b','b','c','c'])
uniques=obj.unique()
# 去掉Series里面的重复值，每个元素留一个
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [24]:
obj.value_counts()
# 计算包含值的个数，默认按数量由多到少排序

c    3
a    3
b    2
d    1
dtype: int64

In [25]:
# Count how often each value occurs in the raw array, without sorting the
# result by count. The Series.value_counts method is used because the
# top-level pd.value_counts function is deprecated in recent pandas releases.
pd.Series(obj.values).value_counts(sort=False)

a    3
d    1
b    2
c    3
dtype: int64

In [26]:
# isin：逐个判断obj中的每个值是否属于给定的集合['b','c']
mask=obj.isin(['b','c'])
mask  # 返回布尔值

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [28]:
obj[mask]  # 用布尔值索引，把b，c提出来

0    c
5    b
6    b
7    c
8    c
dtype: object

In [2]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame  ## 常用的两个工具 
import pandas_datareader.data as web
to_match=pd.Series(['c','a','b','b','c','a','e'])
to_match

0    c
1    a
2    b
3    b
4    c
5    a
6    e
dtype: object

In [3]:
unique_vals=pd.Series(['c','b','a'])
unique_vals

0    c
1    b
2    a
dtype: object

### Index.get_indexer方法：给定一个可能含有重复值的数组，返回其中每个元素在另一个由唯一值组成的索引中的位置（整数数组），找不到的元素对应-1

In [4]:
pd.Index(unique_vals).get_indexer(to_match)

array([ 0,  2,  1,  1,  0,  2, -1], dtype=int64)

In [5]:
# Hand-written equivalent of the Index.get_indexer call in the previous cell:
# for every element of to_match, print its position in unique_vals,
# or -1 when the element is not present.
l = list(unique_vals)
for x in to_match:
    try:
        # list.index raises ValueError when x is absent.
        print(l.index(x))
    except ValueError:
        print(-1)

0
2
1
1
0
2
-1


###  0 to_match[0]为c，在unique_vals中的索引为0
###  2 to_match[1]为a，在unique_vals中的索引为2
###  1 to_match[2]为b，在unique_vals中的索引为1
###  1 to_match[3]为b，在unique_vals中的索引为1
###  0 to_match[4]为c，在unique_vals中的索引为0
###  2 to_match[5]为a，在unique_vals中的索引为2
###  -1 to_match[6]为e，在unique_vals中不存在，所以索引为-1

In [7]:
# 计算多个列的直方图
data=pd.DataFrame({'Qu1':[1,3,4,3,4],
                  'Qu2':[2,3,1,2,3],
                  'Qu3':[1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [9]:
# Build a per-column histogram of the values in `data`.
# Row labels (1-5) are the distinct values found across the columns;
# each cell is the number of times that value occurs in that column,
# e.g. the value 1 occurs once in Qu1.
# Values that never occur in a column come back as NaN and are filled with 0.
# Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
result=data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
