In [1]:
%pylab inline
from pandas import Series, DataFrame
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


《[利用Python进行数据分析](https://book.douban.com/subject/25779298/)》
作者: Wes McKinney
译者: 唐学韬

汇总和描述性统计

Pandas 实现了一些简单的汇总和描述性统计功能，更复杂的统计计算可以使用 [SciPy](https://docs.scipy.org/doc/scipy/reference/stats.html) 的 `stats` 模块。

# 基本统计

包括：

- count: 计数
- describe: 各列的汇总统计
- min, max: 最小值和最大值
- argmin, argmax: 最小值和最大值的索引位置
- idxmin, idxmax: 最小值和最大值的索引值
- quantile: 样本的分位数(0--1)
- sum: 求和
- mean: 平均数
- median: 中位数（0.5分位数）
- mad: 平均绝对离差（注意：该方法在 pandas 1.5 中被弃用，并已在 pandas 2.0 中移除）
- var: 方差
- std: 标准差
- skew: 偏度（三阶矩）
- kurt: 峰度（四阶矩）
- cumsum: 累计和
- cummin, cummax: 累计最小、最大值
- cumprod: 累计积
- diff: 一阶差分（用于时间序列）
- pct_change: 百分数变化

In [2]:
# Small demo frame with missing values, built column by column.
# Row 'c' is entirely NaN; row 'a' is missing only column 'two'.
df = DataFrame({'one': [1.4, 7.1, np.nan, 0.75],
                'two': [np.nan, -4.5, np.nan, -1.3]},
               index=list('abcd'),
               columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column totals: sum down the rows of each column
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [5]:
# Row totals: sum across the columns of each row
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [6]:
# Row means with NaN skipping disabled: a row containing any NaN yields NaN
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [9]:
# For each column, the index label at which the maximum value occurs
df.idxmax(axis=0)

one    b
two    d
dtype: object

In [10]:
# Accumulating reduction: running sum down each column
df.cumsum(axis=0)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [11]:
# Summary statistics per column (count, mean, std, min, quartiles, max)
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [12]:
# describe() on non-numeric data reports count / unique / top / freq instead
obj = Series(list('aabc') * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

# 相关系数(Correlation)与协方差(Covariance)

In [26]:
from pandas_datareader import data, wb

# Pin the download window explicitly. Without start/end the range depends on
# the day the cell is run and on the library's defaults, so a fresh
# "Restart & Run All" would not reproduce the outputs shown in this notebook.
# NOTE(review): the bounds are taken from the first/last dates visible in the
# recorded outputs (2010-01-04 .. 2017-02-24) -- confirm they are the intended ones.
START, END = '2010-01-01', '2017-02-24'
TICKERS = ['AAPL', 'IBM', 'MSFT', 'GOOG']

# One frame of daily quotes per ticker, keyed by ticker symbol.
all_data = {ticker: data.get_data_yahoo(ticker, start=START, end=END)
            for ticker in TICKERS}

# Collect one column per ticker. The loop variable is named `quotes` so that
# it does not shadow the imported `data` module used above.
price = DataFrame({tic: quotes['Adj Close']
                   for tic, quotes in all_data.items()})
volume = DataFrame({tic: quotes['Volume']
                    for tic, quotes in all_data.items()})

In [27]:
# Preview only the first rows; the full frame spans several years of daily
# prices and should not be dumped into the notebook output.
price.head(10)

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.727039,313.062468,111.405000,25.555485
2010-01-05,27.774976,311.683844,110.059232,25.563741
2010-01-06,27.333178,303.826685,109.344283,25.406859
2010-01-07,27.282650,296.753749,108.965786,25.142634
2010-01-08,27.464034,300.709808,110.059232,25.316031
2010-01-11,27.221758,300.255255,108.906903,24.994007
2010-01-12,26.912110,294.945572,109.773245,24.828866
2010-01-13,27.291720,293.252243,109.537735,25.060064
2010-01-14,27.133657,294.630868,111.287245,25.563741
2010-01-15,26.680198,289.710772,110.841458,25.481172


In [28]:
# Day-over-day fractional change of every price column; show the last 5 rows
returns = price.pct_change(periods=1)
returns.tail(5)

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-17,0.002734,0.004744,-0.004189,0.00155
2017-02-21,0.007221,0.004335,-0.002269,-0.002012
2017-02-22,0.002999,-0.001082,0.004937,-0.002016
2017-02-23,-0.00423,0.000686,0.00276,0.00404
2017-02-24,0.000952,-0.003236,-0.001651,0.0


In [39]:
# Correlation coefficient between the MSFT and IBM return series
print(returns['MSFT'].corr(returns['IBM']))
# Covariance between the same two series
print(returns['MSFT'].cov(returns['IBM']))

0.495166900181
8.58872503835e-05


In [40]:
# Pairwise correlation matrix of all return columns
returns.corr(method='pearson')

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.409523,0.381495,0.388913
GOOG,0.409523,1.0,0.40288,0.47081
IBM,0.381495,0.40288,1.0,0.495167
MSFT,0.388913,0.47081,0.495167,1.0


In [41]:
# Pairwise covariance matrix of all return columns
returns.cov(min_periods=None)

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000269,0.000105,7.5e-05,9.2e-05
GOOG,0.000105,0.000244,7.5e-05,0.000106
IBM,7.5e-05,7.5e-05,0.000144,8.6e-05
MSFT,9.2e-05,0.000106,8.6e-05,0.000209


In [44]:
# Correlation of every return column with the IBM return series
returns.corrwith(returns['IBM'])

AAPL    0.381495
GOOG    0.402880
IBM     1.000000
MSFT    0.495167
dtype: float64

In [45]:
# Column-by-column correlation between percentage returns and trading volume
# (columns are matched by name)
returns.corrwith(volume, axis=0)

AAPL   -0.074047
GOOG   -0.009542
IBM    -0.194405
MSFT   -0.091078
dtype: float64

# 唯一值，值计数和成员资格

In [46]:
# Sample series of single-letter labels containing repeats
obj = Series(list('cadaabbcc'))

In [47]:
# Distinct values of obj, in order of first appearance
uniques = pd.unique(obj)
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [48]:
# Frequency of each distinct value, most frequent first
obj.value_counts(sort=True, ascending=False)

a    3
c    3
b    2
d    1
dtype: int64

In [49]:
# Count occurrences in a raw array without sorting by frequency.
# The top-level pd.value_counts() function is deprecated in recent pandas;
# wrapping the array in a Series and calling its method is the supported form.
Series(obj.values).value_counts(sort=False)

b    2
d    1
c    3
a    3
dtype: int64

In [50]:
# Boolean mask: True where the element is one of the target labels
targets = ['b', 'c']
mask = obj.isin(targets)
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [54]:
# Keep only the elements selected by the boolean mask produced by isin()
obj.loc[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [52]:
# Answers to three questions (columns Qu1..Qu3), one respondent per row
data = DataFrame([[1, 2, 1],
                  [3, 3, 5],
                  [4, 1, 2],
                  [3, 2, 4],
                  [4, 3, 4]],
                 columns=['Qu1', 'Qu2', 'Qu3'])
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [55]:
# Per-column value counts: rows are the distinct values seen in any column,
# columns are Qu1..Qu3. A value missing from a column gives NaN, replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts()
# function is deprecated in recent pandas.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
