# 数值计算和统计

In [2]:
import numpy as np
import pandas as pd                                       

# Core descriptive-statistics methods; every call below works on a Series
# as well as on a DataFrame (where it is applied column by column).

# Demo frame: key1 holds the integers 0..9, key2 holds ten random floats
# drawn from [0, 1) and scaled by 10. No seed is set, so key2 differs per run.
df = pd.DataFrame({'key1':np.arange(10),'key2':np.random.rand(10)*10})
print(df)

# count: number of non-NaN values in each column
print('count统计非NaN值的数量')
print(df.count())
# min over every column, plus max of a single column selected by label
print('min统计最小值,max统计最大值\n')
print(df.min(),df['key2'].max())
# quantile: q picks the position (0.75 is the upper quartile)
print('quantile统计分位数，参数q确定位置\n')
print(df.quantile(q=0.75))
# sum of each column
print('sum求和')
print(df.sum())
# arithmetic mean of each column
print('mean求平均值')
print(df.mean())
# median of each column
print('median求算数中位数')
print(df.median())
# standard deviation and variance, printed side by side
print('std,var求标准差\n')
print(df.std(),df.var())
# sample skewness
print('skew样本的偏度')
print(df.skew())
# sample kurtosis
print('kurt样本的峰度')
print(df.kurt())

   key1      key2
0     0  8.440076
1     1  5.701456
2     2  1.166046
3     3  8.455489
4     4  6.668064
5     5  1.784346
6     6  9.583742
7     7  2.047384
8     8  8.406314
9     9  4.016307
count统计非NaN值的数量
key1    10
key2    10
dtype: int64
min统计最小值,max统计最大值

key1    0.000000
key2    1.166046
dtype: float64 9.583742275779567
qunantile统计分位数，参数q确定位置

key1    6.750000
key2    8.431635
Name: 0.75, dtype: float64
sum求和
key1    45.000000
key2    56.269224
dtype: float64
mean求平均值
key1    4.500000
key2    5.626922
dtype: float64
nedian求算数中位数
key1    4.50000
key2    6.18476
dtype: float64
std,var求标准差

key1    3.027650
key2    3.170306
dtype: float64 key1     9.166667
key2    10.050842
dtype: float64
skew样本的偏度
key1    0.000000
key2   -0.288141
dtype: float64
kurt样本的峰度
key1   -1.200000
key2   -1.708807
dtype: float64


In [2]:
# Demo frame: key1 and key2 are numeric with one NaN each; key3 mixes
# integers and strings, so it is stored as an object column.
df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                 'key2':[1,2,np.nan,4,5],
                 'key3':[1,2,3,'j','k']},
                 index = ['a','b','c','d','e'])

# mean: average of each numeric column.
# numeric_only=True is passed explicitly: older pandas dropped non-numeric
# columns silently, while pandas >= 2.0 raises TypeError on key3 instead.
m1 = df.mean(numeric_only=True)
print(m1,type(m1))
# A single column can be averaged by selecting it first.
print(df['key2'].mean())

# axis: 0 (default) reduces down each column, 1 reduces across each row.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)

# skipna: True (default) ignores NaN; with False any column containing a
# NaN yields NaN as its result.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)

key1    3.5
key2    3.0
dtype: float64 <class 'pandas.core.series.Series'>
3.0
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
key1   NaN
key2   NaN
dtype: float64


In [5]:
# Cumulative statistics: cumsum, cumprod, cummax and cummin.

# Build the numeric demo frame inside this cell instead of reusing whatever
# `df` the previous cell left behind: the frame defined just above contains
# a mixed int/str column, on which cummax/cummin cannot compare values.
df = pd.DataFrame({'key1':np.arange(10),'key2':np.random.rand(10)*10})

# Running total of each source column, stored in new *_s columns.
df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df,'→ cumsum样本的累计和\n')

# Running product of each source column, stored in new *_p columns.
# key1 starts at 0, so its running product is 0 on every row.
df['key1_p'] = df['key1'].cumprod()
df['key2_p'] = df['key2'].cumprod()
print(df,'→ cumprod样本的累计积\n')

# Running maximum / minimum, computed for every column of the frame,
# i.e. the original key1 and key2 as well as the derived columns.
print(df.cummax(),'\n',df.cummin(),'→ cummax,cummin分别求累计最大值，累计最小值\n')

   key1      key2  key1_s     key2_s
0     0  9.150646       0   9.150646
1     1  5.919352       1  15.069998
2     2  0.323754       3  15.393752
3     3  0.972282       6  16.366034
4     4  5.130167      10  21.496201
5     5  8.455466      15  29.951668
6     6  8.498757      21  38.450424
7     7  5.481035      28  43.931460
8     8  5.554689      36  49.486149
9     9  9.836869      45  59.323018 → cumsum样本的累计和

   key1      key2  key1_s     key2_s  key1_p        key2_p
0     0  9.150646       0   9.150646       0  9.150646e+00
1     1  5.919352       1  15.069998       0  5.416590e+01
2     2  0.323754       3  15.393752       0  1.753641e+01
3     3  0.972282       6  16.366034       0  1.705033e+01
4     4  5.130167      10  21.496201       0  8.747107e+01
5     5  8.455466      15  29.951668       0  7.396087e+02
6     6  8.498757      21  38.450424       0  6.285755e+03
7     7  5.481035      28  43.931460       0  3.445244e+04
8     8  5.554689      36  49.486149       0  

In [6]:
# Unique values: Series.unique()
# Returns an array holding each distinct value once; wrap it in pd.Series
# to get a Series again.
s = pd.Series([ch for ch in 'asdvasdcfgg'])
sq = s.unique()
print(s)
print(sq, type(sq))
print(pd.Series(sq))

# Sort the array of unique values in place, then show the ordered result.
sq.sort()
print(sq)

0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
['a' 's' 'd' 'v' 'c' 'f' 'g'] <class 'numpy.ndarray'>
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
['a' 'c' 'd' 'f' 'g' 's' 'v']


In [7]:
# Value counts: Series.value_counts()
# Returns a new Series whose index holds the distinct values of `s` and whose
# values are how many times each one occurs.
# sort parameter: order the result by count; defaults to True. With
# sort=False the counts are not ordered by frequency.
# NOTE(review): older pandas also offers the top-level form
# pd.value_counts(s, sort=False) - the argument is the source Series `s`,
# not the result `sc`; confirm it still exists in the installed version.
sc = s.value_counts(sort=False) # method form, called on the Series itself
print(sc)

v    1
d    2
a    2
s    2
f    1
c    1
g    2
dtype: int64


In [8]:
# Membership test: .isin()
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame(dict(key1=[*'asdcbvasd'], key2=np.arange(4, 13)))
print(s, df, sep='\n')

# An element is flagged True when it equals any entry of the given list.
# Matching is by value and type: the string '10' does not match the
# integer 10 in key2, and 'bc' does not match the single letters in key1.
print(s.isin([5, 14]))
print(df.isin(['a', 'bc', '10', 8]))

0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False


## 对每个元素进行字符串操作

In [3]:
# String methods are reached through the .str accessor.
# It is used below on a Series, on a DataFrame column and on the column
# Index; NaN entries are passed through as NaN instead of raising.
s = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame(dict(key1=[*'abcdef'],
                       key2=['hee', 'fv', 'w', 'hija', '123', np.nan]))
print(s, df, sep='\n')

# Occurrences of 'b' in every element of the Series.
print(s.str.count('b'))
# Upper-case one column of the frame.
print(df.loc[:, 'key2'].str.upper())
# The column labels form an Index, which exposes .str as well.
df.columns = df.columns.str.upper()
print(df)

0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
  key1  key2
0    a   hee
1    b    fv
2    c     w
3    d  hija
4    e   123
5    f   NaN
0    0.0
1    1.0
2    0.0
3    2.0
4    0.0
5    NaN
6    0.0
dtype: float64
0     HEE
1      FV
2       W
3    HIJA
4     123
5     NaN
Name: key2, dtype: object
  KEY1  KEY2
0    a   hee
1    b    fv
2    c     w
3    d  hija
4    e   123
5    f   NaN


In [None]:
# Common string methods available through the .str accessor.
# Each print shows the resulting Series followed by a short label.
s = pd.Series(['A','b','bbhello','123',np.nan])
print(s.str.lower(),'lower小写')
print(s.str.upper(),'upper大写')
print(s.str.len(),'len字符长度')
# The label names the prefix that is actually tested ('b').
print(s.str.startswith('b'),'判断起始是否为b')
print(s.str.endswith('3'),'判断结束是否为3')
# strip / lstrip / rstrip trim whitespace at both ends / left end / right end.
print(s.str.strip(),'去除字符串中的空格')
print(s.str.lstrip(),'去除字符串中的左空格')
print(s.str.rstrip(),'去除字符串中的右空格')
# A Series has no .columns attribute, so replace is called on s.str directly;
# n limits how many occurrences are replaced in each string.
print(s.str.replace('a','xx',n=1),'替换')
# split returns a list per element.
print(s.str.split(','),'按照,拆分成数组')
# After a split, .str[i] and .str.get(i) both pick the i-th piece of every list.
print(s.str.split(',').str[0],s.str.split(',').str.get(1),'按照')
print(s.str[0]) # first character of every string

In [None]:
# Combining frames: merge, join
#
# Reference signature, kept as a comment because `left` and `right` do not
# exist yet at this point in the cell (abridged; see the pandas.merge
# reference for the exact defaults of the installed version):
#   pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
#            left_index=False, right_index=False, sort=False,
#            suffixes=('_x', '_y'), copy=True, indicator=False)
# left:  the first DataFrame
# right: the second DataFrame
# on:    the key column(s) to join on
# how:
#     inner: default, keep only keys present in both frames (intersection)
#     outer: keep keys from either frame (union); missing data becomes NaN
#     left:  keep the keys of the first frame; missing data becomes NaN
#     right: keep the keys of the second frame; missing data becomes NaN
# left_on, right_on, left_index, right_index: when the key is not a single
#     shared column, the left and right keys can be specified separately
# sort: order the result lexicographically by the join keys; defaults to
#     False, and leaving it False is considerably faster

# Two frames with partly overlapping index labels (K0 and K2 are shared).
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
# join aligns on the index: by default every row of `left` is kept.
print(left.join(right))
# how='outer' keeps the union of both indexes.
print(left.join(right, how='outer'))

In [None]:
# 连接与修补 concat、combine_first
# 连接 - 沿轴执行连接操作
pd.concat(obj,axis=0,join='outer',join_axes=None,ignore_index=False,keys=None,levels=None,names=None,verify_integrity=False,copy=True)