In [2]:
import numpy as np
import pandas as pd
"""
考虑一下Series的isin()方法，它将返回一个布尔向量，该向量在传递的列表中存在Series元素的任何地方都为真。这允许您选择具有所需值的一列或多列的行
"""

s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s,s.isin([2, 4, 6]),\
s[s.isin([2, 4, 6])],s[s.index.isin([2, 4, 6])],\
    s.reindex([2, 4, 6])

(4    0
 3    1
 2    2
 1    3
 0    4
 dtype: int64,
 4    False
 3    False
 2     True
 1    False
 0     True
 dtype: bool,
 2    2
 0    4
 dtype: int64,
 4    0
 2    2
 dtype: int64,
 2    2.0
 4    0.0
 6    NaN
 dtype: float64)

In [3]:
s_mi = pd.Series(np.arange(6),
                index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi,\
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])],\
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

(0  a    0
    b    1
    c    2
 1  a    3
    b    4
    c    5
 dtype: int32,
 0  c    2
 1  a    3
 dtype: int32,
 0  a    0
    c    2
 1  a    3
    c    5
 dtype: int32)

In [8]:
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                   'ids2': ['a', 'n', 'c', 'n']})
values = ['a', 'b', 1, 3]
values2 = {'ids': ['a', 'b'], 'vals': [1, 3]}
values3 = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values3).all(1)

df,\
df.isin(values),\
df.isin(values2),\
row_mask,df[row_mask]

(   vals ids ids2
 0     1   a    a
 1     2   b    n
 2     3   f    c
 3     4   n    n,
     vals    ids   ids2
 0   True   True   True
 1  False   True  False
 2   True  False  False
 3  False  False  False,
     vals    ids   ids2
 0   True   True  False
 1  False   True  False
 2   True  False  False
 3  False  False  False,
 0     True
 1    False
 2    False
 3    False
 dtype: bool,
    vals ids ids2
 0     1   a    a)

In [10]:
"""
从具有布尔向量的序列中选择值通常返回数据的一个子集。
为了保证选择输出与原始数据具有相同的形状，可以使用Series和DataFrame中的where方法。
"""
# 只返回选定的行
s,s[s > 0]

(4    0
 3    1
 2    2
 1    3
 0    4
 dtype: int64,
 3    1
 2    2
 1    3
 0    4
 dtype: int64)

In [11]:
# 返回与原来形状相同的Series
s.where(s > 0)


4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [15]:
dates = pd.date_range('1/1/2000', periods=8)

df = pd.DataFrame(np.random.randn(8, 4),
                 index=dates, columns=['A', 'B', 'C', 'D'])

# 两者相同
df,df[df < 0],df.where(df < 0)


(                   A         B         C         D
 2000-01-01 -1.088828 -1.421502 -0.154838 -1.066516
 2000-01-02  0.292622 -0.359705  1.212527 -0.633952
 2000-01-03 -0.107601 -0.595376  0.540223 -0.112642
 2000-01-04  0.511914  0.495642 -0.434515  1.569236
 2000-01-05  0.002502  2.895359 -0.434244  0.619257
 2000-01-06 -1.388348 -0.136133  0.207910  0.099768
 2000-01-07 -2.137419  0.908968 -0.859641 -0.564609
 2000-01-08 -0.218836  0.557736  0.709825 -0.179393,
                    A         B         C         D
 2000-01-01 -1.088828 -1.421502 -0.154838 -1.066516
 2000-01-02       NaN -0.359705       NaN -0.633952
 2000-01-03 -0.107601 -0.595376       NaN -0.112642
 2000-01-04       NaN       NaN -0.434515       NaN
 2000-01-05       NaN       NaN -0.434244       NaN
 2000-01-06 -1.388348 -0.136133       NaN       NaN
 2000-01-07 -2.137419       NaN -0.859641 -0.564609
 2000-01-08 -0.218836       NaN       NaN -0.179393,
                    A         B         C         D
 2000-01-0

In [16]:
# 此外，where接受一个可选的其他参数，用于替换返回副本中条件为假的值。
df.where(df < 0, -df)


Unnamed: 0,A,B,C,D
2000-01-01,-1.088828,-1.421502,-0.154838,-1.066516
2000-01-02,-0.292622,-0.359705,-1.212527,-0.633952
2000-01-03,-0.107601,-0.595376,-0.540223,-0.112642
2000-01-04,-0.511914,-0.495642,-0.434515,-1.569236
2000-01-05,-0.002502,-2.895359,-0.434244,-0.619257
2000-01-06,-1.388348,-0.136133,-0.20791,-0.099768
2000-01-07,-2.137419,-0.908968,-0.859641,-0.564609
2000-01-08,-0.218836,-0.557736,-0.709825,-0.179393


In [17]:
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [18]:
df2 = df.copy()
df2[df2 < 0] = 0
df2


Unnamed: 0,A,B,C,D
2000-01-01,0.0,0.0,0.0,0.0
2000-01-02,0.292622,0.0,1.212527,0.0
2000-01-03,0.0,0.0,0.540223,0.0
2000-01-04,0.511914,0.495642,0.0,1.569236
2000-01-05,0.002502,2.895359,0.0,0.619257
2000-01-06,0.0,0.0,0.20791,0.099768
2000-01-07,0.0,0.908968,0.0,0.0
2000-01-08,0.0,0.557736,0.709825,0.0


In [19]:
df_orig = df.copy()
# inplace=True 这里有一个可选参数，因此可以修改原始数据而不创建副本
df_orig.where(df > 0, -df, inplace=True)
df_orig

Unnamed: 0,A,B,C,D
2000-01-01,1.088828,1.421502,0.154838,1.066516
2000-01-02,0.292622,0.359705,1.212527,0.633952
2000-01-03,0.107601,0.595376,0.540223,0.112642
2000-01-04,0.511914,0.495642,0.434515,1.569236
2000-01-05,0.002502,2.895359,0.434244,0.619257
2000-01-06,1.388348,0.136133,0.20791,0.099768
2000-01-07,2.137419,0.908968,0.859641,0.564609
2000-01-08,0.218836,0.557736,0.709825,0.179393


In [20]:
df.where(df < 0, -df) == np.where(df < 0, df, -df)



Unnamed: 0,A,B,C,D
2000-01-01,True,True,True,True
2000-01-02,True,True,True,True
2000-01-03,True,True,True,True
2000-01-04,True,True,True,True
2000-01-05,True,True,True,True
2000-01-06,True,True,True,True
2000-01-07,True,True,True,True
2000-01-08,True,True,True,True


In [21]:
df2 = df.copy()
df2[df2[1:4] > 0] = 3
df2

Unnamed: 0,A,B,C,D
2000-01-01,-1.088828,-1.421502,-0.154838,-1.066516
2000-01-02,3.0,-0.359705,3.0,-0.633952
2000-01-03,-0.107601,-0.595376,3.0,-0.112642
2000-01-04,3.0,3.0,-0.434515,3.0
2000-01-05,0.002502,2.895359,-0.434244,0.619257
2000-01-06,-1.388348,-0.136133,0.20791,0.099768
2000-01-07,-2.137419,0.908968,-0.859641,-0.564609
2000-01-08,-0.218836,0.557736,0.709825,-0.179393


In [23]:
df2 = df.copy()
# 第一个等效于第二个（但比第二个速度更快）
df2.where(df2 > 0, df2['A'], axis='index'),\
     df.apply(lambda x, y: x.where(x > 0, y), y=df['A'])


(                   A         B         C         D
 2000-01-01 -1.088828 -1.088828 -1.088828 -1.088828
 2000-01-02  0.292622  0.292622  1.212527  0.292622
 2000-01-03 -0.107601 -0.107601  0.540223 -0.107601
 2000-01-04  0.511914  0.495642  0.511914  1.569236
 2000-01-05  0.002502  2.895359  0.002502  0.619257
 2000-01-06 -1.388348 -1.388348  0.207910  0.099768
 2000-01-07 -2.137419  0.908968 -2.137419 -2.137419
 2000-01-08 -0.218836  0.557736  0.709825 -0.218836,
                    A         B         C         D
 2000-01-01 -1.088828 -1.088828 -1.088828 -1.088828
 2000-01-02  0.292622  0.292622  1.212527  0.292622
 2000-01-03 -0.107601 -0.107601  0.540223 -0.107601
 2000-01-04  0.511914  0.495642  0.511914  1.569236
 2000-01-05  0.002502  2.895359  0.002502  0.619257
 2000-01-06 -1.388348 -1.388348  0.207910  0.099768
 2000-01-07 -2.137419  0.908968 -2.137419 -2.137419
 2000-01-08 -0.218836  0.557736  0.709825 -0.218836)

In [27]:
df3 = pd.DataFrame({'A': [1, 2, 3],
                     'B': [4, 5, 6],
                     'C': [7, 8, 9]})
df3,df3.where(lambda x: x > 4, lambda x: x + 10)


(   A  B  C
 0  1  4  7
 1  2  5  8
 2  3  6  9,
     A   B  C
 0  11  14  7
 1  12   5  8
 2  13   6  9)

In [28]:
# mask（）是where的反布尔运算。
s.mask(s >= 0)


4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [29]:
df.mask(df >= 0)



Unnamed: 0,A,B,C,D
2000-01-01,-1.088828,-1.421502,-0.154838,-1.066516
2000-01-02,,-0.359705,,-0.633952
2000-01-03,-0.107601,-0.595376,,-0.112642
2000-01-04,,,-0.434515,
2000-01-05,,,-0.434244,
2000-01-06,-1.388348,-0.136133,,
2000-01-07,-2.137419,,-0.859641,-0.564609
2000-01-08,-0.218836,,,-0.179393


In [30]:
#The query() Method
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df,df[(df['a'] < df['b']) & (df['b'] < df['c'])],df.query('(a < b) & (b < c)')

(          a         b         c
 0  0.188961  0.580085  0.467895
 1  0.625883  0.945439  0.795369
 2  0.563109  0.438030  0.785009
 3  0.559239  0.944148  0.845194
 4  0.251760  0.520442  0.665635
 5  0.167815  0.121067  0.330215
 6  0.443791  0.842843  0.358262
 7  0.003806  0.009599  0.869812
 8  0.058138  0.356462  0.873821
 9  0.674502  0.740210  0.937600,
           a         b         c
 4  0.251760  0.520442  0.665635
 7  0.003806  0.009599  0.869812
 8  0.058138  0.356462  0.873821
 9  0.674502  0.740210  0.937600,
           a         b         c
 4  0.251760  0.520442  0.665635
 7  0.003806  0.009599  0.869812
 8  0.058138  0.356462  0.873821
 9  0.674502  0.740210  0.937600)

In [33]:
# 执行相同的操作，但如果没有名为a的列，则返回到指定索引。
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
print(df)
df.index.name = 'a' # uses the column 'a', not the index
df, \
df.query('a < b and b < c')

   b  c
0  0  1
1  3  2
2  3  4
3  4  4
4  0  1
5  1  3
6  4  3
7  3  2
8  0  0
9  0  3


(   b  c
 a      
 0  0  1
 1  3  2
 2  3  4
 3  4  4
 4  0  1
 5  1  3
 6  4  3
 7  3  2
 8  0  0
 9  0  3,
    b  c
 a      
 2  3  4)

In [36]:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df,df.query('index < b < c')


(   b  c
 0  3  9
 1  4  7
 2  5  9
 3  7  5
 4  1  4
 5  6  1
 6  8  3
 7  3  1
 8  8  1
 9  4  3,
    b  c
 0  3  9
 1  4  7
 2  5  9)

In [37]:
# MultiIndex query（）语法
n = 10
colors = np.random.choice(['red', 'green'], size=n)
foods = np.random.choice(['eggs', 'ham'], size=n)
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
df = pd.DataFrame(np.random.randn(n, 2), index=index)

colors,foods,index,df,df.query('color == "red"')


(array(['red', 'red', 'green', 'green', 'red', 'red', 'green', 'green',
        'red', 'red'], dtype='<U5'),
 array(['eggs', 'ham', 'eggs', 'eggs', 'ham', 'eggs', 'eggs', 'eggs',
        'ham', 'ham'], dtype='<U4'),
 MultiIndex([(  'red', 'eggs'),
             (  'red',  'ham'),
             ('green', 'eggs'),
             ('green', 'eggs'),
             (  'red',  'ham'),
             (  'red', 'eggs'),
             ('green', 'eggs'),
             ('green', 'eggs'),
             (  'red',  'ham'),
             (  'red',  'ham')],
            names=['color', 'food']),
                    0         1
 color food                    
 red   eggs  0.545959  0.516226
       ham   1.318063  0.713923
 green eggs -0.686173  1.184586
       eggs  0.640800  1.059334
 red   ham   1.182412 -1.931022
       eggs -0.162802  1.184333
 green eggs -0.749821 -0.050220
       eggs -0.507970  1.655366
 red   ham   0.210910  0.060506
       ham   0.302264 -0.938705,
                    0         1
 color f

In [38]:
# index没有name，使用ilevel_0 ilevel_1等
df.index.names = [None, None]
df, df.query('ilevel_0 == "red"')


(                   0         1
 red   eggs  0.545959  0.516226
       ham   1.318063  0.713923
 green eggs -0.686173  1.184586
       eggs  0.640800  1.059334
 red   ham   1.182412 -1.931022
       eggs -0.162802  1.184333
 green eggs -0.749821 -0.050220
       eggs -0.507970  1.655366
 red   ham   0.210910  0.060506
       ham   0.302264 -0.938705,
                  0         1
 red eggs  0.545959  0.516226
     ham   1.318063  0.713923
     ham   1.182412 -1.931022
     eggs -0.162802  1.184333
     ham   0.210910  0.060506
     ham   0.302264 -0.938705)

In [39]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
expr = '0.0 <= a <= c <= 0.5'

df,df2,\
map(lambda frame: frame.query(expr), [df, df2])

(          a         b         c
 0  0.484522  0.148045  0.551847
 1  0.370875  0.144049  0.376943
 2  0.865422  0.501515  0.862846
 3  0.567714  0.635281  0.409959
 4  0.334291  0.693971  0.030076
 5  0.596303  0.410034  0.438923
 6  0.046434  0.269078  0.916983
 7  0.905927  0.817323  0.472700
 8  0.047141  0.102626  0.296656
 9  0.219290  0.215906  0.255722,
            a         b         c
 0   0.495682  0.936898  0.341333
 1   0.498846  0.563560  0.140879
 2   0.113629  0.794990  0.760406
 3   0.223591  0.448480  0.294789
 4   0.435022  0.990936  0.891826
 5   0.720842  0.767327  0.509437
 6   0.954165  0.056174  0.629083
 7   0.999502  0.012217  0.692507
 8   0.467944  0.753219  0.858273
 9   0.886954  0.883437  0.428652
 10  0.509704  0.361581  0.050954
 11  0.120882  0.129196  0.697127,
 <map at 0x247c83ef208>)

In [41]:
# query（）Python与Pandas语法比较
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df,df.query('(a < b) & (b < c)'),\
df[(df['a'] < df['b']) & (df['b'] < df['c'])],\
df.query('a < b & b < c'),\
df.query('a < b and b < c'),\
df.query('a < b < c')

(   a  b  c
 0  7  9  5
 1  1  3  7
 2  8  4  2
 3  2  4  0
 4  6  0  8
 5  3  9  2
 6  7  6  9
 7  2  6  8
 8  7  4  6
 9  6  5  3,
    a  b  c
 1  1  3  7
 7  2  6  8,
    a  b  c
 1  1  3  7
 7  2  6  8,
    a  b  c
 1  1  3  7
 7  2  6  8,
    a  b  c
 1  1  3  7
 7  2  6  8,
    a  b  c
 1  1  3  7
 7  2  6  8)

In [42]:
# in和not in运算符
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})

df,\
df.query('a in b'),\
df[df['a'].isin(df['b'])],\
df.query('a not in b'),\
df[~df['a'].isin(df['b'])]

(    a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 4   c  b  1  3
 5   c  b  3  7
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3,
    a  b  c  d
 0  a  a  3  8
 1  a  a  0  4
 2  b  a  4  2
 3  b  a  0  2
 4  c  b  1  3
 5  c  b  3  7,
    a  b  c  d
 0  a  a  3  8
 1  a  a  0  4
 2  b  a  4  2
 3  b  a  0  2
 4  c  b  1  3
 5  c  b  3  7,
     a  b  c  d
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3,
     a  b  c  d
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3)

In [45]:
df.query('a in b and c < d'),\
df[df['b'].isin(df['a']) & (df['c'] < df['d'])]

(   a  b  c  d
 0  a  a  3  8
 1  a  a  0  4
 3  b  a  0  2
 4  c  b  1  3
 5  c  b  3  7,
     a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 3   b  a  0  2
 4   c  b  1  3
 5   c  b  3  7
 6   d  b  4  6
 7   d  b  1  4
 11  f  c  0  3)

In [48]:
# ==运算符与列表对象的特殊用法
df,\
df.query('b == ["a", "b", "c"]'),\
df[df['b'].isin(["a", "b", "c"])],\
df.query('c == [1, 2]'),\
df.query('c != [1, 2]'),\
df.query('[1, 2] in c'),\
df.query('[1, 2] not in c'),\
df[df['c'].isin([1, 2])]

(    a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 4   c  b  1  3
 5   c  b  3  7
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3,
     a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 4   c  b  1  3
 5   c  b  3  7
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3,
     a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 4   c  b  1  3
 5   c  b  3  7
 6   d  b  4  6
 7   d  b  1  4
 8   e  c  3  2
 9   e  c  4  2
 10  f  c  1  0
 11  f  c  0  3,
     a  b  c  d
 4   c  b  1  3
 7   d  b  1  4
 10  f  c  1  0,
     a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 5   c  b  3  7
 6   d  b  4  6
 8   e  c  3  2
 9   e  c  4  2
 11  f  c  0  3,
     a  b  c  d
 4   c  b  1  3
 7   d  b  1  4
 10  f  c  1  0,
     a  b  c  d
 0   a  a  3  8
 1   a  a  0  4
 2   b  a  4  2
 3   b  a  0  2
 5

In [51]:
# bool操作
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df['bools'] = np.random.rand(len(df)) > 0.5

df,\
df.query('~bools'),\
df.query('not bools'),\
df.query('not bools') == df[~df['bools']]

(          a         b         c  bools
 0  0.304504  0.142258  0.907189  False
 1  0.144163  0.318957  0.234655  False
 2  0.446940  0.483527  0.536835  False
 3  0.908021  0.530374  0.740780  False
 4  0.349748  0.227375  0.360031  False
 5  0.033267  0.266550  0.943582  False
 6  0.518616  0.629444  0.669138   True
 7  0.137744  0.116387  0.277332  False
 8  0.675560  0.956736  0.436307  False
 9  0.781926  0.717882  0.105526  False,
           a         b         c  bools
 0  0.304504  0.142258  0.907189  False
 1  0.144163  0.318957  0.234655  False
 2  0.446940  0.483527  0.536835  False
 3  0.908021  0.530374  0.740780  False
 4  0.349748  0.227375  0.360031  False
 5  0.033267  0.266550  0.943582  False
 7  0.137744  0.116387  0.277332  False
 8  0.675560  0.956736  0.436307  False
 9  0.781926  0.717882  0.105526  False,
           a         b         c  bools
 0  0.304504  0.142258  0.907189  False
 1  0.144163  0.318957  0.234655  False
 2  0.446940  0.483527  0.536835  Fals

In [52]:
shorter = df.query('a < b < c and (not bools) or bools > 2')
longer = df[(df['a'] < df['b'])
             & (df['b'] < df['c'])
             & (~df['bools'])
             | (df['bools'] > 2)]
shorter,longer,shorter == longer

# 对于大型框架，使用numexpr的DataFrame.query（）略快于Python。

(          a         b         c  bools
 2  0.446940  0.483527  0.536835  False
 5  0.033267  0.266550  0.943582  False,
           a         b         c  bools
 2  0.446940  0.483527  0.536835  False
 5  0.033267  0.266550  0.943582  False,
       a     b     c  bools
 2  True  True  True   True
 5  True  True  True   True)