In [2]:
# Duplicate data
"""
To identify and remove duplicate rows in a DataFrame, two methods help:
duplicated and drop_duplicates.
Each method takes as its argument the column(s) used to identify duplicate rows.
duplicated       returns a boolean vector whose length is the number of rows,
                 indicating whether each row is a duplicate.
drop_duplicates  removes the duplicate rows.

By default the first observed row of a duplicate set is considered unique,
but each method has a keep parameter to specify which occurrence to keep.
keep='first' (default): mark/drop duplicates except for the first occurrence.

keep='last': mark/drop duplicates except for the last occurrence.

keep=False: mark/drop all duplicates.
"""
import pandas as pd
import numpy as np

# Small demo frame: column 'a' repeats labels, column 'b' alternates x/y,
# column 'c' is random noise that plays no role in duplicate detection.
df2 = pd.DataFrame(dict(
    a='one one two two two three four'.split(),
    b=list('xyxyxxx'),
    c=np.random.randn(7),
))
df2


Unnamed: 0,a,b,c
0,one,x,-0.837853
1,one,y,-0.927378
2,two,x,-1.576388
3,two,y,0.221784
4,two,x,0.261542
5,three,x,-1.378436
6,four,x,0.302281


In [3]:
# Flag every repeat of a value in column 'a' except its first occurrence.
df2.duplicated(subset=['a'], keep='first')



0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [4]:
# Flag every repeat of a value in column 'a' except its last occurrence.
df2.duplicated(subset=['a'], keep='last')



0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [5]:
# Flag every row whose 'a' value occurs more than once (no occurrence is spared).
df2.duplicated(subset=['a'], keep=False)



0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [6]:
# Keep only the first row for each distinct value of column 'a'.
df2.drop_duplicates(subset=['a'], keep='first')



Unnamed: 0,a,b,c
0,one,x,-0.837853
2,two,x,-1.576388
5,three,x,-1.378436
6,four,x,0.302281


In [7]:
# Keep only the last row for each distinct value of column 'a'.
df2.drop_duplicates(subset=['a'], keep='last')



Unnamed: 0,a,b,c
1,one,y,-0.927378
4,two,x,0.261542
5,three,x,-1.378436
6,four,x,0.302281


In [8]:
# Drop every row whose 'a' value is repeated; only values seen once survive.
df2.drop_duplicates(subset=['a'], keep=False)



Unnamed: 0,a,b,c
5,three,x,-1.378436
6,four,x,0.302281


In [9]:
# A row counts as a duplicate only when the ('a', 'b') pair has been seen before.
df2.duplicated(subset=['a', 'b'], keep='first')



0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [10]:
# Keep the first row for each distinct ('a', 'b') pair.
df2.drop_duplicates(subset=['a', 'b'], keep='first')



Unnamed: 0,a,b,c
0,one,x,-0.837853
1,one,y,-0.927378
2,two,x,-1.576388
3,two,y,0.221784
5,three,x,-1.378436
6,four,x,0.302281


In [11]:
# Frame with repeated index labels, used to show de-duplication by index.
df3 = pd.DataFrame(
    {'a': np.arange(6), 'b': np.random.randn(6)},
    index=list('aabcba'),
)
# Displayed together: the frame, the duplicate mask of its index, and the rows
# kept when sparing the first occurrence, the last occurrence, or none at all.
(
    df3,
    df3.index.duplicated(keep='first'),
    df3.loc[~df3.index.duplicated(keep='first')],
    df3.loc[~df3.index.duplicated(keep='last')],
    df3.loc[~df3.index.duplicated(keep=False)],
)


(   a         b
 a  0  0.148541
 a  1  0.314334
 b  2  0.185043
 c  3  0.431622
 b  4 -0.540387
 a  5  0.575792,
 array([False,  True, False, False,  True,  True]),
    a         b
 a  0  0.148541
 b  2  0.185043
 c  3  0.431622,
    a         b
 c  3  0.431622
 b  4 -0.540387
 a  5  0.575792,
    a         b
 c  3  0.431622)

In [12]:
# Dictionary-like get() method: returns the value for a label, or the
# supplied default when the label is absent.

s = pd.Series(data=[1, 2, 3], index=list('abc'))
(s.get('a'), s.get('x', default=-1))

(1, -1)

In [14]:
# Label-based lookup: pick one value per (row label, column label) pair.
# DataFrame.lookup() was deprecated in pandas 1.2 and removed in 2.0, so the
# labels are translated to integer positions and the underlying array is
# indexed directly, which works on both old and new pandas versions.

dflookup = pd.DataFrame(np.random.rand(20, 4), columns=['A', 'B', 'C', 'D'])
ls = list(range(0, 10, 2))
lookup_cols = ['B', 'C', 'A', 'B', 'D']

# get_indexer marks a missing label with -1, which would silently select the
# last row/column; raise KeyError instead, as lookup() did.
row_pos = dflookup.index.get_indexer(ls)
col_pos = dflookup.columns.get_indexer(lookup_cols)
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError("One or more row/column labels were not found")

# Pairwise selection: element i is dflookup.loc[ls[i], lookup_cols[i]].
looked_up = dflookup.to_numpy()[row_pos, col_pos]
(dflookup, ls, looked_up)

(           A         B         C         D
 0   0.365783  0.213540  0.559158  0.311500
 1   0.029847  0.240892  0.593806  0.328131
 2   0.467852  0.630774  0.356648  0.141014
 3   0.152653  0.424152  0.674738  0.820603
 4   0.189056  0.573216  0.676551  0.245686
 5   0.979267  0.825305  0.712291  0.157851
 6   0.310867  0.802896  0.110036  0.918953
 7   0.128436  0.404348  0.367253  0.989087
 8   0.854872  0.532358  0.601008  0.644373
 9   0.970769  0.099879  0.696755  0.184855
 10  0.255677  0.252582  0.395815  0.425303
 11  0.167196  0.541991  0.004629  0.683824
 12  0.493930  0.726377  0.745746  0.754956
 13  0.469153  0.339187  0.792630  0.745978
 14  0.228890  0.220855  0.243986  0.413406
 15  0.034518  0.304291  0.438455  0.428146
 16  0.685435  0.733894  0.758555  0.209567
 17  0.757278  0.559890  0.927517  0.904777
 18  0.808719  0.729581  0.442433  0.306668
 19  0.982712  0.870783  0.382494  0.007571,
 [0, 2, 4, 6, 8],
 array([0.2135404 , 0.35664816, 0.18905626, 0.80289581, 0

In [22]:
# Index objects: built from a list of labels, optionally with an explicit
# dtype or a name. Membership tests work with `in`.
index = pd.Index(list('edab'))
index2 = pd.Index(list('edab'), dtype='object')
index3 = pd.Index(list('edab'), name='something')
print('d' in index, index3.name)

In [21]:
# Named row and column indexes: the names appear in the frame's header and
# the row-index name is carried over to a selected column.
index = pd.Index([0, 1, 2, 3, 4], name='rows')
columns = pd.Index(list('ABC'), name='cols')
df = pd.DataFrame(data=np.random.randn(5, 3), index=index, columns=columns)
(df, df['A'])


(cols         A         B         C
 rows                              
 0    -1.199344  0.384897 -1.240882
 1    -0.481623 -1.676249  1.773254
 2     1.257822  0.933209 -1.015521
 3    -0.507508 -1.996908  1.431405
 4     1.574604 -1.343136  0.699275,
 rows
 0   -1.199344
 1   -0.481623
 2    1.257822
 3   -0.507508
 4    1.574604
 Name: A, dtype: float64)

In [24]:
# rename() returns a renamed copy; the original Index keeps its (missing) name.
ind = pd.Index(data=[1, 2, 3])
(ind.rename(name="apple"), ind)


(Int64Index([1, 2, 3], dtype='int64', name='apple'),
 Int64Index([1, 2, 3], dtype='int64'))

In [26]:
# With inplace=True the existing Index object itself is renamed
# (nothing is returned), so `ind` shows the new name afterwards.
ind.set_names(names=["apple"], inplace=True)

ind


Int64Index([1, 2, 3], dtype='int64', name='apple')

In [27]:
# Assigning to the .name attribute renames the existing Index object directly.
ind.name = "bob"
ind


Int64Index([1, 2, 3], dtype='int64', name='bob')

In [28]:
# set_names, set_levels, and set_codes also take an optional level argument.
# Build a two-level MultiIndex as the cartesian product of 0..2 and one/two.
index = pd.MultiIndex.from_product(
    iterables=[range(3), ['one', 'two']],
    names=['first', 'second'],
)
index

MultiIndex([(0, 'one'),
            (0, 'two'),
            (1, 'one'),
            (1, 'two'),
            (2, 'one'),
            (2, 'two')],
           names=['first', 'second'])

In [29]:
# Unique labels of the MultiIndex level at position 1 (the level named 'second').
index.levels[1]



Index(['one', 'two'], dtype='object', name='second')

In [30]:
# Replace the labels of the level at position 1; a new MultiIndex is returned.
index.set_levels(levels=["a", "b"], level=1)



MultiIndex([(0, 'a'),
            (0, 'b'),
            (1, 'a'),
            (1, 'b'),
            (2, 'a'),
            (2, 'b')],
           names=['first', 'second'])

In [32]:
# Set operations on Index objects. The explicit methods are used because the
# `|` and `&` operators were deprecated as set operations and, in pandas 2.x,
# act as element-wise logical operators (which fails for string labels).
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
# Displayed: union, intersection, and the labels of `a` that are not in `b`.
a.union(b), a.intersection(b), a.difference(b)

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 Index(['c'], dtype='object'),
 Index(['a', 'b'], dtype='object'))

In [33]:
# Symmetric difference: labels present in exactly one of the two indexes.
# The `^` operator is avoided: it was deprecated as a set operation and in
# pandas 2.x performs an element-wise XOR, silently giving a different result.
idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([2, 3, 4, 5])
# The operation is symmetric, so both call orders yield the same labels.
idx1.symmetric_difference(idx2), idx2.symmetric_difference(idx1)


(Int64Index([1, 5], dtype='int64'), Int64Index([1, 5], dtype='int64'))

In [34]:
# fillna() on an Index returns a new Index with missing labels replaced.
idx1 = pd.Index(data=[1, np.nan, 3, 4])
(idx1, idx1.fillna(value=2))

(Float64Index([1.0, nan, 3.0, 4.0], dtype='float64'),
 Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64'))

In [35]:
# The same works for datetime indexes: NaT is the missing-value marker and is
# replaced by the supplied Timestamp.
idx2 = pd.DatetimeIndex(data=[
    pd.Timestamp('2011-01-01'),
    pd.NaT,
    pd.Timestamp('2011-01-03'),
])

(idx2, idx2.fillna(value=pd.Timestamp('2011-01-02')))

(DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None),
 DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq=None))

In [38]:
# Set / reset index
# Random 20x4 frame whose columns are promoted to / demoted from the index
# in the cells that follow.
data = pd.DataFrame(data=np.random.rand(20, 4), columns=list('ABCD'))

data


Unnamed: 0,A,B,C,D
0,0.704104,0.736158,0.916108,0.394503
1,0.92557,0.294871,0.816624,0.823994
2,0.360394,0.708637,0.793828,0.71619
3,0.990176,0.751028,0.536311,0.208417
4,0.866339,0.995987,0.373178,0.307267
5,0.43606,0.84047,0.957799,0.758362
6,0.933912,0.996639,0.343508,0.460799
7,0.340296,0.464141,0.401227,0.262922
8,0.57456,0.33643,0.280548,0.219491
9,0.354032,0.166113,0.296436,0.589116


In [39]:
# Use column C as the row index; a new frame is returned and C is no longer
# one of its columns.
indexed1 = data.set_index(keys='C')
indexed1


Unnamed: 0_level_0,A,B,D
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.916108,0.704104,0.736158,0.394503
0.816624,0.92557,0.294871,0.823994
0.793828,0.360394,0.708637,0.71619
0.536311,0.990176,0.751028,0.208417
0.373178,0.866339,0.995987,0.307267
0.957799,0.43606,0.84047,0.758362
0.343508,0.933912,0.996639,0.460799
0.401227,0.340296,0.464141,0.262922
0.280548,0.57456,0.33643,0.219491
0.296436,0.354032,0.166113,0.589116


In [40]:
# Passing a list of columns produces a two-level (A, B) row index.
indexed2 = data.set_index(keys=['A', 'B'])
indexed2


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
0.704104,0.736158,0.916108,0.394503
0.92557,0.294871,0.816624,0.823994
0.360394,0.708637,0.793828,0.71619
0.990176,0.751028,0.536311,0.208417
0.866339,0.995987,0.373178,0.307267
0.43606,0.84047,0.957799,0.758362
0.933912,0.996639,0.343508,0.460799
0.340296,0.464141,0.401227,0.262922
0.57456,0.33643,0.280548,0.219491
0.354032,0.166113,0.296436,0.589116


In [43]:
# drop=False keeps C as a regular column in addition to using it as the index.
frame1 = data.set_index(keys='C', drop=False)
# append=True adds A and B to the existing index instead of replacing it,
# so frame2 ends up with a three-level index (C, A, B).
frame2 = frame1.set_index(keys=['A', 'B'], append=True)
(frame1, frame2)

(                 A         B         C         D
 C                                               
 0.916108  0.704104  0.736158  0.916108  0.394503
 0.816624  0.925570  0.294871  0.816624  0.823994
 0.793828  0.360394  0.708637  0.793828  0.716190
 0.536311  0.990176  0.751028  0.536311  0.208417
 0.373178  0.866339  0.995987  0.373178  0.307267
 0.957799  0.436060  0.840470  0.957799  0.758362
 0.343508  0.933912  0.996639  0.343508  0.460799
 0.401227  0.340296  0.464141  0.401227  0.262922
 0.280548  0.574560  0.336430  0.280548  0.219491
 0.296436  0.354032  0.166113  0.296436  0.589116
 0.488706  0.083628  0.632132  0.488706  0.782122
 0.127868  0.302297  0.340238  0.127868  0.208864
 0.196307  0.544654  0.143744  0.196307  0.024907
 0.962019  0.916081  0.434728  0.962019  0.738299
 0.254395  0.528340  0.209723  0.254395  0.351243
 0.337707  0.469051  0.468194  0.337707  0.050327
 0.724459  0.823636  0.147923  0.724459  0.203944
 0.158303  0.826668  0.777286  0.158303  0.752866


In [44]:
# inplace=True modifies `data` itself: from here on A and B are index levels,
# not columns, so this cell is not meant to be run a second time.
data.set_index(keys=['A', 'B'], inplace=True)
data


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
0.704104,0.736158,0.916108,0.394503
0.92557,0.294871,0.816624,0.823994
0.360394,0.708637,0.793828,0.71619
0.990176,0.751028,0.536311,0.208417
0.866339,0.995987,0.373178,0.307267
0.43606,0.84047,0.957799,0.758362
0.933912,0.996639,0.343508,0.460799
0.340296,0.464141,0.401227,0.262922
0.57456,0.33643,0.280548,0.219491
0.354032,0.166113,0.296436,0.589116


In [45]:
# Move the index levels back into ordinary columns and restore a default
# integer index; the result is a new frame.
data.reset_index()



Unnamed: 0,A,B,C,D
0,0.704104,0.736158,0.916108,0.394503
1,0.92557,0.294871,0.816624,0.823994
2,0.360394,0.708637,0.793828,0.71619
3,0.990176,0.751028,0.536311,0.208417
4,0.866339,0.995987,0.373178,0.307267
5,0.43606,0.84047,0.957799,0.758362
6,0.933912,0.996639,0.343508,0.460799
7,0.340296,0.464141,0.401227,0.262922
8,0.57456,0.33643,0.280548,0.219491
9,0.354032,0.166113,0.296436,0.589116


In [47]:
# Reset only the index level at position 1, turning it back into a column
# while the remaining levels stay in the index.
(frame2, frame2.reset_index(level=1, drop=False))



(                                   C         D
 C        A        B                           
 0.916108 0.704104 0.736158  0.916108  0.394503
 0.816624 0.925570 0.294871  0.816624  0.823994
 0.793828 0.360394 0.708637  0.793828  0.716190
 0.536311 0.990176 0.751028  0.536311  0.208417
 0.373178 0.866339 0.995987  0.373178  0.307267
 0.957799 0.436060 0.840470  0.957799  0.758362
 0.343508 0.933912 0.996639  0.343508  0.460799
 0.401227 0.340296 0.464141  0.401227  0.262922
 0.280548 0.574560 0.336430  0.280548  0.219491
 0.296436 0.354032 0.166113  0.296436  0.589116
 0.488706 0.083628 0.632132  0.488706  0.782122
 0.127868 0.302297 0.340238  0.127868  0.208864
 0.196307 0.544654 0.143744  0.196307  0.024907
 0.962019 0.916081 0.434728  0.962019  0.738299
 0.254395 0.528340 0.209723  0.254395  0.351243
 0.337707 0.469051 0.468194  0.337707  0.050327
 0.724459 0.823636 0.147923  0.724459  0.203944
 0.158303 0.826668 0.777286  0.158303  0.752866
 0.613312 0.143555 0.745386  0.613312  0

In [50]:
# Returning a view versus a copy
# pd.set_option('mode.chained_assignment','raise')
# dfc.loc[0]['A'] = 1111  raises an exception
#
# Frame with two-level columns, used to compare chained indexing
# (dfmi['one']['second']) with a single .loc call on the full column key.
dfmi = pd.DataFrame(
    [list(row) for row in ('abcd', 'efgh', 'ijkl', 'mnop')],
    columns=pd.MultiIndex.from_product([['one', 'two'], ['first', 'second']]),
)
(dfmi, dfmi['one']['second'], dfmi.loc[:, ('one', 'second')])



(    one          two       
   first second first second
 0     a      b     c      d
 1     e      f     g      h
 2     i      j     k      l
 3     m      n     o      p,
 0    b
 1    f
 2    j
 3    n
 Name: second, dtype: object,
 0    b
 1    f
 2    j
 3    n
 Name: (one, second), dtype: object)