# pandans的数据结构

## series

1. 通过list构建Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
ser_obj=pd.Series(range(11))
print(ser_obj)
ser2=pd.Series(range(11,16))
print(ser2.head(3))
print(type(ser2))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
dtype: int64
0    11
1    12
2    13
dtype: int64
<class 'pandas.core.series.Series'>


2. 获取数据和索引

In [3]:
#获取数据，返回一个列表
print(ser_obj.values) 

[ 0  1  2  3  4  5  6  7  8  9 10]


In [4]:
#获取索引
print(ser_obj.index)

RangeIndex(start=0, stop=11, step=1)


3. 通过索引获取数据

In [5]:
print(ser_obj[5])
print(ser_obj[0])

5
0


4. 索引与数据的对应关系不被运算结果影响

In [6]:
print(ser_obj*3)
print(ser_obj>7)

0      0
1      3
2      6
3      9
4     12
5     15
6     18
7     21
8     24
9     27
10    30
dtype: int64
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9      True
10     True
dtype: bool


5. 通过dict构建Series

In [7]:
year_data={2001:15.2,2002:74.5,2003:45.2}
ser3=pd.Series(year_data)
print(ser3)

2001    15.2
2002    74.5
2003    45.2
dtype: float64


name属性

In [8]:
ser3.name="temp"
ser3.index.name="year"
print(ser3.head())

year
2001    15.2
2002    74.5
2003    45.2
Name: temp, dtype: float64


## DataFrame

1. 通过ndarray构建DataFrame

In [9]:
arr=np.random.randn(5,4)
print(arr)
dat=pd.DataFrame(arr)
print(dat.head())
print(dat)

[[ 0.06056102 -2.09707635 -1.53931997 -1.0461369 ]
 [ 0.76189205  1.57853707 -1.36036893  0.69654883]
 [-3.23545356  0.7385463   1.03762049  2.34025234]
 [ 0.10588288  0.14272749  1.34125327  0.61611027]
 [ 1.40707884 -0.04718134  0.51710994  0.67464822]]
          0         1         2         3
0  0.060561 -2.097076 -1.539320 -1.046137
1  0.761892  1.578537 -1.360369  0.696549
2 -3.235454  0.738546  1.037620  2.340252
3  0.105883  0.142727  1.341253  0.616110
4  1.407079 -0.047181  0.517110  0.674648
          0         1         2         3
0  0.060561 -2.097076 -1.539320 -1.046137
1  0.761892  1.578537 -1.360369  0.696549
2 -3.235454  0.738546  1.037620  2.340252
3  0.105883  0.142727  1.341253  0.616110
4  1.407079 -0.047181  0.517110  0.674648


2. 通过dict构建DataFrame

In [10]:
dict_data = {'A': 1, 
             'B': pd.Timestamp('20170426'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([3] * 4,dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'ITCast' }
dat1=pd.DataFrame(dict_data)
print(dat1)

   A          B    C  D       E       F
0  1 2017-04-26  1.0  3  Python  ITCast
1  1 2017-04-26  1.0  3    Java  ITCast
2  1 2017-04-26  1.0  3     C++  ITCast
3  1 2017-04-26  1.0  3       C  ITCast


3. 通过列索引获取列数据（Series类型）

In [11]:
print(dat1["B"])
print(dat1.B)  #.索引

0   2017-04-26
1   2017-04-26
2   2017-04-26
3   2017-04-26
Name: B, dtype: datetime64[ns]
0   2017-04-26
1   2017-04-26
2   2017-04-26
3   2017-04-26
Name: B, dtype: datetime64[ns]


4. 增加列数据

In [12]:
dat1["G"]=dat1["D"]+5
print(dat1.head())

   A          B    C  D       E       F  G
0  1 2017-04-26  1.0  3  Python  ITCast  8
1  1 2017-04-26  1.0  3    Java  ITCast  8
2  1 2017-04-26  1.0  3     C++  ITCast  8
3  1 2017-04-26  1.0  3       C  ITCast  8


5. 删除列

In [13]:
del(dat1['G'])
print(dat1)

   A          B    C  D       E       F
0  1 2017-04-26  1.0  3  Python  ITCast
1  1 2017-04-26  1.0  3    Java  ITCast
2  1 2017-04-26  1.0  3     C++  ITCast
3  1 2017-04-26  1.0  3       C  ITCast


# pandas的索引操作

## 索引对象Index

1. Series和DataFrame中的索引都是Index对象

In [14]:
print(ser2.index)
print(dat1.index)
print(dat1.columns)

RangeIndex(start=0, stop=5, step=1)
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


2. 索引对象不可变，保证了数据的安全

In [15]:
# 不可以更改索引对象，否则报错
dat1.index[1]=12 

TypeError: Index does not support mutable operations

In [None]:
# 常见的Index种类
# * Index，索引
# * Int64Index，整数索引
# * MultiIndex，层级索引
# * DatetimeIndex，时间戳类型

## Series索引

1. index 指定行索引名

In [16]:
ser4=pd.Series(range(5),index=['a','b','c','d','e'])
print(ser4)

a    0
b    1
c    2
d    3
e    4
dtype: int64


2. 行索引

In [31]:
print(ser4['b'])
print(ser4[4])

1
4


3. 切片索引

In [17]:
print(ser4[2:4])
print(ser4['b':'e'])

c    2
d    3
dtype: int64
b    1
c    2
d    3
e    4
dtype: int64


4. 不连续索引

In [18]:
print(ser4[[0,4,3]])
print(ser4[['a','c']])

a    0
e    4
d    3
dtype: int64
a    0
c    2
dtype: int64


5. 布尔索引

In [19]:
ser_bool=ser4>3
print(ser_bool)
print(ser4[ser_bool])

a    False
b    False
c    False
d    False
e     True
dtype: bool
e    4
dtype: int64


## DataFrame索引

1. columns 指定列索引名

In [20]:
dat2=pd.DataFrame(np.random.randn(5,4),columns=['a','b','c','d'])
print(dat2)

          a         b         c         d
0 -0.997646 -0.177095  0.745219  1.431672
1  2.981436  0.610565 -1.451229  0.608769
2  1.828074 -0.828007  1.780564 -0.057615
3  0.229768 -0.281316 -1.221296 -0.927641
4 -0.205912 -1.330973  1.284223  0.952533


2. 列索引

In [27]:
print(dat2['b'])

0   -0.177095
1    0.610565
2   -0.828007
3   -0.281316
4   -1.330973
Name: b, dtype: float64


3. 不连续索引

In [36]:
print(dat2[['a','c']])#返回元组类型

          a         c
0 -0.997646  0.745219
1  2.981436 -1.451229
2  1.828074  1.780564
3  0.229768 -1.221296
4 -0.205912  1.284223


4.列索引

In [37]:
# 使用 ：进行索引
print(dat2[1:2])

          a         b         c         d
1  2.981436  0.610565 -1.451229  0.608769


## 高级索引：标签、位置和混合

1. loc 标签索引

In [38]:
print(ser4['b':'d'])
print(ser4.loc['b':'d'])

b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64


In [39]:
print(dat2['a'])
print(dat2.loc[0:2,'a'])#基于索引内容

0   -0.997646
1    2.981436
2    1.828074
3    0.229768
4   -0.205912
Name: a, dtype: float64
0   -0.997646
1    2.981436
2    1.828074
Name: a, dtype: float64


2. iloc 位置索引(行和列)

In [40]:
print(ser4[1:3])
print(ser4.iloc[1:3])

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64


In [41]:
print(dat2.iloc[0:2,0])#基于索引编号

0   -0.997646
1    2.981436
Name: a, dtype: float64


# Pandas的对齐运算

## Series的对齐运算

1. Series 按行、索引对齐

In [42]:
s1=pd.Series(range(10,20),index=range(10))
s2=pd.Series(range(20,25),index=range(5))
print("s1: ")
print(s1)
print(" ")
print("s2: ")
print(s2)

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
 
s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int64


2. Series的对齐运算

In [43]:
s1+s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

## DataFrame的对齐运算

1. DataFrame按行、列索引对齐

In [44]:
dat3=pd.DataFrame(np.ones((2,2)),columns=['a','b'])
dat4=pd.DataFrame(np.ones((3,3)),columns=['a','b','c'])
print('dat3: ')
print(dat3)
print('dat4: ')
print(dat4)

dat3: 
     a    b
0  1.0  1.0
1  1.0  1.0
dat4: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


2. DataFrame的对齐运算

In [45]:
dat3+dat4

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


## 填充未对齐的数据进行运算

1. fill_value

In [46]:
print(s1)
print(s2)

print(s1.add(s2,fill_value=1))

print(dat3)
print(dat4)
dat3.sub(dat4,fill_value=3.)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
0    20
1    21
2    22
3    23
4    24
dtype: int64
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    16.0
6    17.0
7    18.0
8    19.0
9    20.0
dtype: float64
     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


Unnamed: 0,a,b,c
0,0.0,0.0,2.0
1,0.0,0.0,2.0
2,2.0,2.0,2.0


# Pandas的函数应用

## apply 和 applymap

1. 可直接使用NumPy的函数

In [47]:
dat5=pd.DataFrame(np.random.randn(5,4)-1)
print(dat5)
print(dat5.abs())

          0         1         2         3
0 -1.531346  0.243845 -0.321827 -0.837517
1 -1.848096 -0.175926  0.930495 -0.245552
2 -2.556657 -1.441853 -0.718223 -0.002228
3 -0.866673 -1.384753 -0.505750 -1.682080
4 -2.198906 -0.949085 -3.723829 -0.643172
          0         1         2         3
0  1.531346  0.243845  0.321827  0.837517
1  1.848096  0.175926  0.930495  0.245552
2  2.556657  1.441853  0.718223  0.002228
3  0.866673  1.384753  0.505750  1.682080
4  2.198906  0.949085  3.723829  0.643172


2. 通过apply将函数应用到列或行上

In [48]:
# 注意指定轴的方向，默认axis=0，方向是列
print(dat5.apply(lambda x:x.max()))

0   -0.866673
1    0.243845
2    0.930495
3   -0.002228
dtype: float64


In [49]:
# 注意指定轴的方向，默认axis=1，方向是行
print(dat5.apply(lambda x:x.max(),axis=1))

0    0.243845
1    0.930495
2   -0.002228
3   -0.505750
4   -0.643172
dtype: float64


3. 通过applymap将函数应用到每个数据上

In [50]:
f1=lambda x: "%.2f" %x
print(dat5.applymap(f1))

       0      1      2      3
0  -1.53   0.24  -0.32  -0.84
1  -1.85  -0.18   0.93  -0.25
2  -2.56  -1.44  -0.72  -0.00
3  -0.87  -1.38  -0.51  -1.68
4  -2.20  -0.95  -3.72  -0.64


## 排序

1. 索引排序

In [82]:
s4=pd.Series(range(10,15),index=np.random.randint(5,size=5))
print(s4)
s4.sort_index()

2    10
3    11
2    12
3    13
3    14
dtype: int64


2    10
2    12
3    11
3    13
3    14
dtype: int64

In [52]:
# 对DataFrame操作时注意轴方向
dat6=pd.DataFrame(np.random.randn(3,5),
                  index=np.random.randint(3,size=3),
                 columns=np.random.randint(5,size=5))
print(dat6)
dat6_isort=dat6.sort_index(axis=1,ascending=False)
print(dat6_isort)
dat6_jsort=dat6.sort_index(axis=1,ascending=True)
print(dat6_jsort)

          0         2         1         0         4
0 -0.410258 -0.920055 -0.429385 -0.757786 -1.149722
2 -0.407628 -0.694998  0.189097  1.078417 -0.507992
2  0.130873 -0.650671 -0.915658  0.218977 -1.216731
          4         2         1         0         0
0 -1.149722 -0.920055 -0.429385 -0.410258 -0.757786
2 -0.507992 -0.694998  0.189097 -0.407628  1.078417
2 -1.216731 -0.650671 -0.915658  0.130873  0.218977
          0         0         1         2         4
0 -0.410258 -0.757786 -0.429385 -0.920055 -1.149722
2 -0.407628  1.078417  0.189097 -0.694998 -0.507992
2  0.130873  0.218977 -0.915658 -0.650671 -1.216731


2. 按值排序

In [53]:
dat6_vsort=dat6.sort_values(by=4,ascending=False)
print(dat6_vsort)

          0         2         1         0         4
2 -0.407628 -0.694998  0.189097  1.078417 -0.507992
0 -0.410258 -0.920055 -0.429385 -0.757786 -1.149722
2  0.130873 -0.650671 -0.915658  0.218977 -1.216731


3.dataframe转换成list

In [54]:
import pandas as pd
li=pd.Series([4,31,5,51,3,7],index=['a','b','c','d','e','f'])
li=li.tolist()
li

[4, 31, 5, 51, 3, 7]

## 处理缺失数据

In [55]:
dat7 = pd.DataFrame([np.random.randn(3), [1., 2., np.nan],
                       [np.nan, 4., np.nan], [1., 2., 3.]])
print(dat7)

          0         1         2
0  0.223347 -1.318415 -1.258266
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000


1. 判断是否存在缺失值：isnull()

In [56]:
print(dat7.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


2. 丢弃缺失数据：dropna()

In [57]:
#根据axis轴方向，丢弃包含NaN的行或列
print(dat7.dropna(axis=1))
print(dat7.dropna(axis=0))

          1
0 -1.318415
1  2.000000
2  4.000000
3  2.000000
          0         1         2
0  0.223347 -1.318415 -1.258266
3  1.000000  2.000000  3.000000


3. 填充缺失数据：fillna()

In [58]:
print(dat7.fillna(101.))

            0         1           2
0    0.223347 -1.318415   -1.258266
1    1.000000  2.000000  101.000000
2  101.000000  4.000000  101.000000
3    1.000000  2.000000    3.000000


# 层级索引（hierarchical indexing）

In [59]:
ser5=pd.Series(np.random.randn(12),index=[
                ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
                [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
print(ser5)

a  0   -0.194765
   1    0.215063
   2   -2.976224
b  0    1.490704
   1   -0.064013
   2    0.640648
c  0    1.052872
   1   -0.923150
   2    0.145142
d  0   -0.084072
   1    2.054873
   2   -0.017266
dtype: float64


## MultiIndex索引对象

In [60]:
print(type(ser5.index))
print(ser5.index)

<class 'pandas.core.indexes.multi.MultiIndex'>
MultiIndex([('a', 0),
            ('a', 1),
            ('a', 2),
            ('b', 0),
            ('b', 1),
            ('b', 2),
            ('c', 0),
            ('c', 1),
            ('c', 2),
            ('d', 0),
            ('d', 1),
            ('d', 2)],
           )


## 选取子集

1. 外层选取：

In [61]:
print(ser5['b'])

0    1.490704
1   -0.064013
2    0.640648
dtype: float64


2. 内层选取：

In [62]:
print(ser5[:,2])

a   -2.976224
b    0.640648
c    0.145142
d   -0.017266
dtype: float64


## 交换分层顺序

In [63]:
#.swaplevel( )交换内层与外层索引。
print(ser5.swaplevel())

0  a   -0.194765
1  a    0.215063
2  a   -2.976224
0  b    1.490704
1  b   -0.064013
2  b    0.640648
0  c    1.052872
1  c   -0.923150
2  c    0.145142
0  d   -0.084072
1  d    2.054873
2  d   -0.017266
dtype: float64


## 交换并排序分层

In [64]:
#.sort_index( )先对外层索引进行排序，再对内层索引进行排序，默认是升序
print(ser5.swaplevel().sort_index())

0  a   -0.194765
   b    1.490704
   c    1.052872
   d   -0.084072
1  a    0.215063
   b   -0.064013
   c   -0.923150
   d    2.054873
2  a   -2.976224
   b    0.640648
   c    0.145142
   d   -0.017266
dtype: float64


# Pandas统计计算和描述

In [65]:
#axis=0 按列统计，axis=1按行统计
dat8=pd.DataFrame(np.random.randn(5,4),columns=['a','b','c','d'])
print(dat8)

          a         b         c         d
0 -1.277666 -0.339498  0.268980  0.049320
1  0.851745  0.087877  0.245402 -0.961089
2 -0.210805  0.658240 -0.936688 -0.821174
3  0.419167  1.718333 -0.057733 -0.419978
4  0.640996  1.615184 -0.881344 -0.529047


## 常用的统计计算

In [66]:
dat8.sum()

a    0.423437
b    3.740137
c   -1.361384
d   -2.681968
dtype: float64

In [67]:
dat8.max()

a    0.851745
b    1.718333
c    0.268980
d    0.049320
dtype: float64

In [68]:
#skipna 排除缺失值， 默认为True
dat8.min(axis=1,skipna=False)

0   -1.277666
1   -0.961089
2   -0.936688
3   -0.419978
4   -0.881344
dtype: float64

## 常用的统计描述

In [None]:
# describe 产生多个统计数据

In [69]:
print(dat8.describe())

              a         b         c         d
count  5.000000  5.000000  5.000000  5.000000
mean   0.084687  0.748027 -0.272277 -0.536394
std    0.859223  0.911047  0.595690  0.393107
min   -1.277666 -0.339498 -0.936688 -0.961089
25%   -0.210805  0.087877 -0.881344 -0.821174
50%    0.419167  0.658240 -0.057733 -0.529047
75%    0.640996  1.615184  0.245402 -0.419978
max    0.851745  1.718333  0.268980  0.049320


# Pandas分组与聚合

## 分组 (groupby)

In [None]:
# * 对数据集进行分组，然后对每组进行统计分析

# * SQL能够对数据进行过滤，分组聚合

# * pandas能利用groupby进行更加复杂的分组运算

# * 分组运算过程：split->apply->combine

# 1. 拆分：进行分组的根据

# 2. 应用：每个分组运行的计算规则

# 3. 合并：把每个分组的计算结果合并起来

In [1]:
import pandas as pd 
import numpy as np

In [70]:
dict_obj={  "key1":['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
data=pd.DataFrame(dict_obj)
print(data)

  key1   key2     data1     data2
0    a    one  1.378489 -0.491034
1    b    one  0.666807 -2.620402
2    a    two -0.278756  0.466457
3    b  three -1.335627 -0.074286
4    a    two  0.122681 -0.398661
5    b    two  0.666222 -1.660751
6    a    one -0.406009  1.137529
7    a  three  2.385765  0.853231


### GroupBy对象：DataFrameGroupBy，SeriesGroupBy

#### 1. 分组操作

In [None]:
# groupby()进行分组，GroupBy对象没有进行实际运算，只是包含分组的中间数据

# 按列名分组：obj.groupby(‘label’)

In [71]:
print(type(data.groupby('key1')))

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


In [72]:
# dataframe的 data1 列根据 key1 进行分组
print(data['data1'].groupby(data['key1']))

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000018A5910E130>


#### 2. 分组运算

In [None]:
# 对GroupBy对象进行分组运算/多重分组运算，如mean()

# 非数值数据不进行分组运算

In [73]:
# 分组运算
grouped1=data.groupby('key1')
print(grouped1.mean())
grouped2=data['data1'].groupby(data['key1'])
print(grouped2.mean())

         data1     data2
key1                    
a     0.640434  0.313505
b    -0.000866 -1.451813
key1
a    0.640434
b   -0.000866
Name: data1, dtype: float64


In [None]:
# size() 返回每个分组的元素个数

In [74]:
print(grouped1.size())
print(grouped2.size())

key1
a    5
b    3
dtype: int64
key1
a    5
b    3
Name: data1, dtype: int64


#### 3. 按自定义的key分组

In [None]:
# obj.groupby(self_def_key)

# 自定义的key可为列表或多层列表

# obj.groupby([‘label1’, ‘label2’])->多层dataframe

In [75]:
# 按自定义key分组，列表
self_key=[0,1,2,3,4,5,6,7]
print(data.groupby(self_key).size())

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
dtype: int64


In [10]:
# 按自定义key分组，多层列表
print(data.groupby([data['key1'],data['key2']]).size())

key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64


In [11]:
# 按多个列多层分组
grouped3=data.groupby(['key1','key2'])
print(grouped3.size())

key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64


In [12]:
# 多层分组按key的顺序进行
grouped4=data.groupby(['key2','key1'])
print(grouped4.mean())

               data1     data2
key2  key1                    
one   a     0.799571 -1.139142
      b     3.042813  0.549367
three a     1.580783 -0.319847
      b    -0.386822  1.013451
two   a     0.287292  1.227144
      b    -0.475521  2.397701


In [13]:
# unstack可以将多层索引的结果转换成单层的dataframe
print(grouped3.mean().unstack())

         data1                         data2                    
key2       one     three       two       one     three       two
key1                                                            
a     0.799571  1.580783  0.287292 -1.139142 -0.319847  1.227144
b     3.042813 -0.386822 -0.475521  0.549367  1.013451  2.397701


### 二、GroupBy对象支持迭代操作

In [None]:
# 每次迭代返回一个元组 (group_name, group_data)

# 可用于分组数据的具体运算

#### 1. 单层分组

In [76]:
# 单层分组，根据key1
for g_name,g_data in grouped1:
    print(g_name)
    print(g_data)

a
  key1   key2     data1     data2
0    a    one  1.378489 -0.491034
2    a    two -0.278756  0.466457
4    a    two  0.122681 -0.398661
6    a    one -0.406009  1.137529
7    a  three  2.385765  0.853231
b
  key1   key2     data1     data2
1    b    one  0.666807 -2.620402
3    b  three -1.335627 -0.074286
5    b    two  0.666222 -1.660751


### 2. 多层分组

In [17]:
# 多层分组，根据key1 和 key2
for g1_name,g1_data in grouped3:
    print(g1_name)
    print(g1_data)

('a', 'one')
  key1 key2     data1     data2
0    a  one  1.205226 -1.614700
6    a  one  0.393916 -0.663584
('a', 'three')
  key1   key2     data1     data2
7    a  three  1.580783 -0.319847
('a', 'two')
  key1 key2     data1     data2
2    a  two  0.241993  1.625021
4    a  two  0.332592  0.829268
('b', 'one')
  key1 key2     data1     data2
1    b  one  3.042813  0.549367
('b', 'three')
  key1   key2     data1     data2
3    b  three -0.386822  1.013451
('b', 'two')
  key1 key2     data1     data2
5    b  two -0.475521  2.397701


### 三、GroupBy对象可以转换成列表或字典

In [18]:
# GroupBy对象转换list
print(list(grouped1))

[('a',   key1   key2     data1     data2
0    a    one  1.205226 -1.614700
2    a    two  0.241993  1.625021
4    a    two  0.332592  0.829268
6    a    one  0.393916 -0.663584
7    a  three  1.580783 -0.319847), ('b',   key1   key2     data1     data2
1    b    one  3.042813  0.549367
3    b  three -0.386822  1.013451
5    b    two -0.475521  2.397701)]


In [20]:
# GroupBy对象转换dict
print(dict(list(grouped1)))

{'a':   key1   key2     data1     data2
0    a    one  1.205226 -1.614700
2    a    two  0.241993  1.625021
4    a    two  0.332592  0.829268
6    a    one  0.393916 -0.663584
7    a  three  1.580783 -0.319847, 'b':   key1   key2     data1     data2
1    b    one  3.042813  0.549367
3    b  three -0.386822  1.013451
5    b    two -0.475521  2.397701}


#### 1. 按列分组、按数据类型分组

In [21]:
# 按列分组
print(data.dtypes)

key1      object
key2      object
data1    float64
data2    float64
dtype: object


In [22]:
# 按数据类型分组
print(data.groupby(data.dtypes,axis=1).size())
print(data.groupby(data.dtypes,axis=1).sum())

float64    2
object     2
dtype: int64
    float64  object
0 -0.409473    aone
1  3.592179    bone
2  1.867013    atwo
3  0.626629  bthree
4  1.161860    atwo
5  1.922180    btwo
6 -0.269669    aone
7  1.260936  athree


#### 2. 其他分组方法

In [2]:
import pandas as pd
import numpy as np

In [7]:
da1=pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
da1.iloc[1,1:4]=np.NaN
print(da1)

   a    b    c    d  e
A  8  8.0  9.0  4.0  3
B  5  NaN  NaN  NaN  8
C  8  6.0  9.0  6.0  8
D  5  7.0  9.0  2.0  8
E  5  5.0  7.0  1.0  7


#### 3. 通过字典分组

In [8]:
ma_dict={'a':'Python', 'b':'Python', 'c':'Java', 'd':'C', 'e':'Java'}

In [9]:
print(da1.groupby(ma_dict,axis=1).size())

C         1
Java      2
Python    2
dtype: int64


In [10]:
print(da1.groupby(ma_dict,axis=1).count())

   C  Java  Python
A  1     2       2
B  0     1       1
C  1     2       2
D  1     2       2
E  1     2       2


In [11]:
print(da1.groupby(ma_dict,axis=1).count())

   C  Java  Python
A  1     2       2
B  0     1       1
C  1     2       2
D  1     2       2
E  1     2       2


In [13]:
print(da1.groupby(ma_dict,axis=1).sum())

     C  Java  Python
A  4.0  12.0    16.0
B  0.0   8.0     5.0
C  6.0  17.0    14.0
D  2.0  17.0    12.0
E  1.0  14.0    10.0


#### 4. 通过函数分组，函数传入的参数为行索引或列索引

In [27]:
da2=pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['AA', 'BBB', 'CC', 'D', 'EE'])

In [28]:
def group_key(idx):
    return len(idx)
print(da2.groupby(group_key).size())

1    1
2    3
3    1
dtype: int64


#### 5. 通过索引级别分组

In [3]:
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])

In [5]:
da1=pd.DataFrame(np.random.randint(1,10,(5,5)),columns=columns)
print(da1)

language Python Java Python Java Python
index         A    A      B    C      B
0             6    7      6    7      7
1             8    5      2    7      3
2             1    2      2    1      8
3             9    8      8    3      5
4             2    5      2    4      2


## 聚合 (aggregation)

In [None]:
# 数组产生标量的过程，如mean()、count()等

# 常用于对分组后的数据进行计算

In [29]:
dic1 = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10, 8),
            'data2': np.random.randint(1,10, 8)}

In [30]:
dic1=pd.DataFrame(dic1)

In [31]:
print(dic1)

  key1   key2  data1  data2
0    a    one      1      9
1    b    one      9      1
2    a    two      9      3
3    b  three      6      9
4    a    two      8      4
5    b    two      8      1
6    a    one      9      9
7    a  three      3      5


#### 1. 内置的聚合函数

In [None]:
# sum(), mean(), max(), min(), count(), size(), describe()

In [32]:
print(dic1.groupby('key1').count())

      key2  data1  data2
key1                    
a        5      5      5
b        3      3      3


In [33]:
print(dic1.groupby('key1').size())  #统计key1中索引的个数

key1
a    5
b    3
dtype: int64


In [13]:
print(dic1.groupby('key2').size())

key2
one      3
three    2
two      3
dtype: int64


In [14]:
print(dic1.groupby('key2').count())#统计关键词的个数

       key1  data1  data2
key2                     
one       3      3      3
three     2      2      2
two       3      3      3


In [15]:
print(dic1.groupby('key1').describe())

     data1                                         data2                      \
     count mean       std  min  25%  50%  75%  max count      mean       std   
key1                                                                           
a      5.0  4.8  3.768289  1.0  1.0  5.0  8.0  9.0   5.0  3.400000  1.673320   
b      3.0  8.0  1.000000  7.0  7.5  8.0  8.5  9.0   3.0  4.666667  3.785939   

                               
      min  25%  50%  75%  max  
key1                           
a     2.0  2.0  3.0  4.0  6.0  
b     2.0  2.5  3.0  6.0  9.0  


#### 2. 可自定义函数，传入agg方法中

In [None]:
# grouped.agg(func)

# func的参数为groupby索引对应的记录

In [16]:
def pak(df):
    return df.max()-df.min()

In [17]:
print(dic1.groupby('key1').agg(pak))

      data1  data2
key1              
a         8      4
b         2      7


  print(dic1.groupby('key1').agg(pak))


#### 3. 应用多个聚合函数

In [None]:
# 同时应用多个函数进行聚合操作，使用函数列表

In [18]:
 # 默认列名为函数名
print(dic1.groupby('key1').agg(['mean','std','count',pak]))

     data1                         data2                    
      mean       std count pak      mean       std count pak
key1                                                        
a      4.8  3.768289     5   8  3.400000  1.673320     5   4
b      8.0  1.000000     3   2  4.666667  3.785939     3   7


  print(dic1.groupby('key1').agg(['mean','std','count',pak]))


In [20]:
# 通过元组提供新的列名
print(dic1.groupby('key1').agg(['mean', 'std', 'count', ('range', pak)]))

     data1                           data2                      
      mean       std count range      mean       std count range
key1                                                            
a      4.8  3.768289     5     8  3.400000  1.673320     5     4
b      8.0  1.000000     3     2  4.666667  3.785939     3     7


  print(dic1.groupby('key1').agg(['mean', 'std', 'count', ('range', pak)]))


#### 4. 对不同的列分别作用不同的聚合函数，使用dict

In [22]:
dict={'data1':'mean','data2':'sum'}
print(dic1.groupby('key2').agg(dict))

          data1  data2
key2                  
one    4.666667     15
three  8.500000      4
two    5.666667     12


In [23]:
dict1 = {'data1':['mean','max'],
                'data2':'sum'}
print(dic1.groupby('key1').agg(dict1))

     data1     data2
      mean max   sum
key1                
a      4.8   9    17
b      8.0   9    14


## 数据的分组运算

In [34]:
dic2 = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1, 10, 8),
            'data2': np.random.randint(1, 10, 8)}

In [None]:
# add_prefix(),增加前缀函数

In [35]:
dic2=pd.DataFrame(dic2)
print(dic2)

  key1   key2  data1  data2
0    a    one      2      1
1    b    one      2      2
2    a    two      3      9
3    b  three      6      2
4    a    two      1      6
5    b    two      6      8
6    a    one      4      4
7    a  three      1      3


In [36]:
# 按key1分组后，计算data1，data2的统计信息并附加到原始表格中，并添加表头前缀
k1_sum=dic2.groupby('key1').sum().add_prefix('sum_')
print(k1_sum)

      sum_data1  sum_data2
key1                      
a            11         23
b            14         12


1. merge进行外连接

In [37]:
k1_mer_sum=pd.merge(dic1,k1_sum,left_on='key1',right_index=True)
print(k1_mer_sum)

  key1   key2  data1  data2  sum_data1  sum_data2
0    a    one      1      9         11         23
2    a    two      9      3         11         23
4    a    two      8      4         11         23
6    a    one      9      9         11         23
7    a  three      3      5         11         23
1    b    one      9      1         14         12
3    b  three      6      9         14         12
5    b    two      8      1         14         12


#### groupby.apply(func)

In [38]:
import pandas as pd
data_path='./train.csv'
datax=pd.read_csv(data_path,usecols=['PassengerId','Pclass','Sex','Age'])

FileNotFoundError: [Errno 2] No such file or directory: './train.csv'

In [83]:

print(datax.head(30))

    PassengerId  Pclass     Sex   Age
0             1       3    male  22.0
1             2       1  female  38.0
2             3       3  female  26.0
3             4       1  female  35.0
4             5       3    male  35.0
5             6       3    male   NaN
6             7       1    male  54.0
7             8       3    male   2.0
8             9       3  female  27.0
9            10       2  female  14.0
10           11       3  female   4.0
11           12       1  female  58.0
12           13       3    male  20.0
13           14       3    male  39.0
14           15       3  female  14.0
15           16       2  female  55.0
16           17       3    male   2.0
17           18       2    male   NaN
18           19       3  female  31.0
19           20       3  female   NaN
20           21       2    male  35.0
21           22       2    male  34.0
22           23       3  female  15.0
23           24       1    male  28.0
24           25       3  female   8.0
25          

In [84]:
datax.fillna({"Age":0.},inplace=True)

In [85]:
print(datax.head(30))

    PassengerId  Pclass     Sex   Age
0             1       3    male  22.0
1             2       1  female  38.0
2             3       3  female  26.0
3             4       1  female  35.0
4             5       3    male  35.0
5             6       3    male   0.0
6             7       1    male  54.0
7             8       3    male   2.0
8             9       3  female  27.0
9            10       2  female  14.0
10           11       3  female   4.0
11           12       1  female  58.0
12           13       3    male  20.0
13           14       3    male  39.0
14           15       3  female  14.0
15           16       2  female  55.0
16           17       3    male   2.0
17           18       2    male   0.0
18           19       3  female  31.0
19           20       3  female   0.0
20           21       2    male  35.0
21           22       2    male  34.0
22           23       3  female  15.0
23           24       1    male  28.0
24           25       3  female   8.0
25          

In [89]:
#返回每个分组按 column 的 top n 数据
def top_n(df,n=5,column="Age"):
    return df.sort_values(by=column,ascending=False)[:10]
print(datax.groupby('Pclass').apply(top_n))

            PassengerId  Pclass     Sex   Age
Pclass                                       
1      630          631       1    male  80.0
       493          494       1    male  71.0
       96            97       1    male  71.0
       745          746       1    male  70.0
       456          457       1    male  65.0
       54            55       1    male  65.0
       438          439       1    male  64.0
       545          546       1    male  64.0
       275          276       1  female  63.0
       252          253       1    male  62.0
2      672          673       2    male  70.0
       33            34       2    male  66.0
       570          571       2    male  62.0
       684          685       2    male  60.0
       232          233       2    male  59.0
       772          773       2  female  57.0
       626          627       2    male  57.0
       15            16       2  female  55.0
       774          775       2  female  54.0
       317          318       2   

1. 产生层级索引：外层索引是分组名，内层索引是df_obj的行索引

In [90]:
# apply函数接收的参数会传入自定义的函数中
print(datax.groupby('Pclass').apply(top_n,n=10,column='Age'))

            PassengerId  Pclass     Sex   Age
Pclass                                       
1      630          631       1    male  80.0
       493          494       1    male  71.0
       96            97       1    male  71.0
       745          746       1    male  70.0
       456          457       1    male  65.0
       54            55       1    male  65.0
       438          439       1    male  64.0
       545          546       1    male  64.0
       275          276       1  female  63.0
       252          253       1    male  62.0
2      672          673       2    male  70.0
       33            34       2    male  66.0
       570          571       2    male  62.0
       684          685       2    male  60.0
       232          233       2    male  59.0
       772          773       2  female  57.0
       626          627       2    male  57.0
       15            16       2  female  55.0
       774          775       2  female  54.0
       317          318       2   

2. 禁止层级索引, group_keys=False

In [92]:
print(datax.groupby('Pclass',group_keys=False).apply(top_n,n=10,column='Age'))

     PassengerId  Pclass     Sex   Age
630          631       1    male  80.0
493          494       1    male  71.0
96            97       1    male  71.0
745          746       1    male  70.0
456          457       1    male  65.0
54            55       1    male  65.0
438          439       1    male  64.0
545          546       1    male  64.0
275          276       1  female  63.0
252          253       1    male  62.0
672          673       2    male  70.0
33            34       2    male  66.0
570          571       2    male  62.0
684          685       2    male  60.0
232          233       2    male  59.0
772          773       2  female  57.0
626          627       2    male  57.0
15            16       2  female  55.0
774          775       2  female  54.0
317          318       2    male  54.0
851          852       3    male  74.0
116          117       3    male  70.5
280          281       3    male  65.0
483          484       3  female  63.0
326          327       3 

# 数据清洗

## 数据连接(pd.merge)

In [None]:
# pd.merge

# 根据单个或多个键将不同DataFrame的行连接起来

# 类似数据库的连接操作

In [93]:
import pandas as pd
import numpy as np

In [39]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

In [40]:
print(df1)
print(df2)

  key  data1
0   b      8
1   b      3
2   a      8
3   c      9
4   a      9
5   a      5
6   b      4
  key  data2
0   a      2
1   b      9
2   d      0


1. 默认将重叠列的列名作为“外键”进行连接

In [41]:
print(pd.merge(df1,df2))

  key  data1  data2
0   b      8      9
1   b      3      9
2   b      4      9
3   a      8      2
4   a      9      2
5   a      5      2


2. on显示指定“外键”

In [42]:
print(pd.merge(df1,df2,on='key'))

  key  data1  data2
0   b      8      9
1   b      3      9
2   b      4      9
3   a      8      2
4   a      9      2
5   a      5      2


3. left_on，左侧数据的“外键”，right_on，右侧数据的“外键”

In [43]:
# 更改列名
dat1 = df1.rename(columns={'key':'key1'})
dat2 = df2.rename(columns={'key':'key2'})

In [44]:
print(pd.merge(dat1,dat2,left_on='key1',right_on='key2'))

  key1  data1 key2  data2
0    b      8    b      9
1    b      3    b      9
2    b      4    b      9
3    a      8    a      2
4    a      9    a      2
5    a      5    a      2


In [None]:
# 默认是“内连接”(inner)，即结果中的键是交集

# how指定连接方式

4. “外连接”(outer)，结果中的键是并集

In [45]:
print(pd.merge(dat1,dat2,left_on='key1',right_on='key2',how='outer'))

  key1  data1 key2  data2
0    b    8.0    b    9.0
1    b    3.0    b    9.0
2    b    4.0    b    9.0
3    a    8.0    a    2.0
4    a    9.0    a    2.0
5    a    5.0    a    2.0
6    c    9.0  NaN    NaN
7  NaN    NaN    d    0.0


5. “左连接”(left)

In [46]:
print(pd.merge(dat1,dat2,left_on='key1',right_on='key2',how='left'))

  key1  data1 key2  data2
0    b      8    b    9.0
1    b      3    b    9.0
2    a      8    a    2.0
3    c      9  NaN    NaN
4    a      9    a    2.0
5    a      5    a    2.0
6    b      4    b    9.0


6. “右连接”(right)

In [47]:
print(pd.merge(dat1,dat2,left_on='key1',right_on='key2',how='right'))

  key1  data1 key2  data2
0    a    8.0    a      2
1    a    9.0    a      2
2    a    5.0    a      2
3    b    8.0    b      9
4    b    3.0    b      9
5    b    4.0    b      9
6  NaN    NaN    d      0


7. 处理重复列名

In [None]:
# suffixes，默认为_x, _y

In [48]:
dat3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
dat4= pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

In [49]:
print(dat3)
print(dat4)

  key  data
0   b     5
1   b     5
2   a     9
3   c     4
4   a     6
5   a     2
6   b     3
  key  data
0   a     1
1   b     2
2   d     8


In [50]:
print(pd.merge(dat3,dat4,on='key',suffixes=('_left','_right')))

  key  data_left  data_right
0   b          5           2
1   b          5           2
2   b          3           2
3   a          9           1
4   a          6           1
5   a          2           1


8. 按索引连接

In [None]:
# left_index=True或right_index=True

In [51]:
df3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])

In [52]:
print(pd.merge(df3,df4,left_on='key',right_index=True))

  key  data1  data2
0   b      3      8
1   b      3      8
6   b      5      8
2   a      0      3
4   a      7      3
5   a      0      3


## 数据合并(pd.concat)

1. NumPy的concat

In [53]:
arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))
print(arr1)
print(arr2)

[[3 2 3 0]
 [8 3 8 4]
 [4 9 9 0]]
[[8 1 1 7]
 [0 2 6 2]
 [7 4 2 3]]


In [54]:
print(np.concatenate([arr1,arr2]))

[[3 2 3 0]
 [8 3 8 4]
 [4 9 9 0]
 [8 1 1 7]
 [0 2 6 2]
 [7 4 2 3]]


In [55]:
print(np.concatenate([arr1,arr2],axis=1))

[[3 2 3 0 8 1 1 7]
 [8 3 8 4 0 2 6 2]
 [4 9 9 0 7 4 2 3]]


2. pd.concat

In [None]:
# 注意指定轴方向，默认axis=0

# join指定合并方式，默认为outer

# Series合并时查看行索引有无重复

1) index 没有重复的情况

In [56]:
ser1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))

In [57]:
print(ser1)
print(ser2)
print(ser3)

0    9
1    2
2    7
3    0
4    3
dtype: int32
5    9
6    4
7    6
8    5
dtype: int32
9     1
10    4
11    7
dtype: int32


In [113]:
print(pd.concat([ser1,ser2,ser3]))

0     8
1     1
2     8
3     5
4     8
5     2
6     9
7     2
8     9
9     9
10    3
11    6
dtype: int32


In [58]:
print(pd.concat([ser1,ser2,ser3],axis=1))

      0    1    2
0   9.0  NaN  NaN
1   2.0  NaN  NaN
2   7.0  NaN  NaN
3   0.0  NaN  NaN
4   3.0  NaN  NaN
5   NaN  9.0  NaN
6   NaN  4.0  NaN
7   NaN  6.0  NaN
8   NaN  5.0  NaN
9   NaN  NaN  1.0
10  NaN  NaN  4.0
11  NaN  NaN  7.0


2) index 有重复的情况

In [59]:
# index 有重复的情况
ser4 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser5 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser6 = pd.Series(np.random.randint(0, 10, 3), index=range(3))

In [60]:
print(ser4)
print(ser5)
print(ser6)

0    9
1    7
2    7
3    4
4    2
dtype: int32
0    8
1    4
2    1
3    0
dtype: int32
0    9
1    1
2    6
dtype: int32


In [61]:
print(pd.concat([ser4,ser5,ser6]))

0    9
1    7
2    7
3    4
4    2
0    8
1    4
2    1
3    0
0    9
1    1
2    6
dtype: int32


3) DataFrame合并时同时查看行索引和列索引有无重复

In [62]:
df4= pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
                       columns=['A', 'B'])
df5 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
                       columns=['C', 'D'])

In [63]:
print(df4)
print(df5)

   A  B
a  5  7
b  9  7
c  2  3
   C  D
a  5  9
b  9  8


In [64]:
print(pd.concat([df4,df5]))

     A    B    C    D
a  5.0  7.0  NaN  NaN
b  9.0  7.0  NaN  NaN
c  2.0  3.0  NaN  NaN
a  NaN  NaN  5.0  9.0
b  NaN  NaN  9.0  8.0


In [65]:
print(pd.concat([df4,df5],axis=1,join='inner'))

   A  B  C  D
a  5  7  5  9
b  9  7  9  8


## 数据重构

In [None]:
# 1. stack

# 将列索引旋转为行索引，完成层级索引

# DataFrame->Series

In [66]:
da1 = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])

In [67]:
print(da1)

   data1  data2
0      7      2
1      9      4
2      2      0
3      1      3
4      2      2


In [68]:
stacked=da1.stack()
print(stacked)

0  data1    7
   data2    2
1  data1    9
   data2    4
2  data1    2
   data2    0
3  data1    1
   data2    3
4  data1    2
   data2    2
dtype: int32


2. unstack

In [None]:
# 将层级索引展开

# Series->DataFrame

# 认操作内层索引，即level=-1

In [69]:
print(stacked.unstack())

   data1  data2
0      7      2
1      9      4
2      2      0
3      1      3
4      2      2


In [70]:
# 通过level指定操作索引的级别
print(stacked.unstack(level=0))

       0  1  2  3  4
data1  7  9  2  1  2
data2  2  4  0  3  2


## 数据转换

一、 处理重复数据

1 duplicated() 返回布尔型Series表示每行是否为重复行

In [71]:
dfx = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
                       'data2' : np.random.randint(0, 4, 8)})

In [72]:
print(dfx)

  data1  data2
0     a      0
1     a      1
2     a      0
3     a      1
4     b      2
5     b      2
6     b      2
7     b      1


In [73]:
print(dfx.duplicated())

0    False
1    False
2     True
3     True
4    False
5     True
6     True
7    False
dtype: bool


2 drop_duplicates() 过滤重复行

In [74]:
print(dfx.drop_duplicates())

  data1  data2
0     a      0
1     a      1
4     b      2
7     b      1


In [75]:
print(dfx.drop_duplicates('data2'))

  data1  data2
0     a      0
1     a      1
4     b      2


3. 根据map传入的函数对每行或每列进行转换

Series根据map传入的函数对每行或每列进行转换

In [76]:
ser6 = pd.Series(np.random.randint(0,10,10))

In [77]:
print(ser6)

0    4
1    5
2    9
3    7
4    5
5    0
6    9
7    4
8    7
9    7
dtype: int32


In [78]:
print(ser6.map(lambda x:x**2))

0    16
1    25
2    81
3    49
4    25
5     0
6    81
7    16
8    49
9    49
dtype: int64


二、数据替换

replace根据值的内容进行替换

In [79]:
# 单个值替换单个值
print(ser6.replace(1,-10))

0    4
1    5
2    9
3    7
4    5
5    0
6    9
7    4
8    7
9    7
dtype: int32


In [80]:
# 多个值替换一个值
print(ser6.replace([6,8],-11))

0    4
1    5
2    9
3    7
4    5
5    0
6    9
7    4
8    7
9    7
dtype: int32


In [81]:
# 多个值替换多个值
print(ser6.replace([5,9],[1000,2222]))

0       4
1    1000
2    2222
3       7
4    1000
5       0
6    2222
7       4
8       7
9       7
dtype: int32
