In [1]:
import pandas as pd
import numpy as np

In [2]:
df1=pd.DataFrame({'key':['b','b','c','a','a','a','b'],
                 'data1':range(7)})
df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
# on='key'可以省略，当只有一个键的时候
# how='inner' 默认为内连接，即求交集
pd.merge(df1,df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,3,0
4,a,4,0
5,a,5,0


In [3]:
pd.merge(df1,df2,on='key',how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,c,2,
3,a,3,0.0
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [4]:
# 多对多合并
# e.g df1有3行键为‘a’,df2有2行键为'a'，则on='key'时，关于键'a'，共产生3*2=6行结果
df3=pd.DataFrame({'key':['a','b','a','a','c'],'data3':range(5)})
# 关于'a'共有9行结果
pd.merge(df1,df3,on='key',sort=True)

Unnamed: 0,key,data1,data3
0,a,3,0
1,a,3,2
2,a,3,3
3,a,4,0
4,a,4,2
5,a,4,3
6,a,5,0
7,a,5,2
8,a,5,3
9,b,0,1


In [5]:
# 两行没有共同的键时，可以指定两个dataframe的列
df4=pd.DataFrame({'key4':['a','b','c','a'],'data4':range(4)})
pd.merge(df1,df4,left_on='key',right_on='key4')

Unnamed: 0,key,data1,key4,data4
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,c,2,c,2
4,a,3,a,0
5,a,3,a,3
6,a,4,a,0
7,a,4,a,3
8,a,5,a,0
9,a,5,a,3


In [6]:
# 多个键合并
left=pd.DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
left
right=pd.DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})

pd.merge(left,right,on=['key1','key2'],how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [7]:
# 有两列键相同，但只用一个键合并时
# left_index指出左侧的dataframe用于连接键的列
pd.merge(left,right,on='key1',suffixes=('_left','_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [8]:
# 连接键位于索引
left1=pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [9]:
# 连接键位于索引
right1=pd.DataFrame({'group_val':[3.5,7.]},index=['a','b'])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [10]:
pd.merge(left1,right1,left_on='key',right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [11]:
# 层级索引
lf=pd.DataFrame({'key1':['ohio','ohio','ohio','nevada','nevada'],
                 'key2':[2000,2001,2002,2000,2001],
                'data':np.arange(5.)})
rh=pd.DataFrame(np.arange(12).reshape((6,2)),
                index=[['nevada','nevada','ohio','ohio','ohio','ohio'],[2000,2001,2000,2001,2002,2003]],
                columns=['event1','event2'])
rh

Unnamed: 0,Unnamed: 1,event1,event2
nevada,2000,0,1
nevada,2001,2,3
ohio,2000,4,5
ohio,2001,6,7
ohio,2002,8,9
ohio,2003,10,11


In [12]:
# 层级索引
pd.merge(lf,rh,left_on=['key1','key2'],how='outer',right_index=True,sort=True)

Unnamed: 0,key1,key2,data,event1,event2
3,nevada,2000,3.0,0,1
4,nevada,2001,4.0,2,3
0,ohio,2000,0.0,4,5
1,ohio,2001,1.0,6,7
2,ohio,2002,2.0,8,9
4,ohio,2003,,10,11


In [13]:
lf2=pd.DataFrame([[1,2],[3,4],[5,6]],index=['a','c','e'],columns=['ohio','nevada'])
rh2=pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],index=['b','c','d','e'],columns=['missouri','alabama'])
# 合并双方的索引
pd.merge(lf2,rh2,how='outer',left_index=True,right_index=True)

Unnamed: 0,ohio,nevada,missouri,alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [14]:
# join方法
# 实现按照索引合并，或者合并多个带有相同或者相似字段的dataframe

# 效果和上例类似
lf2.join(rh2,how='outer')

Unnamed: 0,ohio,nevada,missouri,alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [15]:
# ！！！！
# 实现按照索引合并，或者合并多个带有相同或者相似字段的dataframe
# 默认为左连接
left1.join(right1,on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [16]:
another=pd.DataFrame(np.arange(11.,19).reshape((4,2)),index=['a','c','e','f'],columns=['new yokk','oregon'])

# 多个dataframe进行join
lf2.join([rh2,another],how='outer',sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  verify_integrity=True)


Unnamed: 0,ohio,nevada,missouri,alabama,new yokk,oregon
a,1.0,2.0,,,11.0,12.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,13.0,14.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,15.0,16.0
f,,,,,17.0,18.0


In [17]:
# !!!!!!!!
# 轴向连接
# 在某个轴上实现堆叠功能
# 在numpy中
arr=np.arange(12.).reshape(3,4)
np.concatenate([arr,arr],axis=1)

array([[ 0.,  1.,  2.,  3.,  0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.,  4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.,  8.,  9., 10., 11.]])

In [18]:
# 
s1=pd.Series([0,1],index=['a','b'])
s2=pd.Series([2,3,4],index=['c','d','e'])
s3=pd.Series([5,6],index=['f','g'])
# 
pd.concat([s1,s2,s3],axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [19]:
pd.concat([s1,s2,s3],axis=1,sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [20]:
# 创建层次化索引
# keys参数
pd.concat([s1,s1,s3],keys=['one','two','three'])
# pd.concat([s1,s1,s3],keys=['one','two','three'],axis=1)

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [21]:
df1=pd.DataFrame(5+np.arange(6).reshape((3,2)),index=['a','b','c'],columns=['one','two'])
df2=pd.DataFrame(3+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'],sort=False)
# or
pd.concat({'level1':df1,'level2':df2},axis=1,sort=False)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,5,6,3.0,4.0
b,7,8,,
c,9,10,5.0,6.0


In [22]:
# 忽略行索引，该索引可能在实际的工作中是不需要的
df3=pd.DataFrame(np.random.randn(3,4),columns=list('abcd'),index=np.arange(3))
df4=pd.DataFrame(np.random.randn(2,3),columns=list('abd'),index=np.arange(2))

pd.concat([df3,df4],sort=True)

Unnamed: 0,a,b,c,d
0,-0.050589,0.459769,0.098093,-1.001316
1,-1.56491,0.116441,-0.03879,-0.660888
2,0.691756,0.166855,-0.516512,-0.422665
0,1.09548,0.162867,,2.087382
1,0.423961,0.28196,,-0.276796


In [23]:
# 忽略行索引，该索引可能在实际的工作中是不需要的
# 不保留连接轴上的索引，为连接后的dataframe生成新的索引
pd.concat([df3,df4],ignore_index=True,sort=True)

Unnamed: 0,a,b,c,d
0,-0.050589,0.459769,0.098093,-1.001316
1,-1.56491,0.116441,-0.03879,-0.660888
2,0.691756,0.166855,-0.516512,-0.422665
3,1.09548,0.162867,,2.087382
4,0.423961,0.28196,,-0.276796


In [24]:
# 数据整合
# where函数用于表达一种矢量化的if-else
# combine_first

#可以理解为使用参数对象的数据为调用对象的缺失数据打补丁 


from pandas import Series,DataFrame

a=Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=list('fedcba'))
b=Series(np.arange(len(a)),index=list('fedcba'),dtype=np.float64)
b[-1]=np.nan

In [25]:
np.where(pd.isnull(a),b,a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [26]:
# 
a.combine_first(b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [27]:
# 重塑与轴向旋转
data=DataFrame(np.arange(6).reshape((2,3)),
               index=pd.Index(['Ohio','Nevada'],name='state'),
               columns=pd.Index(['one','two','three'],name='number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Nevada,3,4,5


In [28]:
res=data.stack()
res

state   number
Ohio    one       0
        two       1
        three     2
Nevada  one       3
        two       4
        three     5
dtype: int32

In [29]:
# unstack方法重新编排
res.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Nevada,3,4,5


In [30]:
# 长格式和宽格式的转换
ldata=pd.read_csv("examples/ldata.csv")
ldata

Unnamed: 0,date,item,value
0,1959/3/31 0:00,realgdp,2170.349
1,1959/3/31 0:00,infl,0.0
2,1959/3/31 0:00,unemp,5.8
3,1959/6/30 0:00,realgdp,2778.801
4,1959/6/30 0:00,infl,2.34
5,1959/6/30 0:00,unemp,5.1
6,1959/9/30 0:00,realgdp,2775.488
7,1959/9/30 0:00,infl,2.74
8,1959/9/30 0:00,unemp,5.3
9,1959/12/31 0:00,realgdp,2785.204


In [31]:
ldata.pivot('date','item','value')

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959/12/31 0:00,1951.7724,2785.204,2230.7744
1959/3/31 0:00,0.0,2170.349,5.8
1959/6/30 0:00,2.34,2778.801,5.1
1959/9/30 0:00,2.74,2775.488,5.3


In [32]:
ldata['vaule2']=np.random.randn(len(ldata))
ldata

Unnamed: 0,date,item,value,vaule2
0,1959/3/31 0:00,realgdp,2170.349,0.825507
1,1959/3/31 0:00,infl,0.0,-0.975444
2,1959/3/31 0:00,unemp,5.8,2.441564
3,1959/6/30 0:00,realgdp,2778.801,-0.948134
4,1959/6/30 0:00,infl,2.34,0.838849
5,1959/6/30 0:00,unemp,5.1,0.69767
6,1959/9/30 0:00,realgdp,2775.488,-0.57911
7,1959/9/30 0:00,infl,2.74,0.255305
8,1959/9/30 0:00,unemp,5.3,-1.409221
9,1959/12/31 0:00,realgdp,2785.204,0.730097


In [33]:
pivoted=ldata.pivot('date','item')
pivoted

Unnamed: 0_level_0,value,value,value,vaule2,vaule2,vaule2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959/12/31 0:00,1951.7724,2785.204,2230.7744,0.630002,0.730097,0.962858
1959/3/31 0:00,0.0,2170.349,5.8,-0.975444,0.825507,2.441564
1959/6/30 0:00,2.34,2778.801,5.1,0.838849,-0.948134,0.69767
1959/9/30 0:00,2.74,2775.488,5.3,0.255305,-0.57911,-1.409221


In [34]:
pivoted['value'][:3]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959/12/31 0:00,1951.7724,2785.204,2230.7744
1959/3/31 0:00,0.0,2170.349,5.8
1959/6/30 0:00,2.34,2778.801,5.1


In [35]:
# 移除重复的数据
df4=DataFrame({'k1':['one']*2+['two']*4+['one'],'k2':[1,1,2,3,3,4,1]})
df4

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,two,2
3,two,3
4,two,3
5,two,4
6,one,1


In [36]:
# 重复项
df4.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [37]:
# 去除重复项
df4.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,two,2
3,two,3
5,two,4


In [38]:
df4['v']=np.arange(7)
# 指定部分列进行重复判断，并去除
df4.drop_duplicates(subset=['k1'],keep='first')

Unnamed: 0,k1,k2,v
0,one,1,0
2,two,2,2


In [39]:
# 利用函数或者隐射进行数据转换
dt=DataFrame({'food':['bacon','pulled pork','bacon','pastrami','corned beef','Bacon','Pastrami','honey ham','nova lox'],
             'ounces':[4,3,12,6,7.5,8,3,5,6]})
dt

# 需要添加的列
meat_to_animal={
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}

In [40]:
# 通过map方法接受一个函数或者含有映射关系的字典对象
dt['food'].map(str.lower)

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [41]:
# dt['animal']=dt['food'].map(str.lower).map(meat_to_animal)
# dt

# another
dt['animal']=dt['food'].map(lambda x:meat_to_animal[x.lower()])
dt

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,Pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [42]:
series=Series(meat_to_animal,name='series')

In [43]:
dt1=dt[['food','ounces']]
dt1

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,Pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [44]:
# dt1.join(series,on='food')
# 均可实现上述的效果
dt1.merge(series,how='left',left_on='food',right_index=True)

Unnamed: 0,food,ounces,series
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,
6,Pastrami,3.0,
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [45]:
# 周索引重命名
dt=DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
dt.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [46]:
# inplace参数 就地修改
dt.rename(index={'Ohio':'Indiana'},inplace=True)
dt

Unnamed: 0,one,two,three,four
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [47]:
# 面元划分
ages=[20,22,25,27,21,23,37,31,61,56,40]
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (25, 35], (60, 100], (35, 60], (35, 60]]
Length: 11
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [48]:
cats.value_counts()

(18, 25]     5
(25, 35]     2
(35, 60]     3
(60, 100]    1
dtype: int64

In [49]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [50]:
# 异常值过滤
np.random.seed(12345)
data=DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [52]:
# 选取有一个数的绝对值大于3的行
data[(np.abs(data)>3).any(1)]


Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [55]:
data[np.abs(data)>3]=np.sign(data)*3

In [63]:
df=DataFrame(np.arange(10*4).reshape((10,4)))
# permutation用于产生一个范围内的无序的数组
sampler=np.random.permutation(5)
sampler

array([3, 2, 0, 1, 4])

In [61]:
# shuffle-->混洗
s=np.arange(10)
np.random.shuffle(s)
s

array([6, 7, 1, 0, 5, 3, 9, 8, 2, 4])

In [64]:
# take函数，获取dataframe某几行/列的数据
df.take(sampler,axis=0)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7
4,16,17,18,19


In [67]:
# ##############################################
# ##################哑变量######################
# ##############################################
# 哑变量是指如果dataframe(m*n)的某一列/多列有多中不同的值k，则针对该列的数据，
# 可以衍生出一个m*k的(0,1)矩阵表示这列值出现在dataframe的m行的位置

# e.g
# 对于列值为单值的情况
df=DataFrame({'k1':list('bbacab'),'data':np.arange(6)})
dummies=pd.get_dummies(df['k1'],prefix='k1')
# ??
# df['data']类型为Series,df[['data']]的类型为dataframe
# https://blog.csdn.net/The_Time_Runner/article/details/84555323
df_dummies=df[['data']].join(dummies)
df_dummies

Unnamed: 0,data,k1_a,k1_b,k1_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [69]:
# genres有多个取值，根据genres的值生成哑变量矩阵
mnames=['movie_id','title','genres']
movies=pd.read_csv('examples/movies.csv',names=mnames)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,5,Father of the Bride Part II (1995),Comedy
6,6,Heat (1995),Action|Crime|Thriller
7,7,Sabrina (1995),Comedy|Romance
8,8,Tom and Huck (1995),Adventure|Children
9,9,Sudden Death (1995),Action


In [81]:
# 获取genres的迭代列表
genres_iter=(set(x.split('|')) for x in movies.genres)
# set不支持下标操作，可以将set转为list操作
# 获取genres的唯一值得集合
genres=set.union(*genres_iter)
# 构建哑变量矩阵的全0矩阵
genres_dummies=DataFrame(np.zeros((movies.size,len(genres))),columns=genres)
# 构建genres的哑变量矩阵
for i,gen in enumerate(movies.genres):
    genres_dummies.loc[i,gen.split('|')]=1 

In [82]:
movies_windic=movies.join(genres_dummies.add_prefix('Genre_'))

In [87]:
movies_windic.loc[1]

movie_id                                                              1
title                                                  Toy Story (1995)
genres                      Adventure|Animation|Children|Comedy|Fantasy
Genre_Fantasy                                                         1
Genre_Documentary                                                     0
Genre_Adventure                                                       1
Genre_Children                                                        1
Genre_Thriller                                                        0
Genre_War                                                             0
Genre_Sci-Fi                                                          0
Genre_Romance                                                         0
Genre_Horror                                                          0
Genre_Animation                                                       1
Genre_genres                                                    

In [88]:
type(genres_iter)

generator

In [89]:
# 正则表达式
import re

In [93]:
text='foo  bar\t baz  \tqux'
re.split('\s+',text)
# or
regex=re.compile('\s+')
regex.split(text)

# regex.findall(text)

['  ', '\t ', '  \t']

In [114]:
# 注意match,findall,search的区别

# ?: 忽略
pattern=r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex=re.compile(pattern,flags=re.IGNORECASE)
m=regex.match('jinghongpeng@sogou-inc.com')
print(m.groups())

('jinghongpeng', 'sogou-inc', 'com')


In [115]:
text="""
Mark jinghongpeng@sogou-inc.com
Lee  dav@gmail.com
Hunagli@163.com
"""
# findall，search方法可以从文本的任何一个地方进行正则匹配
# findall返回所有的匹配结果，search只返回第一个
regex.findall(text)

[('jinghongpeng', 'sogou-inc', 'com'),
 ('dav', 'gmail', 'com'),
 ('Hunagli', '163', 'com')]

In [126]:
m=regex.search(text)
m.groups()[0]
text[m.start():m.end()]

'jinghongpeng@sogou-inc.com'

In [132]:
# sub函数用于替换，
# \1,\2可用来表示分组
m=regex.sub(r'username:\1, domain:\2, suffix:\3',text)

In [138]:
# 矢量化的字符串函数
data=Series({'Dave':'dava@gmail.com','Alice':'alice@hotmail.com','mark':'mark@163.com','wst':np.nan})
s=data.str
s

<pandas.core.strings.StringMethods at 0x13d9dead9b0>

In [145]:
matches=s.match('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})',flags=re.IGNORECASE)
matches

Dave     True
Alice    True
mark     True
wst       NaN
dtype: object