# Data Cleaning And Preparation Part_1

## How To Handle Missing Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.isnull?

In [3]:
pd.notnull?

In [4]:
pd.Series.fillna?

In [5]:
pd.Series.dropna?

In [6]:
ser_miss = pd.Series(['Python', 'Java', 'C', 'Ruby', np.nan])
ser_miss

0    Python
1      Java
2         C
3      Ruby
4       NaN
dtype: object

In [7]:
ser_miss.isnull()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [8]:
ser_miss[0] = None
ser_miss

0    None
1    Java
2       C
3    Ruby
4     NaN
dtype: object

In [9]:
ser_miss.isnull()

0     True
1    False
2    False
3    False
4     True
dtype: bool

## Filtering the Missing Values

In [10]:
import pandas as pd
import numpy as np

In [11]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data_df = pd.DataFrame([[np.nan, 2, np.nan, 0], 
                      [3, 4, 5, 1], 
                      [np.nan, np.nan, np.nan, 5], 
                      [np.nan, 3, np.nan, 4]], 
                      columns=list('ABCD'))
data_df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,5.0,1
2,,,,5
3,,3.0,,4


In [15]:
data_df.dropna()

Unnamed: 0,A,B,C,D
1,3.0,4.0,5.0,1


In [16]:
pd.DataFrame.dropna?

In [17]:
data_df.loc[2, 'D']= np.nan
data_df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,5.0,1.0
2,,,,
3,,3.0,,4.0


In [18]:
data_df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,5.0,1.0
3,,3.0,,4.0


In [19]:
data_df['E'] = np.nan
data_df

Unnamed: 0,A,B,C,D,E
0,,2.0,,0.0,
1,3.0,4.0,5.0,1.0,
2,,,,,
3,,3.0,,4.0,


In [20]:
data_df.dropna(how='all', axis=1)

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,5.0,1.0
2,,,,
3,,3.0,,4.0


In [21]:
print(data_df)
data_df.dropna(thresh=1)

     A    B    C    D   E
0  NaN  2.0  NaN  0.0 NaN
1  3.0  4.0  5.0  1.0 NaN
2  NaN  NaN  NaN  NaN NaN
3  NaN  3.0  NaN  4.0 NaN


Unnamed: 0,A,B,C,D,E
0,,2.0,,0.0,
1,3.0,4.0,5.0,1.0,
3,,3.0,,4.0,


In [22]:
print(data_df)
data_df.dropna(thresh=2)

     A    B    C    D   E
0  NaN  2.0  NaN  0.0 NaN
1  3.0  4.0  5.0  1.0 NaN
2  NaN  NaN  NaN  NaN NaN
3  NaN  3.0  NaN  4.0 NaN


Unnamed: 0,A,B,C,D,E
0,,2.0,,0.0,
1,3.0,4.0,5.0,1.0,
3,,3.0,,4.0,


In [23]:
print(data_df)
data_df.dropna(thresh=4)

     A    B    C    D   E
0  NaN  2.0  NaN  0.0 NaN
1  3.0  4.0  5.0  1.0 NaN
2  NaN  NaN  NaN  NaN NaN
3  NaN  3.0  NaN  4.0 NaN


Unnamed: 0,A,B,C,D,E
1,3.0,4.0,5.0,1.0,


#### Filling The Missing Values

In [24]:
print(data_df)

     A    B    C    D   E
0  NaN  2.0  NaN  0.0 NaN
1  3.0  4.0  5.0  1.0 NaN
2  NaN  NaN  NaN  NaN NaN
3  NaN  3.0  NaN  4.0 NaN


In [25]:
data_df.fillna(50)

Unnamed: 0,A,B,C,D,E
0,50.0,2.0,50.0,0.0,50.0
1,3.0,4.0,5.0,1.0,50.0
2,50.0,50.0,50.0,50.0,50.0
3,50.0,3.0,50.0,4.0,50.0


In [26]:
data_df.fillna({'A':20, 'B':50, 'C':40, 'D':60, 'E':70})

Unnamed: 0,A,B,C,D,E
0,20.0,2.0,40.0,0.0,70.0
1,3.0,4.0,5.0,1.0,70.0
2,20.0,50.0,40.0,60.0,70.0
3,20.0,3.0,40.0,4.0,70.0


In [27]:
data_df.fillna({'A':20, 'B':50, 'C':40, 'D':60, 'E':70}, inplace=True)

In [28]:
data_rand = pd.DataFrame(np.random.randn(5, 4))
data_rand

Unnamed: 0,0,1,2,3
0,0.498423,-0.801877,0.620696,-0.701156
1,-1.100612,-0.77665,1.027594,-0.806504
2,-0.568334,-0.270714,-0.617692,0.142333
3,-1.251983,0.49288,-1.119389,-1.422129
4,-0.480195,-0.197971,0.363322,-1.92849


In [29]:
data_rand.iloc[2:, 2]=np.nan
data_rand.iloc[:2, 3]=np.nan
data_rand

Unnamed: 0,0,1,2,3
0,0.498423,-0.801877,0.620696,
1,-1.100612,-0.77665,1.027594,
2,-0.568334,-0.270714,,0.142333
3,-1.251983,0.49288,,-1.422129
4,-0.480195,-0.197971,,-1.92849


In [30]:
# Forward-fill: propagate the last valid value in each column downwards.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
data_rand.ffill()

Unnamed: 0,0,1,2,3
0,0.498423,-0.801877,0.620696,
1,-1.100612,-0.77665,1.027594,
2,-0.568334,-0.270714,1.027594,0.142333
3,-1.251983,0.49288,1.027594,-1.422129
4,-0.480195,-0.197971,1.027594,-1.92849


In [31]:
# Backward-fill: pull the next valid value in each column upwards.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated in recent pandas.
data_rand.bfill()

Unnamed: 0,0,1,2,3
0,0.498423,-0.801877,0.620696,0.142333
1,-1.100612,-0.77665,1.027594,0.142333
2,-0.568334,-0.270714,,0.142333
3,-1.251983,0.49288,,-1.422129
4,-0.480195,-0.197971,,-1.92849


In [32]:
# Forward-fill, but fill at most 2 consecutive missing values per gap.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...), which is deprecated in recent pandas.
data_rand.ffill(limit=2)

Unnamed: 0,0,1,2,3
0,0.498423,-0.801877,0.620696,
1,-1.100612,-0.77665,1.027594,
2,-0.568334,-0.270714,1.027594,0.142333
3,-1.251983,0.49288,1.027594,-1.422129
4,-0.480195,-0.197971,,-1.92849


In [33]:
ser_miss = pd.Series([1, 2, np.nan, 4, np.nan])
ser_miss

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [34]:
ser_miss.fillna(ser_miss.mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
dtype: float64

# Data Cleaning And Preparation Part_2

## Data Transformation

#### How To Handle Duplicate Rows and Values?

In [35]:
import pandas as pd
import numpy as np

In [36]:
pd.DataFrame.duplicated?

In [37]:
pd.DataFrame.drop_duplicates?

In [38]:
# Eight-row frame for the duplicate-handling demos; rows 5 and 6 hold the
# same ('three', 4) pair, so they are the only fully duplicated rows.
dup = pd.DataFrame({
    'key1': ['one', 'three', 'one', 'three', 'one', 'three', 'three', 'two'],
    'key2': [1, 1, 2, 3, 3, 4, 4, 5],
})
dup

Unnamed: 0,key1,key2
0,one,1
1,three,1
2,one,2
3,three,3
4,one,3
5,three,4
6,three,4
7,two,5


In [39]:
dup.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [40]:
dup.drop_duplicates()

Unnamed: 0,key1,key2
0,one,1
1,three,1
2,one,2
3,three,3
4,one,3
5,three,4
7,two,5


In [41]:
dup['key3'] = np.arange(8)
dup

Unnamed: 0,key1,key2,key3
0,one,1,0
1,three,1,1
2,one,2,2
3,three,3,3
4,one,3,4
5,three,4,5
6,three,4,6
7,two,5,7


In [42]:
dup.drop_duplicates(['key2'])

Unnamed: 0,key1,key2,key3
0,one,1,0
2,one,2,2
3,three,3,3
5,three,4,5
7,two,5,7


In [43]:
dup.drop_duplicates(['key3'])

Unnamed: 0,key1,key2,key3
0,one,1,0
1,three,1,1
2,one,2,2
3,three,3,3
4,one,3,4
5,three,4,5
6,three,4,6
7,two,5,7


In [44]:
dup.drop_duplicates(['key1', 'key2'], keep='last')

Unnamed: 0,key1,key2,key3
0,one,1,0
1,three,1,1
2,one,2,2
3,three,3,3
4,one,3,4
6,three,4,6
7,two,5,7


## Data Transformation

#### How To Transform Data Using Functions and/or Mapping?

In [45]:
pd.Series.map?

In [46]:
data_tr = pd.DataFrame({'Names': ['Raja', 'vali', 'Salu', 
                                 'Balu', 'Vali', 'mali'], 
                       'Score': [4, 3, 2, 6, 5, 1,]})
data_tr

Unnamed: 0,Names,Score
0,Raja,4
1,vali,3
2,Salu,2
3,Balu,6
4,Vali,5
5,mali,1


In [47]:
match_data_tr = {'raja':'Yellow', 'vali':'Red', 'salu':'Green', 'balu':'Green', 'mali':'Dark'}
match_data_tr

{'raja': 'Yellow',
 'vali': 'Red',
 'salu': 'Green',
 'balu': 'Green',
 'mali': 'Dark'}

In [48]:
lower_str = data_tr['Names'].str.lower()
lower_str

0    raja
1    vali
2    salu
3    balu
4    vali
5    mali
Name: Names, dtype: object

In [49]:
data_tr['Color'] = lower_str.map(match_data_tr)
data_tr

Unnamed: 0,Names,Score,Color
0,Raja,4,Yellow
1,vali,3,Red
2,Salu,2,Green
3,Balu,6,Green
4,Vali,5,Red
5,mali,1,Dark


In [50]:
data_tr['Color'] = data_tr['Names'].map(lambda x: match_data_tr[x.lower()])
data_tr

Unnamed: 0,Names,Score,Color
0,Raja,4,Yellow
1,vali,3,Red
2,Salu,2,Green
3,Balu,6,Green
4,Vali,5,Red
5,mali,1,Dark


## Data Transformation

#### How To Replace Values?

In [51]:
import pandas as pd
import numpy as np

In [52]:
pd.DataFrame.replace?

In [53]:
ser_data = pd.Series([1., -9., 2., -9., -1., 3.])
ser_data

0    1.0
1   -9.0
2    2.0
3   -9.0
4   -1.0
5    3.0
dtype: float64

In [54]:
ser_data.replace(-9, np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4   -1.0
5    3.0
dtype: float64

In [55]:
ser_data.replace([-9, -1], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [56]:
ser_data.replace([-9, -1], [100, 500])

0      1.0
1    100.0
2      2.0
3    100.0
4    500.0
5      3.0
dtype: float64

In [57]:
ser_data.replace({-9:50, -1:60})

0     1.0
1    50.0
2     2.0
3    50.0
4    60.0
5     3.0
dtype: float64

## Data Transformation

#### How To Rename Axis Indexes?

In [58]:
import pandas as pd
import numpy as np

In [59]:
data_tor = pd.DataFrame(np.arange(12).reshape((3, 4)), 
             index=['Apple', 'Banana', 'Grapes'], 
             columns=['one', 'two', 'three', 'four'])
data_tor

Unnamed: 0,one,two,three,four
Apple,0,1,2,3
Banana,4,5,6,7
Grapes,8,9,10,11


In [60]:
def upper(x):
    """Return the first five characters of ``x``, upper-cased.

    Used as a mapping function for index labels, e.g. 'Banana' -> 'BANAN'.
    Labels shorter than five characters are upper-cased in full.
    """
    return x[:5].upper()

data_tor.index.map(upper)

Index(['APPLE', 'BANAN', 'GRAPE'], dtype='object')

In [61]:
data_tor.index = data_tor.index.map(upper)

In [62]:
data_tor

Unnamed: 0,one,two,three,four
APPLE,0,1,2,3
BANAN,4,5,6,7
GRAPE,8,9,10,11


In [63]:
pd.DataFrame.rename?

In [64]:
data_tor.rename(index=str.lower, columns=str.title)

Unnamed: 0,One,Two,Three,Four
apple,0,1,2,3
banan,4,5,6,7
grape,8,9,10,11


In [65]:
data_tor.rename(index={'APPLE': 'Appale'}, columns={'two':2})

Unnamed: 0,one,2,three,four
Appale,0,1,2,3
BANAN,4,5,6,7
GRAPE,8,9,10,11


## Data Transformation

#### How To Discretize and/or Bin The Data?

In [66]:
import pandas as pd
import numpy as np

In [67]:
 pd.cut?

In [68]:
 pd.qcut?

In [69]:
s_m = [21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
s_m

[21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [70]:
bin_size = [20, 35, 45, 60, 100]

In [71]:
binned = pd.cut(s_m, bin_size)

In [72]:
print(s_m)
binned

[21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]


[(20, 35], (20, 35], (20, 35], (20, 35], (20, 35], ..., (20, 35], (60, 100], (35, 45], (35, 45], (20, 35]]
Length: 12
Categories (4, interval[int64, right]): [(20, 35] < (35, 45] < (45, 60] < (60, 100]]

In [73]:
binned.codes

array([0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 1, 0], dtype=int8)

In [74]:
binned.categories

IntervalIndex([(20, 35], (35, 45], (45, 60], (60, 100]], dtype='interval[int64, right]')

In [75]:
binned.value_counts()

(20, 35]     8
(35, 45]     3
(45, 60]     0
(60, 100]    1
dtype: int64

In [76]:
binned = pd.cut(s_m, bin_size, right=False)
print(binned.categories)
print(binned.value_counts())

IntervalIndex([[20, 35), [35, 45), [45, 60), [60, 100)], dtype='interval[int64, left]')
[20, 35)     8
[35, 45)     2
[45, 60)     1
[60, 100)    1
dtype: int64


In [77]:
rebinned = pd.cut(s_m, bin_size, labels=['Fail', 'Pass', 'SC', 'FCorFCD'], right=False)

In [78]:
print(rebinned.categories)
print(rebinned.value_counts())

Index(['Fail', 'Pass', 'SC', 'FCorFCD'], dtype='object')
Fail       8
Pass       2
SC         1
FCorFCD    1
dtype: int64


In [79]:
rd = np.random.rand(10)
rd

array([0.08145232, 0.30573382, 0.70350192, 0.03381431, 0.87571749,
       0.71624769, 0.67146472, 0.71720594, 0.9614142 , 0.7635396 ])

In [80]:
pd.cut(rd, 4, precision=1)

[(0.03, 0.3], (0.3, 0.5], (0.5, 0.7], (0.03, 0.3], (0.7, 1.0], (0.5, 0.7], (0.5, 0.7], (0.5, 0.7], (0.7, 1.0], (0.7, 1.0]]
Categories (4, interval[float64, right]): [(0.03, 0.3] < (0.3, 0.5] < (0.5, 0.7] < (0.7, 1.0]]

In [81]:
ran_data = np.random.randn(1000) 
ran_data[:20]

array([-1.05421185, -0.61304061, -1.42308632,  0.52611159, -1.09612013,
       -0.5179268 ,  0.28555504,  0.80009714,  0.65579676,  0.53771356,
        0.14038297, -0.07566214, -0.91772118,  0.16255117, -1.00959249,
        0.50808295,  0.89136583, -0.25175974, -0.61646623,  1.80566214])

In [82]:
quantiles_bins = pd.qcut(ran_data, 6)

In [83]:
quantiles_bins

[(-2.5189999999999997, -0.962], (-0.962, -0.416], (-2.5189999999999997, -0.962], (0.383, 0.839], (-2.5189999999999997, -0.962], ..., (0.383, 0.839], (-0.416, -0.0119], (-0.0119, 0.383], (-0.962, -0.416], (0.839, 4.003]]
Length: 1000
Categories (6, interval[float64, right]): [(-2.5189999999999997, -0.962] < (-0.962, -0.416] < (-0.416, -0.0119] < (-0.0119, 0.383] < (0.383, 0.839] < (0.839, 4.003]]

In [84]:
quantiles_bins.value_counts()

(-2.5189999999999997, -0.962]    167
(-0.962, -0.416]                 167
(-0.416, -0.0119]                166
(-0.0119, 0.383]                 167
(0.383, 0.839]                   166
(0.839, 4.003]                   167
dtype: int64

## Data Transformation

#### How To Detect and Filter Outliers?

In [85]:
import pandas as pd
import numpy as np

In [86]:
# Four columns (A-D) of ten consecutive integers each; every column starts
# 5 higher than the one before it (A: 0-9, B: 5-14, C: 10-19, D: 15-24).
ug_data = pd.DataFrame({
    label: pd.Series(np.arange(start, start + 10))
    for label, start in zip('ABCD', range(0, 20, 5))
})
ug_data

Unnamed: 0,A,B,C,D
0,0,5,10,15
1,1,6,11,16
2,2,7,12,17
3,3,8,13,18
4,4,9,14,19
5,5,10,15,20
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


In [87]:
ug_data.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,4.5,9.5,14.5,19.5
std,3.02765,3.02765,3.02765,3.02765
min,0.0,5.0,10.0,15.0
25%,2.25,7.25,12.25,17.25
50%,4.5,9.5,14.5,19.5
75%,6.75,11.75,16.75,21.75
max,9.0,14.0,19.0,24.0


In [88]:
col = ug_data['A']
col[np.abs(col) > 4]

5    5
6    6
7    7
8    8
9    9
Name: A, dtype: int32

In [89]:
# Select every row in which at least one column has an absolute value above 20.
# `axis` is passed by keyword: DataFrame.any() no longer accepts it positionally
# in pandas >= 2.0, so `.any(1)` raises a TypeError there.
ug_data[(np.abs(ug_data) > 20).any(axis=1)]

Unnamed: 0,A,B,C,D
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


## Data Transformation

#### How To Reorder and Select Randomly?

In [90]:
import pandas as pd
import numpy as np

In [91]:
np.random.permutation?

In [92]:
ran_df = pd.DataFrame(np.arange(20).reshape((5, 4)))
ran_df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [93]:
sampler = np.random.permutation(5)
sampler

array([4, 3, 2, 0, 1])

In [94]:
pd.DataFrame.take?

In [95]:
ran_df.take(sampler)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7


In [96]:
pd.DataFrame.sample?

In [97]:
ran_df.sample(n=2)

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7


In [98]:
rep = ran_df.sample(n=10, replace=True)
rep

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


## Data Transformation

#### How To Compute Indicator/Dummy Variables?

In [99]:
import pandas as pd
import numpy as np

In [100]:
pd.get_dummies?

In [101]:
data_dummies = pd.DataFrame({'key': ['b', 'a', 'a', 'c', 'a'], 'data1': range(5)})
data_dummies

Unnamed: 0,key,data1
0,b,0
1,a,1
2,a,2
3,c,3
4,a,4


In [102]:
pd.get_dummies(data_dummies['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0


In [103]:
pd.get_dummies(data_dummies['key'], prefix='Key')

Unnamed: 0,Key_a,Key_b,Key_c
0,0,1,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0


In [104]:
data_dummies_df = data_dummies[['data1']].join(pd.get_dummies(data_dummies['key'], prefix='Key'))
data_dummies_df

Unnamed: 0,data1,Key_a,Key_b,Key_c
0,0,0,1,0
1,1,1,0,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0


In [105]:
np.random.seed(42)
v = np.random.rand(10)
v

array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
       0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])

In [106]:
bins = [0.1, 0.3, 0.5, 0.7, 1.0]

In [107]:
pd.get_dummies(pd.cut(v, bins))

Unnamed: 0,"(0.1, 0.3]","(0.3, 0.5]","(0.5, 0.7]","(0.7, 1.0]"
0,0,1,0,0
1,0,0,0,1
2,0,0,0,1
3,0,0,1,0
4,1,0,0,0
5,1,0,0,0
6,0,0,0,0
7,0,0,0,1
8,0,0,1,0
9,0,0,0,1


# Data Cleaning and Preparation Part_3

## How To Manipulate Strings?

In [108]:
Python_sentence = 'python,Is, a programming, Language'
Python_sentence

'python,Is, a programming, Language'

In [109]:
str.split?

In [110]:
Python_sentence.split(sep=',')

['python', 'Is', ' a programming', ' Language']

In [111]:
cs = [x.strip() for x in Python_sentence.split(',')]
cs

['python', 'Is', 'a programming', 'Language']

In [112]:
str.join?

In [113]:
':'.join(cs)

'python:Is:a programming:Language'

In [114]:
one, two, three, four = cs
one

'python'

In [115]:
one + ':::' + two + '...>' + three + '#1' + four

'python:::Is...>a programming#1Language'

In [116]:
'python' in cs

True

In [117]:
str.index?

In [118]:
str.find?

In [119]:
print(Python_sentence)
Python_sentence.index(',')

python,Is, a programming, Language


6

In [120]:
# str.index raises ValueError when the substring is absent (str.find returns -1 instead).
# Catch and display the error so this demonstration does not abort a
# Restart & Run All and leave the remaining cells unexecuted.
try:
    Python_sentence.index(';')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: substring not found

In [None]:
Python_sentence.find(',')

In [None]:
Python_sentence.find(';')

In [None]:
Python_sentence.count(',')

In [None]:
Python_sentence.replace(',', ':')

In [None]:
Python_sentence.replace(',', '')

## How To Use Regular Expressions?

In [None]:
import re

In [None]:
text = "python Is \ta programming\t Language"
text

In [None]:
re.split?

In [None]:
# Split on runs of whitespace (spaces and tabs here).
# The pattern is a raw string: '\s' in a plain literal is an invalid escape
# sequence and triggers a warning on modern Python.
re.split(r'\s+', text)

In [None]:
# Pre-compile the whitespace pattern for reuse in the following cells.
# The pattern is a raw string: '\s' in a plain literal is an invalid escape
# sequence and triggers a warning on modern Python.
regex = re.compile(r'\s+')

In [None]:
regex.split(text)

In [None]:
re.findall?

In [None]:
regex.findall(text)

In [None]:
# Note: To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'E:\x' instead of the equivalent 'E:\\x'

In [None]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [None]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
text = """Dave dave@google.com 
Steve steve@gmail.com 
Rob rob@gmail.com 
Ryan ryan@yahoo.com"""
text

In [None]:
regex.findall(text)

In [None]:
regex.search?

In [None]:
regex.search(text)

In [None]:
regex.match?

In [None]:
print(regex.match(text))

In [None]:
regex.sub?

In [None]:
print(regex.sub('Python', text))

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [None]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('Paru@hotmail.com')
m

In [None]:
m.groups()

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

## How To Work With The Vectorized String Functions in Pandas?

In [None]:
import pandas as pd

In [None]:
maild = {'Pruthvi': 'pruthvi@google.com', 'Stella': 'stella@gmail.com', 'Roby': 'roby@gmail.com', 'Navar': np.nan}
maild

In [None]:
maild_s = pd.Series(maild)
maild_s

In [None]:
maild_s.isnull()

In [None]:
maild_s.str.contains('gmail')

In [None]:
pattern

In [None]:
pd.Series.str.findall?

In [None]:
maild_s.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
matching = maild_s.str.match(pattern, flags=re.IGNORECASE)
matching

In [None]:
pd.Series.str.get?

In [None]:
print(maild_s)
maild_s.str.get(2)

In [None]:
maild_s.str[:4]