## String Object Methods

In [19]:
import pandas as pd
import numpy as np

In [1]:
# Python's built-in string methods are enough for many string munging and
# scripting tasks; here a comma-separated string is broken apart with split.
val = 'a, b, guido'
val.split(sep=',')

['a', ' b', ' guido']

In [2]:
# Pair split with strip so surrounding whitespace (including line breaks)
# is trimmed from every piece.
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [3]:
# Unpack the three pieces and concatenate them by hand with a '::' delimiter
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [4]:
# Calling join on the delimiter builds the same '::'-separated string in one call
'::'.join(pieces)

'a::b::guido'

In [5]:
# The `in` keyword is the simplest way to test for a substring;
# str.index and str.find can also be used to locate one.
'guido' in val

True

In [8]:
# find returns -1 when the substring is not present
val.find('c')

-1

In [9]:
val.count(',')  # returns the number of occurrences of ',' in val

2

In [10]:
# replace substitutes every occurrence of one substring with another
val.replace(',', '::')

'a:: b:: guido'

## Regular Expressions

In [1]:
import re

In [4]:
# Sample text whose words are separated by varying runs of spaces and tabs
text = 'foo     bar\t baz    \tqux'

In [7]:
# Split on runs of one or more whitespace characters. The pattern is a raw
# string so the backslash in \s reaches the regex engine untouched instead of
# being parsed as an (invalid) string escape sequence.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [8]:
# You can compile the regex yourself with re.compile, forming a reusable regex object.
# The pattern is a raw string so the backslash in \s is not parsed as an
# (invalid) string escape sequence.
regex = re.compile(r'\s+')

In [9]:
# The compiled regex object provides split as a method
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [10]:
# findall returns a list of every substring that matches the regex
# (here, the whitespace runs between the words)
regex.findall(text)

['     ', '\t ', '    \t']

In [11]:
# Multi-line sample text (a name followed by an email address on each line)
# and a pattern intended to match email addresses.
# The character classes list only upper-case letters, so matching the
# lower-case addresses relies on the IGNORECASE flag used when compiling.
text = """Dave dave@google.com
    Steve steve@gmail.com
    Rob rob@gmail.com
    Ryan ryan@yahoo.com
    """
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [12]:
# re.IGNORECASE makes the regex case-insensitive, which lets the
# upper-case-only character classes in `pattern` match lower-case addresses
regex = re.compile(pattern, flags=re.IGNORECASE)

In [13]:
# findall on the text produces a list of all the email addresses
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

__`search` returns a special match object for the first email address in the text.__
__For the preceding regex, the match object can only tell us the start and end positions of the pattern in the string.__

In [14]:
# search stops at the first match and returns a match object describing it
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [16]:
# Slicing the text with the match's start and end offsets recovers the matched address
text[m.start():m.end()]

'dave@google.com'

In [18]:
# sub returns a new string with each occurrence of the pattern replaced by
# the given text; print is used so the embedded newlines render as line breaks
print(regex.sub('REDACTED', text))

Dave REDACTED
    Steve REDACTED
    Rob REDACTED
    Ryan REDACTED
    


## Vectorized String Functions in pandas

In [20]:
# Build the Series in a single step so `data` is only ever bound to a Series
# (never to an intermediate dict of a different type).
# np.nan marks the missing email address for Wes.
data = pd.Series({'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
                  'Rob': 'rob@gmail.com', 'Wes': np.nan})

In [21]:
# Display the Series; the missing address for Wes shows as NaN
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [22]:
# isnull flags the missing entry (True only for Wes)
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [29]:
# Check whether each address contains 'gmail' via the str accessor;
# the missing value stays NaN instead of becoming False
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [38]:
# Same email pattern as before, now with three parenthesised capture groups
# (the part before '@', the part between '@' and the last dot, and the suffix),
# compiled case-insensitively
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [49]:
# findall applied element-wise: each non-missing entry becomes a list holding
# one tuple of the captured groups per match; the missing entry stays NaN
match = data.str.findall(regex)
match

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [40]:
# str.match tests each string against the regex and yields booleans
# (NaN for the missing value); it does not return the captured groups.
# Element retrieval from the findall result (str.get or indexing into the
# str attribute) is demonstrated in the cells that follow.
matches = data.str.match(regex)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [48]:
# Vectorized element retrieval: pass a position to str.get or index into the
# str attribute to access elements in the embedded lists.
# NOTE: each list produced by findall here holds a single tuple, so position 1
# does not exist and every row comes back NaN; position 0 holds the tuple.
match.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [54]:
# Indexing into the str attribute: position 0 pulls the single tuple out of each list
match.str[0]

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object