# Vectorized String Operations

In [1]:
import numpy as np
import pandas as pd

## Introducing Pandas String Operations

In [2]:
# NumPy arithmetic is vectorized: the multiplication below is applied to every
# element without an explicit Python loop. `np` comes from the imports cell at
# the top of the notebook, so it is not re-imported here.
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [3]:
# Plain Python has no vectorized string operations, so each string has to be
# capitalized one element at a time.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
list(map(str.capitalize, data))

['Peter', 'Paul', 'Mary', 'Guido']

In [4]:
# The same element-by-element approach breaks as soon as a value is missing:
# None has no .capitalize() method. The failure is the point of this cell, so
# it is caught and printed rather than left to abort a Restart & Run All.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
try:
    [s.capitalize() for s in data]
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [5]:
# Wrap the list (including its None entry) in a Series so the vectorized .str
# accessor can be used on it. `pd` comes from the imports cell at the top of
# the notebook, so it is not re-imported here.
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [6]:
# Capitalize every element through the .str accessor; the missing entry is
# passed through as None instead of raising.
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

## Tables of Pandas String Methods

In [7]:
# Sample Series of full names used by the examples that follow.
monte = pd.Series(
    [
        'Graham Chapman',
        'John Cleese',
        'Terry Gilliam',
        'Eric Idle',
        'Terry Jones',
        'Michael Palin',
    ]
)

    Methods similar to Python string methods

    len() lower() translate() islower()
    ljust() upper() startswith() isupper()
    rjust() find() endswith() isnumeric()
    center() rfind() isalnum() isdecimal()
    zfill() index() isalpha() split()
    strip() rindex() isdigit() rsplit()
    rstrip() capitalize() isspace() partition()
    lstrip() swapcase() istitle() rpartition()

In [8]:
# Lower-case every name; the result is a new Series of strings.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [10]:
# Boolean Series: True where the name begins with 'T'.
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

### Methods using regular expressions

    Table 3-4. Mapping between Pandas methods and functions in Python’s re module
    
    Method     Description
    match()    Call re.match() on each element, returning a Boolean.
    extract()  Call re.search() on each element, returning matched groups as strings.
    findall()  Call re.findall() on each element.
    replace()  Replace occurrences of pattern with some other string.
    contains() Call re.search() on each element, returning a Boolean.
    count()    Count occurrences of pattern.
    split()    Equivalent to str.split(), but accepts regexps.
    rsplit()   Equivalent to str.rsplit(), but accepts regexps.

#### Extracting the first name from each element

In [11]:
# Capture the leading run of ASCII letters of each element (the first name).
# expand=True is spelled out so it is clear why the result is a one-column
# DataFrame rather than a Series.
monte.str.extract('([A-Za-z]+)', expand=True)

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


#### Finding all names that start and end with a consonant

In [12]:
# Keep only names whose first and last characters are both non-vowels.
# Each negated class lists the vowels in both cases: with one-case classes a
# name starting with a lowercase vowel or ending with an uppercase vowel would
# slip through. Elements that do not match yield an empty list.
# Note: "non-vowel" still admits digits, spaces and punctuation, not only
# consonants.
monte.str.findall(r'^[^AEIOUaeiou].*[^AEIOUaeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

## Miscellaneous methods

    Table 3-5. Other Pandas string methods

    Method          Description
    get()           Index each element
    slice()         Slice each element
    slice_replace() Replace slice in each element with passed value
    cat()           Concatenate strings
    repeat()        Repeat values
    normalize()     Return Unicode form of string
    pad()           Add whitespace to left, right, or both sides of strings
    wrap()          Split long strings into lines with length less than a given width
    join()          Join strings in each element of the Series with passed separator
    get_dummies()   Extract dummy variables as a DataFrame

## Vectorized item access and slicing

In [13]:
# Slice each element with ordinary Python indexing syntax on the .str
# accessor: the first three characters of every name.
monte.str[:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [14]:
# Split each name on whitespace, then index the last piece of every resulting
# list to obtain the last name.
monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

## Indicator variables

In [16]:
# Pair each name with a '|'-delimited string of coded indicators.
full_monte = pd.DataFrame(
    {
        'name': monte,
        'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
    }
)
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [17]:
# Split the 'info' codes on '|' and expand them into one 0/1 indicator column
# per distinct code.
full_monte['info'].str.get_dummies(sep='|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1
