# Vectorized String Operations

## Introducing Pandas String Operations

In [6]:
import numpy as np
# Arithmetic on a NumPy array is applied element-wise, with no explicit loop.
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [7]:
# Plain Python list (not a NumPy array): there is no vectorized string
# method here, so each element is capitalized in an explicit loop.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [8]:
# The same loop fails as soon as the list contains a missing value, because
# None has no string methods. The error is caught and printed (rather than
# left to propagate) so the notebook still runs top to bottom.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
try:
    [s.capitalize() for s in data]
except AttributeError as e:
    print("AttributeError:", e)

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [9]:
# Wrap the same list (including the None entry) in a pandas Series.
import pandas as pd
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [10]:
# The .str accessor applies the string method to every element and passes
# the missing value through (index 2 in the output) instead of raising.
names.str.capitalize()


0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

## Tables of Pandas String Methods

In [11]:
# Sample data: a Series of full names used by the examples that follow.
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

### Methods similar to Python string methods

In [12]:
# Lower-case every name.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [13]:
# Character count of each name (the space between the words is included).
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [14]:
# Boolean mask: True where the name begins with an upper-case 'T'.
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

### Methods using regular expressions

In [15]:
# Extract the first name: the capture group matches the first run of ASCII
# letters in each string; expand=False returns a Series rather than a DataFrame.
monte.str.extract('([A-Za-z]+)', expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [16]:
# Alternative without a regex: split on whitespace and take the first token.
monte.str.split().str.get(0)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [17]:
# Regex with start (^) and end ($) anchors: keep names whose first character
# is not an upper-case vowel and whose last character is not a lower-case
# vowel. Names that do not match yield an empty list.
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

### Miscellaneous methods

In [18]:
# Vectorized slicing: the first three characters of each name.
# Equivalent to monte.str.slice(0, 3).
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [19]:
# Last name: split on whitespace and take the final token.
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [20]:
# Pair each name with an 'info' string of '|'-separated single-letter codes.
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C',
                                    'B|D', 'B|C', 'B|C|D']})
full_monte

Unnamed: 0,info,name
0,B|C|D,Graham Chapman
1,B|D,John Cleese
2,A|C,Terry Gilliam
3,B|D,Eric Idle
4,B|C,Terry Jones
5,B|C|D,Michael Palin


In [21]:
# get_dummies() splits the '|'-separated codes into one 0/1 indicator column
# per distinct code.
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


## Example: Recipe Database

In [22]:
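# One-time setup: uncomment the two commands below to download the recipe
# data and save it under the file name used in the following cells.
# !curl -O http://openrecipes.s3.amazonaws.com/openrecipes.txt
# !mv openrecipes.txt recipeitems-latest.json

# The remaining examples use this recipe data set.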

In [23]:
# A plain read_json() call fails on this file (see the message printed
# below); report the error text instead of letting the exception stop the
# notebook.
try:
    recipes = pd.read_json('recipeitems-latest.json')
except ValueError as err:
    print("ValueError:", err)

ValueError: Trailing data


In [24]:
from io import StringIO

# The file is read line by line and each line is treated as one JSON value;
# the lines are joined into a single JSON array before parsing.
# (pd.read_json(path, lines=True) is the built-in way to read such a layout.)
with open('recipeitems-latest.json', 'r', encoding='utf-8') as f:
    # Strip each line and drop empty ones, which would otherwise leave
    # stray commas in the array and make it invalid JSON.
    stripped = (line.strip() for line in f)
    json_lines = [line for line in stripped if line]
# Reformat so each line is an element of one JSON list
data_json = "[{0}]".format(','.join(json_lines))
# Parse from a file-like object: newer pandas releases deprecate passing a
# literal JSON string to read_json.
recipes = pd.read_json(StringIO(data_json))

In [25]:
# (rows, columns) of the parsed table.
recipes.shape

(1042, 9)

In [26]:
# Inspect the first row to see which fields are available.
recipes.iloc[0]

cookTime                                                        PT
datePublished                                           2013-04-01
description      Got leftover Easter eggs?    Got leftover East...
image            http://static.thepioneerwoman.com/cooking/file...
ingredients      12 whole Hard Boiled Eggs\n1/2 cup Mayonnaise\...
name                                      Easter Leftover Sandwich
prepTime                                                     PT15M
recipeYield                                                      8
url              http://thepioneerwoman.com/cooking/2013/04/eas...
Name: 0, dtype: object

In [27]:
# Summary statistics of the length (in characters) of the ingredient strings.
recipes.ingredients.str.len().describe()

count    1042.000000
mean      358.645873
std       187.332133
min        22.000000
25%       246.250000
50%       338.000000
75%       440.000000
max      3160.000000
Name: ingredients, dtype: float64

In [28]:
# Name of the recipe with the longest ingredient string.
# idxmax() returns the index *label* of the maximum and .loc looks the name up
# by that label. np.argmax() on a Series has returned either a label or a
# position depending on the pandas version (and emitted a warning), so the
# explicit label-based form is used instead.
recipes.name.loc[recipes.ingredients.str.len().idxmax()]

  return getattr(obj, method)(*args, **kwds)


'A Nice Berry Pie'

In [29]:
# Count the recipes whose description mentions "breakfast" or "Breakfast".
recipes.description.str.contains('[Bb]reakfast').sum()

11

In [30]:
# Count the recipes whose ingredients mention "cinnamon" or "Cinnamon".
recipes.ingredients.str.contains('[Cc]innamon').sum()

79

In [31]:
# Same count for the single-'n' spelling "cinamon" / "Cinamon".
recipes.ingredients.str.contains('[Cc]inamon').sum()

0

### A simple recipe recommender

In [32]:
# Spices to search for in each recipe's ingredient text.
spice_list = [
    'salt', 'pepper', 'oregano', 'sage', 'parsley',
    'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin',
]

In [33]:
import re

# Build a Boolean DataFrame with one column per spice: True where the spice
# name appears in the recipe's ingredient text, ignoring case.
# The flag must be passed as `flags=`: the second positional parameter of
# str.contains() is `case`, so a positional re.IGNORECASE would be read as
# case=True and the match would stay case-sensitive.
spice_df = pd.DataFrame({
    spice: recipes.ingredients.str.contains(spice, flags=re.IGNORECASE)
    for spice in spice_list
})
spice_df.head()

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [34]:
# Two equivalent ways to keep the rows where 'cumin' is True.
# Only the value of the last expression in the cell is displayed.
spice_df.query('cumin == True').head()
# or
spice_df[spice_df['cumin'] == True].head()

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
556,True,False,False,False,True,False,False,False,False,False
559,True,False,False,False,False,False,False,True,False,False
570,True,True,False,False,True,False,False,True,False,False
579,True,False,True,False,False,False,False,True,False,False
581,True,False,False,False,False,False,False,True,False,False


In [35]:
# Rows where the 'paprika' column is True, and how many there are.
selection = spice_df.query('paprika')
len(selection)

16

In [36]:
# Look up the names of the selected recipes by their index labels.
recipes.loc[selection.index, 'name']

568                      Roasted Tomato Soup Recipe
579                  Butterscotch Calypso Bean Soup
583                    Bourbon Baked Sweet Potatoes
585                   Berry Beer Baked Beans Recipe
586                  Green Rice with Smoked Paprika
625     Borlotti Bean Mole with Roast Winter Squash
637                  Pappardelle with Spiced Butter
651           Lively Up Yourself Lentil Soup Recipe
664                   A Quartet of Compound Butters
691                              Feisty Green Beans
733                Vegetarian Split Pea Soup Recipe
773                    Broccoli Cheddar Soup Recipe
794                                     Magic Sauce
898                                Baked Egg Recipe
1010          Herb Jam with Olives and Lemon Recipe
1036                       Liptauer Cheese Crostini
Name: name, dtype: object