In [4]:
import numpy as np
import pandas as pd

In [5]:
# NumPy 向量式運算
x = np.array([2, 3, 5, 7, 11, 13])
print(x * 2)

# NumPy 沒有提供字串的向量式運算
data = ['peter', 'Paul', 'MARY', 'gUIDO']
print([s.capitalize() for s in data])

# 如果有空值會報錯 較為麻煩
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
print([s if s is None else s.capitalize() for s in data])


[ 4  6 10 14 22 26]
['Peter', 'Paul', 'Mary', 'Guido']
['Peter', 'Paul', None, 'Mary', 'Guido']


In [6]:
# Pandas 支援字串的向量化操作
names = pd.Series(data)
print(names.str.capitalize())

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object


In [7]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terru Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

print(monte.str.lower())
print(monte.str.len())
print(monte.str.startswith('T'))
print(monte.str.split())

0    graham chapman
1       john cleese
2     terru gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object
0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64
0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool
0    [Graham, Chapman]
1       [John, Cleese]
2     [Terru, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object


In [8]:
print(monte.str.extract('([A-Za-z]+)', expand=False)) # expand 回傳dataframe 還是series
print(monte.str.findall(r'^[^AEIOU].*[^aeiou]$'))


0     Graham
1       John
2      Terru
3       Eric
4      Terry
5    Michael
dtype: object
0    [Graham Chapman]
1                  []
2     [Terru Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object


In [9]:
# slice() 與 pandas.str的直接切片一樣
print(monte.str[0:3])
print(monte.str.slice(0, 3))

# 也可指定元素
print(monte.str.split().str[-1])

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object
0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object
0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object


In [10]:
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C',
                                    'B|D', 'B|C', 'B|C|D']})
print(full_monte)

# 使用 get_dummies()
print(full_monte['info'].str.get_dummies('|'))

             name   info
0  Graham Chapman  B|C|D
1     John Cleese    B|D
2   Terru Gilliam    A|C
3       Eric Idle    B|D
4     Terry Jones    B|C
5   Michael Palin  B|C|D
   A  B  C  D
0  0  1  1  1
1  0  1  0  1
2  1  0  1  0
3  0  1  0  1
4  0  1  1  0
5  0  1  1  1


In [11]:
# 用pd 讀取json檔 每一行都是一個 json項目 lines=True
recipes = pd.read_json('data/recipeitems.json', lines=True)
print(recipes.shape)

# 看其中一列
print(recipes.iloc[0])

# 看食材欄位的字串數量統計量
print(recipes.ingredients.str.len().describe())

(173278, 17)
_id                                {'$oid': '5160756b96cc62079cc2db15'}
name                                    Drop Biscuits and Sausage Gravy
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
url                   http://thepioneerwoman.com/cooking/2013/03/dro...
image                 http://static.thepioneerwoman.com/cooking/file...
ts                                             {'$date': 1365276011104}
cookTime                                                          PT30M
source                                                  thepioneerwoman
recipeYield                                                          12
datePublished                                                2013-03-11
prepTime                                                          PT10M
description           Late Saturday afternoon, after Marlboro Man ha...
totalTime                                                           NaN
creator                                            

In [12]:
# 檢視max 的資料
print(recipes.name[np.argmax(recipes.ingredients.str.len())])

# 檢視有多少資料是早餐的食譜 使用regex 包含大小寫
print(recipes.description.str.contains('[Bb]reakfast').sum())

# 檢視有多少食譜使用肉桂 使用regex 包含大小寫
print(recipes.ingredients.str.contains('[Cc]innamon').sum())

# 還可以檢視有沒有肉桂拼錯的
print(recipes.ingredients.str.contains('[Cc]inamon').sum())


Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots
3524
10526
11


In [13]:
# 定義想查詢的食材內容
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

# 建立食材的布林dataframe
import re
spice_df = pd.DataFrame({spice: recipes.ingredients.str.contains(spice, re.IGNORECASE)
                         for spice in spice_list})
print(spice_df.head())

# 可以拿來查詢是否包含特定食材
selection = spice_df.query('parsley & paprika & tarragon')
print(len(selection))

# 把查詢結果當作索引 檢視食譜
print(recipes.name[selection.index])


    salt  pepper  oregano   sage  parsley  rosemary  tarragon  thyme  paprika  \
0  False   False    False   True    False     False     False  False    False   
1  False   False    False  False    False     False     False  False    False   
2   True    True    False  False    False     False     False  False    False   
3  False   False    False  False    False     False     False  False    False   
4  False   False    False  False    False     False     False  False    False   

   cumin  
0  False  
1  False  
2   True  
3  False  
4  False  
10
2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 S