# Part 4: String Operations (Slicing)

In [4]:
import polars as pl
# Cap how many rows a table shows when it is displayed
pl.Config.set_tbl_rows(10)

# Read the inspections CSV; the bare `df` on the last line displays the frame
df = pl.read_csv('data/chicago_food_inspections.csv')
df

Name,Risk
str,str
"""PRAIRIELAND MO…","""Risk 1 (High)"""
"""EL RUISENOR DA…","""Risk 1 (High)"""
"""KING CRAB HOUS…","""Risk 1 (High)"""
"""CATERPILLAR CA…","""Risk 1 (High)"""
"""HEALTHY SUBSTA…","""Risk 3 (Low)"""
…,…
"""Cafe 608""","""Risk 1 (High)"""
"""TEMPO CAFE""","""Risk 1 (High)"""
"""MICHAEL'S ON M…","""Risk 1 (High)"""
"""DUNKIN DONUTS/…","""Risk 2 (Medium…"


## Series and DataFrame

In [5]:
# Square-bracket access with a single column name returns a polars Series.
type(df['Name'])
# Swap in the line below to compare with the type that select() returns.
# type(df.select(pl.col('Name')))

polars.series.series.Series

## Change letter case

In [6]:
# Lower-case every restaurant name (the result is a Series, not a DataFrame)
df.get_column('Name').str.to_lowercase()

Name
str
"""prairieland mo…"
"""el ruisenor da…"
"""king crab hous…"
"""caterpillar ca…"
"""healthy substa…"
…
"""cafe 608"""
"""tempo cafe"""
"""michael's on m…"
"""dunkin donuts/…"


In [7]:
# Title-case every restaurant name (first letter of each word upper-cased)
df.get_column('Name').str.to_titlecase()

Name
str
"""Prairieland Mo…"
"""El Ruisenor Da…"
"""King Crab Hous…"
"""Caterpillar Ca…"
"""Healthy Substa…"
…
"""Cafe 608"""
"""Tempo Cafe"""
"""Michael's On M…"
"""Dunkin Donuts/…"


In [57]:
# How many inspections fall into each risk category?
#
# Group on the raw column. De-duplicating with .unique() *before* grouping
# would leave one row per category and make every count 1.
# To only list the distinct categories, use: df.select(pl.col('Risk')).unique()
(
    df
    .group_by('Risk')
    .count()  # one row per category; null values form their own group
)

Risk,count
str,u32
"""Risk 2 (Medium…",48593
"""Risk 3 (Low)""",21481
"""Risk 1 (High)""",190031
,78
"""All""",52


## Replacing values

In [8]:
# Preview: rename the 'All' category to 'Risk 4 (Extreme)' and list the labels
(
    df
    .select('Risk')      # work on the Risk column only
    .drop_nulls()        # discard rows with no risk label
    .with_columns(
        pl.col('Risk').str.replace_all('All', 'Risk 4 (Extreme)')
    )
    .unique()            # one row per distinct label after the rename
)

Risk
str
"""Risk 3 (Low)"""
"""Risk 4 (Extrem…"
"""Risk 1 (High)"""
"""Risk 2 (Medium…"


## Slicing

In [9]:
# Rebind `df` to a cleaned, Risk-only frame: nulls removed and 'All' renamed.
# Note that the Name column is no longer part of `df` after this cell runs.
df = df.select(
    pl.col('Risk')
    .drop_nulls()
    .str.replace_all('All', 'Risk 4 (Extreme)')
)

In [23]:
# Pull the risk digit out of each label, e.g. the '1' in 'Risk 1 (High)'
(
    df
    .select(
        pl.col('Risk')
        .str.slice(5, 1)         # the single character at offset 5
        .str.extract(r'(\d)')    # keep the digit captured from that character
    )
)

Risk
str
"""1"""
"""1"""
"""1"""
"""1"""
"""3"""
…
"""1"""
"""1"""
"""1"""
"""2"""


In [36]:
# What if we wanted the actual classification?
#
# Capture the word between the parentheses instead of listing the labels by
# hand, so 'Risk 4 (Extreme)' yields 'Extreme' rather than null. Values with
# no parenthesised word still produce null.
(
    df
    .with_columns(
        pl.col('Risk').str.extract(r'\((\w+)\)').alias('Risk_')
    )
)


Risk,Risk_
str,str
"""Risk 1 (High)""","""High"""
"""Risk 1 (High)""","""High"""
"""Risk 1 (High)""","""High"""
"""Risk 1 (High)""","""High"""
"""Risk 3 (Low)""","""Low"""
…,…
"""Risk 1 (High)""","""High"""
"""Risk 1 (High)""","""High"""
"""Risk 1 (High)""","""High"""
"""Risk 2 (Medium…","""Medium"""


## Boolean Methods

In [37]:
# Python's `in` operator checks whether the left string occurs inside the right one
'Pizza' in "Jet's Pizza"

True

In [39]:
# `in` compares case-sensitively, so lowercase 'pizza' is not found in "Jet's Pizza"
'pizza' in "Jet's Pizza"

False

In [49]:
# Find restaurants whose name contains 'king' (case-insensitive substring match)
df = pl.read_csv('data/chicago_food_inspections.csv')

# Passing this expression to select() would only return a True/False column,
# which is not very helpful; filter() keeps the rows where it is True.
(
    df
    .filter(
        pl.col('Name')
        .str.to_lowercase()      # normalise case before matching
        .str.contains('king')
    )
)

Name,Risk
str,str
"""KING CRAB HOUS…","""Risk 1 (High)"""
"""TACO BURRITO K…","""Risk 1 (High)"""
"""KING SWEETS""","""Risk 1 (High)"""
"""KING CRAB HOUS…","""Risk 1 (High)"""
"""BURGER KING""","""Risk 2 (Medium…"
…,…
"""BURGER KING #7…","""Risk 2 (Medium…"
"""PEORIA PACKING…","""Risk 2 (Medium…"
"""BURGER KING""","""Risk 2 (Medium…"
"""PARK PACKING""","""Risk 2 (Medium…"


In [50]:
# Keep the rows whose name begins with 'taco', ignoring case
(
    df
    .filter(
        pl.col('Name')
        .str.to_lowercase()
        .str.starts_with('taco')
    )
)

Name,Risk
str,str
"""TACO BURRITO K…","""Risk 1 (High)"""
"""TACOS EL TIO #…","""Risk 1 (High)"""
"""TACO MORO""","""Risk 1 (High)"""
"""TACO MORO""","""Risk 1 (High)"""
"""TACO HOUSE""","""Risk 1 (High)"""
…,…
"""TACO FRESCO""","""Risk 1 (High)"""
"""TACO BELL""","""Risk 1 (High)"""
"""TACO EL JALICI…","""Risk 1 (High)"""
"""TACO CHINO""","""Risk 1 (High)"""


In [51]:
# Keep the rows whose name finishes with 'taco', ignoring case
(
    df
    .filter(
        pl.col('Name')
        .str.to_lowercase()
        .str.ends_with('taco')
    )
)

Name,Risk
str,str
"""MESON DEL TACO…","""Risk 1 (High)"""
"""PEPE'S TACO""","""Risk 1 (High)"""
"""QUERETACO""","""Risk 2 (Medium…"
"""GUAPO TACO""","""Risk 1 (High)"""
"""SU TAQUERIA EL…","""Risk 1 (High)"""
…,…
"""CYBER TACO""","""Risk 1 (High)"""
"""PEPE TACO""","""Risk 1 (High)"""
"""ROCKIN TACO""","""Risk 1 (High)"""
"""CYBER TACO""","""Risk 1 (High)"""
