In [1]:
import pandas as pd
import numpy as np
%config Completer.use_jedi=False

## Text data types
---

In [2]:
# With no dtype given, a Series of Python strings is stored with the generic object dtype.
pd.Series(list('abcde'))

0    a
1    b
2    c
3    d
4    e
dtype: object

In [3]:
# Passing the 'string' alias requests the dedicated string dtype instead of object.
pd.Series(list('abcde'), dtype='string')

0    a
1    b
2    c
3    d
4    e
dtype: string

In [4]:
# pd.StringDtype() is the explicit dtype object; the resulting dtype is displayed as "string".
pd.Series(["a", "b", "c"], dtype=pd.StringDtype())

0    a
1    b
2    c
dtype: string

In [6]:
# An object-dtype Series can also be converted after construction with astype.
pd.Series(list('abcde')).astype('string')

0    a
1    b
2    c
3    d
4    e
dtype: string

In [12]:
# Non-string inputs are converted on construction: the integer is shown as text
# and np.nan becomes the <NA> missing-value marker.
s = pd.Series(data=["a", 2, np.nan], dtype=pd.StringDtype())
# Look the missing entry up by its label (2) and show which type represents it.
print(type(s.loc[2]))
s

<class 'pandas._libs.missing.NAType'>


0       a
1       2
2    <NA>
dtype: string

## Behavior differences between `string` and `object` dtype
---

In [13]:
# On the string dtype, count() returns nullable Int64 and the missing entry stays <NA>.
s = pd.Series(["a", None, "b"], dtype="string")
s.str.count("a")

0       1
1    <NA>
2       0
dtype: Int64

In [14]:
# Same data held as object dtype: the result is float64 and the missing entry is NaN.
s2 = pd.Series(["a", None, "b"], dtype="object")
s2.str.count("a")

0    1.0
1    NaN
2    0.0
dtype: float64

## String methods
---

In [15]:
# Sample strings with mixed case and one missing value.
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
print(s)
# Lower-casing is applied element-wise; the missing entry stays <NA>.
s.str.lower()

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [16]:
# Lengths come back as nullable Int64, leaving the missing entry as <NA>.
s.str.len()

0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

In [17]:
# The .str accessor also works on an Index.
idx = pd.Index([" jack", "jill ", " jesse ", "frank"])
# Show whitespace stripped from both sides, from the left only, and from the right only.
print(idx.str.strip(), idx.str.lstrip(), idx.str.rstrip(), sep="\n")

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')
Index(['jack', 'jill ', 'jesse ', 'frank'], dtype='object')
Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')


## Splitting and replacing strings
---

In [18]:
# Underscore-delimited strings plus one missing value, used as input for the split examples.
s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string")
s2

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [19]:
# split() turns each string into a list of pieces; the result is an object-dtype Series.
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [20]:
# .str.get(1) picks the second element out of each split list.
s2.str.split('_').str.get(1)

0       b
1       d
2    <NA>
3       g
dtype: object

In [23]:
# Bracket indexing on .str gives the same result here as .str.get(1).
s2.str.split('_').str[1]

0       b
1       d
2    <NA>
3       g
dtype: object

### It is easy to expand this to return a DataFrame using `expand=True`.

In [28]:
# expand=True spreads the pieces across DataFrame columns instead of returning lists.
s2.str.split('_', expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [32]:
# n=1 splits on the first separator only, so the remainder stays intact in the second column.
s2.str.split("_", expand=True, n=1)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [33]:
# rsplit works from the right-hand end: with n=1 only the last separator is split on.
s2.str.rsplit("_", expand=True, n=1)

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


In [34]:
# Sample strings that include both an empty string and a missing value.
s3 = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6    <NA>
7    CABA
8     dog
9     cat
dtype: string

In [37]:
# Case-insensitive regex replace: the pattern matches either any character followed by
# "a" at the start of the string, or "dog". Single letters, the empty string and the
# missing value are left unchanged.
s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6        <NA>
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: string

## Concatenation
---

In [38]:
# Four-element string Series used as the left-hand side of the concatenation examples.
s = pd.Series(["a", "b", "c", "d"], dtype="string")
# A bare assignment displays nothing; end the cell with the name so the Series is shown.
s

0    a
1    b
2    c
3    d
dtype: string

In [39]:
# With no other object supplied, cat() joins all elements into one Python string using sep.
s.str.cat(sep='-')

'a-b-c-d'

In [40]:
# Omitting sep joins the elements with nothing between them.
s.str.cat()

'abcd'

## Concatenating a Series and something list-like into a Series
---

In [41]:
# Given a list with one item per element of s, cat() concatenates element-wise and returns a Series.
s.str.cat(["A", "B", "C", "D"])


0    aA
1    bB
2    cC
3    dD
dtype: string

In [43]:
# A missing value in the other Series makes the corresponding concatenated element missing.
t = pd.Series(["a", "b", np.nan, "d"], dtype="string")
s.str.cat(t)

0      aa
1      bb
2    <NA>
3      dd
dtype: string

In [44]:
# na_rep substitutes a placeholder for missing values, so row 2 becomes "c-" instead of <NA>.
s.str.cat(t, na_rep='-')

0    aa
1    bb
2    c-
3    dd
dtype: string

In [46]:
# Place t and s side by side as the two columns of a DataFrame.
d = pd.concat(objs=[t, s], axis="columns")
# Show both inputs followed by the combined frame, one after another.
print(t, s, d, sep="\n")

0       a
1       b
2    <NA>
3       d
dtype: string
0    a
1    b
2    c
3    d
dtype: string
      0  1
0     a  a
1     b  b
2  <NA>  c
3     d  d


In [47]:
# A DataFrame is accepted too: each of its columns is appended to s, with na_rep
# standing in for the missing cell.
s.str.cat(d, na_rep="-")

0    aaa
1    bbb
2    c-c
3    ddd
dtype: string

## Concatenating a Series and many objects into a Series
---

In [48]:
# Display the current contents of s for reference.
s

0    a
1    b
2    c
3    d
dtype: string

In [49]:
# Same label-to-letter pairs as s, but stored in a shuffled index order.
u = pd.Series(["b", "d", "a", "c"], index=[1, 3, 0, 2], dtype="string")
u

1    b
3    d
0    a
2    c
dtype: string

In [50]:
# u is a Series, so its values are matched to s by index label; u.to_numpy() has no
# index and is combined by position.
s.str.cat([u, u.to_numpy()], join="left")

0    aab
1    bbd
2    cca
3    ddc
dtype: string

In [51]:
# v has labels -1 and 4, which s lacks, and has no label 2.
v = pd.Series(["z", "a", "b", "d", "e"], index=[-1, 0, 1, 3, 4], dtype="string")
v

-1    z
 0    a
 1    b
 3    d
 4    e
dtype: string

In [52]:
# join="outer" keeps every label that appears in any of the indexes; na_rep fills
# the positions where an operand has no value for that label.
s.str.cat([v, u, u.to_numpy()], join="outer", na_rep="-")

-1    -z--
 0    aaab
 1    bbbd
 2    c-ca
 3    dddc
 4    -e--
dtype: string

## Indexing with .str
---

In [56]:
# Rebind s to the mixed-case sample (with one missing value) for the indexing examples.
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [58]:
# Indexing .str with an integer takes the character at that position from each string.
s.str[0]

0       A
1       B
2       C
3       A
4       B
5    <NA>
6       C
7       d
8       c
dtype: string

In [59]:
# Strings that are too short to have a character at position 1 yield <NA>.
s.str[1]

0    <NA>
1    <NA>
2    <NA>
3       a
4       a
5    <NA>
6       A
7       o
8       a
dtype: string

## Extracting substrings
---

In [61]:
# NOTE(review): despite its name, df is bound to a Series, not a DataFrame.
df = pd.Series(["a1", "b2", "c3"], dtype="string")
print(df)
# Two capture groups produce a two-column result; "c3" does not match the
# pattern, so its row holds missing values.
df.str.extract(r"([ab])(\d)", expand=False)

0    a1
1    b2
2    c3
dtype: string


Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [62]:
# Named capture groups become the column names of the result.
pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(
        r"(?P<letter>[ab])(?P<digit>\d)", expand=False
    )

Unnamed: 0,letter,digit
0,a,1.0
1,b,2.0
2,,


## Extract all matches in each subject (extractall)
---

In [65]:
# "a1a2" contains two letter/digit pairs; the other two strings contain one each.
s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], dtype="string")
s

A    a1a2
B      b1
C      c1
dtype: string

In [66]:
# Pattern with two named groups: one lowercase letter followed by one digit.
two_groups = "(?P<letter>[a-z])(?P<digit>[0-9])"
# extract() keeps only the first match per element, so the second pair in "a1a2" is not returned.
s.str.extract(two_groups, expand=True)

Unnamed: 0,letter,digit
A,a,1
B,b,1
C,c,1


In [67]:
# extractall() returns every match: one row per match, indexed by the original
# label plus a "match" number.
s.str.extractall(two_groups)

Unnamed: 0_level_0,Unnamed: 1_level_0,letter,digit
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,a,1
A,1,a,2
B,0,b,1
C,0,c,1


## Testing for strings that match or contain a pattern
---

In [69]:
# One digit immediately followed by one lowercase letter.
pattern = r"[0-9][a-z]"
# contains() is True when the pattern is found anywhere in the string.
pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.contains(pattern)

0    False
1    False
2     True
3     True
4     True
5     True
dtype: boolean

In [70]:
# match() only tests from the start of the string: "03c" is False because its
# first two characters are both digits.
pd.Series(
        ["1", "2", "3a", "3b", "03c", "4dx"],
        dtype="string",
    ).str.match(pattern)

0    False
1    False
2     True
3     True
4    False
5     True
dtype: boolean

In [71]:
# fullmatch() requires the entire string to match: "4dx" is False because of
# the trailing "x".
pd.Series(
        ["1", "2", "3a", "3b", "03c", "4dx"],
        dtype="string",
    ).str.fullmatch(pattern)

0    False
1    False
2     True
3     True
4    False
5    False
dtype: boolean

## Creating indicator variables
---

In [72]:
# Each element holds one or more "|"-separated labels, or is missing.
s = pd.Series(["a", "a|b", np.nan, "a|c"], dtype="string")
s

0       a
1     a|b
2    <NA>
3     a|c
dtype: string

In [73]:
# One 0/1 indicator column per distinct label; the missing row is all zeros.
s.str.get_dummies(sep="|")

Unnamed: 0,a,b,c
0,1,0,0
1,1,1,0
2,0,0,0
3,1,0,1


In [74]:
# On an Index, get_dummies returns a MultiIndex whose level names are the labels.
idx = pd.Index(["a", "a|b", np.nan, "a|c"])
idx.str.get_dummies(sep="|")

MultiIndex([(1, 0, 0),
            (1, 1, 0),
            (0, 0, 0),
            (1, 0, 1)],
           names=['a', 'b', 'c'])