In [2]:
import numpy as np
import pandas as pd

## Working with text Data

In [3]:
a = pd.Series(["a","b","c","d"])
a

0    a
1    b
2    c
3    d
dtype: object

In [4]:
a.str.upper()

0    A
1    B
2    C
3    D
dtype: object

In [5]:
a.astype("string")

0    a
1    b
2    c
3    d
dtype: string

In [7]:
a.astype(pd.StringDtype())

0    a
1    b
2    c
3    d
dtype: string

## Behavior Differences

These are places where the behaviour of StringDtype objects differ from object dtype
1. 

In [10]:
a[a.str.isalnum()]

0    a
1    b
2    c
3    d
dtype: object

In [11]:
#
a[a.str.match("a")]

0    a
dtype: object

In [12]:
a.str.upper()

0    A
1    B
2    C
3    D
dtype: object

In [13]:
a.str.isupper()

0    False
1    False
2    False
3    False
dtype: bool

In [None]:
a

In [15]:
a.str.startswith("a")

0     True
1    False
2    False
3    False
dtype: bool

In [16]:
a.str.title()

0    A
1    B
2    C
3    D
dtype: object

In [17]:
# Since df.columns is an index object, we can use the .str accesor
df = pd.DataFrame(np.random.randn(3,2), columns=["Column A", "Column B"], index=range(3))

In [18]:
df

Unnamed: 0,Column A,Column B
0,-0.576946,-0.63512
1,-0.708234,0.636899
2,1.373051,0.625442


In [22]:
df.columns.str.upper()

Index(['COLUMN A', 'COLUMN B'], dtype='object')

### Splitting and replacing strings

In [23]:
# Methods like split return a Series of lists

In [24]:
s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string")
s2.str.split("_")

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [26]:
s2.str.split("_").str.get(1)

0       b
1       d
2    <NA>
3       g
dtype: object

In [29]:
s2.str.split("_").str[0]

0       a
1       c
2    <NA>
3       f
dtype: object

In [30]:
## It is easy to expand this to return a DataFrame using expand
s2.str.split("_", expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


### replace by default replaces regular expressions

In [31]:
s3 = pd.Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog",
                "cat"], dtype="string")

In [35]:
s3.index

RangeIndex(start=0, stop=10, step=1)

In [37]:
s3.str.replace("^.a|dog", "XX-XX", case=False)

0          A
1          B
2          C
3    XX-XXba
4    XX-XXca
5           
6       <NA>
7    XX-XXBA
8      XX-XX
9     XX-XXt
dtype: string

In [39]:
## Some caution must be taken to keep regular expressions in mind.
## For exampl, the following code will cause trouble because of the regular
## expression meaning of $


## Consider the following badly formatted financial data
dollors = pd.Series(["12", "-$10", "$1000"], dtype="string")

## This does what you'd naively expect:
dollors.str.replace("$","")

0      12
1     -10
2    1000
dtype: string

In [40]:
## But this doesn't
dollors.str.replace("-$", "-")

0       12
1     -$10
2    $1000
dtype: string

In [41]:
## To do this we need to escape the special character (for >1 len patterns)
dollors.str.replace(r"-\$","-")

0       12
1      -10
2    $1000
dtype: string

In [42]:
## New Version 0.23.0
# str.replace(), You can set the optional regex parameter to False,
# rather than escaping escaping each character.
# In this case both pat and repl must be strings

## These lines are equivalent 
dollors.str.replace(r'-\$', "-")

0       12
1      -10
2    $1000
dtype: string

In [43]:
dollors.str.replace("-$", "-", regex=False)

0       12
1      -10
2    $1000
dtype: string

In [44]:
## The replace method can also take a callable as replacement.
## It is called on every pat using re.sub(). The callable shouldexpect
## one positional argument ( a regex object) and return a string

## Reverse every lowercase alphabetic word
pat = r'[a-z]+'

def repl(m):
    return m.group(0)[::-1]

In [47]:
pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(pat, repl)

0    oof 123
1    rab zab
2       <NA>
dtype: string

###### using regex groups

In [None]:
#pat = r"(?P<one>\w+) (?P<two>\w+) (?)


### Concatenation

There are several ways to concatenate a Series or Index, either with itself or others, all based on cat(), resp.
`Index.str.cat.`

In [57]:
s = pd.Series(["a", 'b', "c", 'd'], dtype="string")
t = ["A", "B", "C", "D"]
s.str.cat(t, join = "right")

0    aA
1    bB
2    cC
3    dD
dtype: string

#### Concatenating a Series and something array-like into a Series

In [59]:
d = pd.concat([t,s], axis=1)
d

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

### Extracting Substrings

In [60]:
pd.Series(["a1","b2", "c3"], dtype="string").str.extract(r'([ab])(\d)', expand=False)

Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [None]:
## Named groups like
pd.Series(["a1", 'b2', "c3"], dtype="string").str.extract(r'(?P<letter>))