In [1]:
## Working with String Data

import pandas as pd
import numpy as np


In [3]:
# Build a small Series of animal names.
string_data = pd.Series(["Ant", "Bull", "Cat", "Dog", "Elephant"])
# No dtype was requested, so pandas stores the strings as generic Python
# objects and reports dtype "object".
string_data

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

In [4]:
# The .str accessor applies a string method to every element of the Series;
# lower() returns a new Series with each string lower-cased.
string_data.str.lower()

0         ant
1        bull
2         cat
3         dog
4    elephant
dtype: object

In [5]:
# upper() returns a new Series with each string upper-cased
string_data.str.upper()

0         ANT
1        BULL
2         CAT
3         DOG
4    ELEPHANT
dtype: object

In [7]:
# swapcase() flips the case of every character (upper -> lower, lower -> upper)
string_data.str.swapcase()

0         aNT
1        bULL
2         cAT
3         dOG
4    eLEPHANT
dtype: object

In [8]:
# The original Series is unchanged: the .str calls return new Series
string_data

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

In [9]:
# Keep the lower-cased result under its own name so it can be reused below
string_data_lower = string_data.str.lower()

string_data_lower

0         ant
1        bull
2         cat
3         dog
4    elephant
dtype: object

In [10]:
# capitalize() upper-cases the first character of each string
string_data_lower.str.capitalize()

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

In [11]:
# len() returns the number of characters in each string
string_data_lower.str.len()

0    3
1    4
2    3
3    3
4    8
dtype: int64

In [13]:
# A second example: strings with surrounding blanks, punctuation and digits,
# plus one missing value.
string_data = pd.Series(
    [
        "string one",
        " String-Two ",
        "  String:three ",
        np.nan,
        "#453",
        "number%57",
    ]
)
string_data

0         string one
1        String-Two 
2      String:three 
3                NaN
4               #453
5          number%57
dtype: object

In [14]:
# Non-letter characters are left as they are and NaN stays NaN
string_data.str.upper()

0         STRING ONE
1        STRING-TWO 
2      STRING:THREE 
3                NaN
4               #453
5          NUMBER%57
dtype: object

In [15]:
# capitalize() upper-cases only the first character of each string and
# lower-cases the rest. When that first character is a blank (rows 1 and 2),
# no letter ends up capitalised.
string_data.str.upper().str.capitalize()

0         String one
1        string-two 
2      string:three 
3                NaN
4               #453
5          Number%57
dtype: object

In [16]:
# Lengths include leading/trailing blanks; the NaN entry stays NaN, which is
# why the result is float64 rather than int64
string_data.str.len()

0    10.0
1    12.0
2    15.0
3     NaN
4     4.0
5     9.0
dtype: float64

In [17]:
# strip() removes leading and trailing whitespace from each string
stripped_data = string_data.str.strip()

stripped_data

0      string one
1      String-Two
2    String:three
3             NaN
4            #453
5       number%57
dtype: object

In [18]:
# Verify the strip by checking the length of each stripped string
stripped_data.str.len()

0    10.0
1    10.0
2    12.0
3     NaN
4     4.0
5     9.0
dtype: float64

In [19]:
# isnumeric() is True only when every character of a string is numeric;
# none of these strings qualifies, and NaN stays NaN

string_data.str.isnumeric()


0    False
1    False
2    False
3      NaN
4    False
5    False
dtype: object

In [20]:
# One int alongside several strings: with mixed element types the Series
# gets dtype object.
data = pd.Series([1, "#2", "3", "4", "ant"])
data

0      1
1     #2
2      3
3      4
4    ant
dtype: object

In [21]:
# Only str elements are tested: the integer 1 yields NaN, while '3' and '4'
# consist solely of digits and yield True
data.str.isnumeric()

0      NaN
1    False
2     True
3     True
4    False
dtype: object

In [22]:
# With no other sequence given, cat() joins all elements into one string
# using the separator; the NaN entry is skipped
string_data.str.cat(sep=' | ')

'string one |  String-Two  |   String:three  | #453 | number%57'

In [25]:
# Element-wise concatenation with a second list of equal length;
# na_rep supplies the text used in place of missing values
string_data.str.cat(['A','B','C','D','E','F'],
                   na_rep='_')

0         string oneA
1        String-Two B
2      String:three C
3                  _D
4               #453E
5          number%57F
dtype: object

In [26]:
##
## Advanced Operations on Strings
##

In [27]:
# Check, for each string, whether it contains a particular character
string_data.str.contains('#')

0    False
1    False
2    False
3      NaN
4     True
5    False
dtype: object

In [28]:
# The pattern is a regular expression, so '|' means "any of these alternatives"
string_data.str.contains('#|%|:thr')

0    False
1    False
2     True
3      NaN
4     True
5     True
dtype: object

In [29]:
# ':thy' occurs nowhere, so row 2 no longer matches; '#' and '%' still do
string_data.str.contains('#|%|:thy')

0    False
1    False
2    False
3      NaN
4     True
5     True
dtype: object

In [30]:
# find() returns the index of the first 'e' in each string,
# or -1 when the string contains no 'e'
string_data.str.find('e')

0     9.0
1    -1.0
2    12.0
3     NaN
4    -1.0
5     4.0
dtype: float64

In [31]:
# find() works for substrings too, returning the index where the match starts
string_data.str.find('ring')

0    2.0
1    3.0
2    4.0
3    NaN
4   -1.0
5   -1.0
dtype: float64

In [32]:
# findall() returns, for each string, a list of every match
# (an empty list when there is none)
string_data.str.findall('e')

0       [e]
1        []
2    [e, e]
3       NaN
4        []
5       [e]
dtype: object

In [33]:
# Regular expression: one or more consecutive upper-case letters
string_data.str.findall('[A-Z]+')

0        []
1    [S, T]
2       [S]
3       NaN
4        []
5        []
dtype: object

In [34]:
# Replace every occurrence of the literal character 'e' with '*'.
# regex=False states explicitly that 'e' is plain text, so the result does
# not depend on the installed pandas version's default for `regex`.
string_data.str.replace('e', '*', regex=False)

0         string on*
1        String-Two 
2      String:thr** 
3                NaN
4               #453
5          numb*r%57
dtype: object

In [35]:
# Regular expression matching runs of one or more lower-case letters
pattern = r'[a-z]+'

In [36]:
# Replace each run of lower-case letters with '###'.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as
# literal text by default, which would leave every string unchanged here.
string_data.str.replace(pattern, '###', regex=True)

0        ### ###
1     S###-T### 
2      S###:### 
3            NaN
4           #453
5         ###%57
dtype: object

In [37]:
def replacement(x):
    """Return the text matched by the regex match object ``x`` in upper case.

    Intended as the callable replacement for ``Series.str.replace``: it is
    called once per match and its return value is substituted for that match.
    """
    # group(0) is the entire matched substring
    return x.group(0).upper()

In [38]:
# Upper-case each run of lower-case letters via the callable replacement.
# regex=True is required: a callable replacement is only accepted for regex
# patterns, and pandas 2.0+ defaults to regex=False (raising ValueError).
string_data.str.replace(pattern, replacement, regex=True)

0         STRING ONE
1        STRING-TWO 
2      STRING:THREE 
3                NaN
4               #453
5          NUMBER%57
dtype: object

In [39]:
# Count the blanks in each string
string_data.str.count(' ')

0    1.0
1    2.0
2    3.0
3    NaN
4    0.0
5    0.0
dtype: float64

In [41]:
# Repeat every string 3 times
# An operation on NaN still yields NaN
string_data.str.repeat(3)

0                   string onestring onestring one
1              String-Two  String-Two  String-Two 
2      String:three   String:three   String:three 
3                                              NaN
4                                     #453#453#453
5                      number%57number%57number%57
dtype: object