In [2]:
import pandas as pd

In [3]:
# Load the HR dataset into a DataFrame. The path is relative, so it is resolved against the
# directory the notebook kernel is running in.
df = pd.read_csv('../Data/HRData.csv')

### Find String type columns in the DataFrame

In [6]:
# Keep only the columns stored with the 'object' dtype (the dtype pandas reports for the
# text columns of this frame) and preview the first two rows.
df.select_dtypes(include='object').head(n=2)

Unnamed: 0,Employee_Name,Position,State,DOB,Sex,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,DateofHire,DateofTermination,TermReason,EmploymentStatus,Department,ManagerName,RecruitmentSource,PerformanceScore,LastPerformanceReview_Date
0,"Adinolfi, Wilson K",Production Technician I,MA,07/10/83,M,Single,US Citizen,No,White,7/5/2011,,N/A-StillEmployed,Active,Production,Michael Albert,LinkedIn,Exceeds,1/17/2019
1,"Ait Sidi, Karthikeyan",Sr. DBA,MA,05/05/75,M,Married,US Citizen,No,White,3/30/2015,6/16/2016,career change,Voluntarily Terminated,IT/IS,Simon Roup,Indeed,Fully Meets,2/24/2016


In [7]:
# List just the names of the columns whose dtype is 'object'
df.select_dtypes(include='object').columns

Index(['Employee_Name', 'Position', 'State', 'DOB', 'Sex', 'MaritalDesc',
       'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire',
       'DateofTermination', 'TermReason', 'EmploymentStatus', 'Department',
       'ManagerName', 'RecruitmentSource', 'PerformanceScore',
       'LastPerformanceReview_Date'],
      dtype='object')

### The String Accessor Class from a Series 

In [41]:
# Selecting a single column from the DataFrame gives a Series; this one holds string values
# and is reported with dtype 'object'.
df['RecruitmentSource']

0               LinkedIn
1                 Indeed
2               LinkedIn
3                 Indeed
4          Google Search
             ...        
306             LinkedIn
307        Google Search
308    Employee Referral
309    Employee Referral
310             LinkedIn
Name: RecruitmentSource, Length: 311, dtype: object

In [42]:
# Trying to upper-case every value by calling upper() directly on the column fails:
# df['RecruitmentSource'] is a Series, and a Series does not have string methods of its own.
# The AttributeError is the point of this cell, so it is caught and displayed; that way a
# "Restart Kernel -> Run All" still reaches the cells below.
try:
    df['RecruitmentSource'].upper()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Series' object has no attribute 'upper'

In [43]:
# To run a string operation on a Series, go through its `.str` accessor.
# The accessor evaluates to a StringMethods object, which is what exposes the string operations.
df['RecruitmentSource'].str

<pandas.core.strings.accessor.StringMethods at 0x11a1a4fa0>

In [44]:
# String functions are called on the `.str` accessor, and the call returns a Series
# (here: every value converted to upper case).
df['RecruitmentSource'].str.upper()

0               LINKEDIN
1                 INDEED
2               LINKEDIN
3                 INDEED
4          GOOGLE SEARCH
             ...        
306             LINKEDIN
307        GOOGLE SEARCH
308    EMPLOYEE REFERRAL
309    EMPLOYEE REFERRAL
310             LINKEDIN
Name: RecruitmentSource, Length: 311, dtype: object

In [45]:
# Same pattern for lower case: call lower() through the `.str` accessor; the result is again a Series
df['RecruitmentSource'].str.lower()

0               linkedin
1                 indeed
2               linkedin
3                 indeed
4          google search
             ...        
306             linkedin
307        google search
308    employee referral
309    employee referral
310             linkedin
Name: RecruitmentSource, Length: 311, dtype: object

In [46]:
# startswith is a Boolean test: the result is a Series of True/False values, one per row,
# saying whether that row's value starts with 'G'
df['RecruitmentSource'].str.startswith('G')

0      False
1      False
2      False
3      False
4       True
       ...  
306    False
307     True
308    False
309    False
310    False
Name: RecruitmentSource, Length: 311, dtype: bool

In [14]:
# Goal: find the records whose value starts with 'employee', ignoring case.
# Chaining startswith straight after lower() fails, because lower() hands back a plain Series
# and string functions are only reachable through the `.str` accessor.
# The AttributeError is the demonstration, so it is caught and displayed instead of
# stopping a "Restart Kernel -> Run All".
try:
    df['RecruitmentSource'].str.lower().startswith('employee')
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Series' object has no attribute 'startswith'

In [15]:
# lower() returns a plain Series, so `.str` has to be applied again before startswith can be chained
df['RecruitmentSource'].str.lower().str.startswith('employee')

0      False
1      False
2      False
3      False
4      False
       ...  
306    False
307    False
308     True
309     True
310    False
Name: RecruitmentSource, Length: 311, dtype: bool

In [16]:
# An alternative that needs only one string accessor: `str.match` tests the pattern at the
# start of each value, and case=False makes that test case-insensitive. The pattern is read as a
# regular expression, which for the plain word 'employee' is just the literal text.
# (`.apply(lambda x: x.lower().startswith('employee'))` yields the same result on this column,
# but it calls the lambda once per row in Python and raises AttributeError on a missing value,
# so the built-in `.str` methods are the better default.)
df['RecruitmentSource'].str.match('employee', case=False)

0      False
1      False
2      False
3      False
4      False
       ...  
306    False
307    False
308     True
309     True
310    False
Name: RecruitmentSource, Length: 311, dtype: bool