# Section 6: Working with Text Data

### Intro to the Working with Text Data Section

In [1]:
import pandas as pd

In [3]:
emp = pd.read_csv('./data/chicago.csv')

In [4]:
emp.info() 
#we can see the memory usage is pretty high so we can change the dtype of the department
#column to category which will reduce some memory overhead

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [7]:
#find out how many unique values there are in each column
emp.nunique()
#only 35 departments

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [10]:
emp['Department'] = emp['Department'].astype('category')
emp.info()
# the memory usage has now decreesed from 1002kb to 784kb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.4+ KB


### Common String methods - .lower(), .upper(), .title(), len()

In [11]:
emp = pd.read_csv('./data/chicago.csv')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [16]:
"HELLO".lower(), 'hello'.upper(), 'hELLO world'.title(), len('HeLlO')

('hello', 'HELLO', 'Hello World', 5)

In [21]:
#employing these string methods on an etire row is a bit different than you would think
emp['Name'].str.title()
#need the .str after the column name
#doesn't actually change it in the initial df
#we need to overwrite the existing column
emp['Position Title'] = emp['Position Title'].str.title()

In [22]:
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [23]:
#find the length of each cell in a column
emp['Department'].str.len()

0        11.0
1         6.0
2         6.0
3        16.0
4        11.0
         ... 
32058     6.0
32059     6.0
32060     6.0
32061     4.0
32062     NaN
Name: Department, Length: 32063, dtype: float64

### Use the str.replace() method to replace all occurrences of a character with another

In [37]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [38]:
#this is how it normally works in python
'Hello World'.replace(' ', '_')

'Hello_World'

In [39]:
#works slightly different in pandas
#want to replace MGNT with Management
emp['Department'] = emp['Department'].str.replace('MGMNT', 'MANAGEMENT')

In [40]:
emp.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [41]:
#lets convert the salary column from strings to floats
emp['Employee Annual Salary'] = emp['Employee Annual Salary'].str.replace('$','').astype(float)
#drops the $sign, converts it to a float, and then overwites the annual salary column in the initial df
emp.head()
#if there were commas in the dollar amount we can just call another .str.replace(',','') before the .astype(float)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [47]:
#now we can do math methods on the annual salaray column
emp['Employee Annual Salary'].sum()

2571506375.36

In [44]:
emp['Employee Annual Salary'].mean()

80204.178633899

In [45]:
emp['Employee Annual Salary'].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

### Filter a DataFrame's Rows with String Methods

In [48]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [49]:
#want to extract any rows where position title contains water anywhere in the cell
#start by normalizing the text to make sure all the characters are lower cased
emp['Position Title'].str.lower().str.contains('water')
#creates a boolean series if a cell contains 'water' anywehre in that cell

0         True
1        False
2        False
3        False
4        False
         ...  
32057    False
32058    False
32059    False
32060    False
32061    False
Name: Position Title, Length: 32062, dtype: bool

In [50]:
#can pass that boolean series directly into the df to get a the desired df
emp[emp['Position Title'].str.lower().str.contains('water')]
#resulting df is still upper cased because the str.lower() was only temporary

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [52]:
#theres also the .startswith() and .endswith() methods that can be used
emp[emp['Position Title'].str.lower().str.startswith('water')]
#here we pass in the boolean series that returns True/False whether a position title starts with water

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [54]:
emp[emp['Position Title'].str.lower().str.endswith('ist')]
#returns a new df based on the boolean series created by the endswith('ist') method

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


### More DataFrame String Methods .strip(), .lstrip(), and .rstrip()

In [55]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [56]:
#strip is used to remove whitespace from a beginning or end of a cell

In [57]:
#normal python
'     hello world      '.lstrip()

'hello world      '

In [59]:
'     hello world      '.strip() #the whitespace between the two words is still there

'hello world'

In [62]:
emp['Name'] = emp['Name'].str.strip()

In [65]:
emp['Position Title'] = emp['Position Title'].str.strip()

### Invoke String Methods on DataFrame Index and Columns

In [67]:
emp = pd.read_csv('./data/chicago.csv', index_col = 'Name').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.tail(3)
#now the name column is set to the index

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [68]:
emp.index
#returns an array of all index values which you can pass all familiar string methods

Index(['AARON,  ELVIA J', 'AARON,  JEFFERY M', 'AARON,  KARINA',
       'AARON,  KIMBERLEI R', 'ABAD JR,  VICENTE M', 'ABARCA,  ANABEL',
       'ABARCA,  EMMANUEL', 'ABASCAL,  REECE E', 'ABBASI,  CHRISTOPHER',
       'ABBATACOLA,  ROBERT J',
       ...
       'ZWIT,  JEFFREY J', 'ZWOLFER,  MATTHEW W', 'ZYCH,  MATEUSZ',
       'ZYDEK,  BRYAN', 'ZYGADLO,  JOHN P', 'ZYGADLO,  MICHAEL J',
       'ZYGOWICZ,  PETER J', 'ZYMANTAS,  MARK E', 'ZYRKOWSKI,  CARLO E',
       'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name='Name', length=32062)

In [70]:
#can call string methods on the emp.index array
emp.index.str.strip().str.title()

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [71]:
#need to reassign this back to the index
emp.index = emp.index.str.strip().str.title()
emp.index

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [73]:
#same applies to columns using the columns attribute
emp.columns = emp.columns.str.upper()
emp.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


### Split Strings by Characters with the str.split() method

In [74]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [77]:
#how its used in normal python
'Hello my name is joel'.split(' '), 'you,are,awesome'.split(',')
#splits the string based on the delimiter and returns a list of strings

(['Hello', 'my', 'name', 'is', 'joel'], ['you', 'are', 'awesome'])

In [84]:
#lets say we want to find the most common last name.... we'll need to split the names column
emp['Name'].str.split(',')
#each index is now a commna separated list.
#now we need additional method to extract a specific component of the list
#we use the .str.get() method and pass in the 0 index to get the last name
emp['Name'].str.split(',').str.get(0).str.title().value_counts()
#returns new series with last names as the index and the count of each last name as the value

Williams    293
Johnson     244
Smith       241
Brown       185
Jones       183
           ... 
Tatar         1
Dunmars       1
Goudie        1
Pullen        1
Adamow        1
Name: Name, Length: 13829, dtype: int64

In [90]:
emp['Position Title'].str.split(' ')
#gives us a n length item list for each index

0                        [WATER, RATE, TAKER]
1                           [POLICE, OFFICER]
2                           [POLICE, OFFICER]
3                [CHIEF, CONTRACT, EXPEDITER]
4                       [CIVIL, ENGINEER, IV]
                         ...                 
32057    [FRM, OF, MACHINISTS, -, AUTOMOTIVE]
32058                       [POLICE, OFFICER]
32059                       [POLICE, OFFICER]
32060                       [POLICE, OFFICER]
32061            [CHIEF, DATA, BASE, ANALYST]
Name: Position Title, Length: 32062, dtype: object

In [92]:
emp['Position Title'].str.title().str.split(' ').str.get(0)

0         Water
1        Police
2        Police
3         Chief
4         Civil
          ...  
32057       Frm
32058    Police
32059    Police
32060    Police
32061     Chief
Name: Position Title, Length: 32062, dtype: object

In [93]:
emp['Position Title'].str.title().str.split(' ').str.get(0).value_counts()

Police             10856
Firefighter-Emt     1509
Sergeant            1186
Pool                 918
Firefighter          810
                   ...  
Assoc                  1
Mayor                  1
Hate                   1
Caps                   1
Photographer           1
Name: Position Title, Length: 320, dtype: int64

### More Practice with the str.split() method on a Series

In [94]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [95]:
emp['Name'].str.split(',').str.get(0)
#new series with only last names

0            AARON
1            AARON
2            AARON
3            AARON
4          ABAD JR
           ...    
32057      ZYGADLO
32058     ZYGOWICZ
32059     ZYMANTAS
32060    ZYRKOWSKI
32061    ZYSKOWSKI
Name: Name, Length: 32062, dtype: object

In [103]:
#want to get most common first names
#some names have an initial after them, some do not so its a bit trickier
#there is whitespace between the comma and the name characters so we need to strip it out.
emp['Name'].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).value_counts()

MICHAEL     1153
JOHN         899
JAMES        676
ROBERT       622
JOSEPH       537
            ... 
MUDHAR         1
JOYLANDA       1
SOCRATES       1
DIEM           1
ASHFORD        1
Name: Name, Length: 5091, dtype: int64

### Exploring the expand and n Parameters of the str.split() method

In [104]:
emp = pd.read_csv('./data/chicago.csv').dropna(how = 'all')
emp['Department'] = emp['Department'].astype('category')
emp.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [105]:
emp['Name'].str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [106]:
emp['Name'].str.split(',', expand = True)
#setting expand = True returns a new df rather than a series

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


In [107]:
#can pass this output df back into the intial df
emp[['First Name', 'Last Name']] = emp['Name'].str.split(',',expand = True)
#now the initial df will have first name and last name columns on the far right of the df

In [108]:
emp.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [110]:
emp['Position Title'].str.split(' ', expand = True)
#kind of a weird output df.  there must be a title with 9 values in it.

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [112]:
#lets limit the number of splits by setting n = 2.
#n is the number of splits so there will be n+1 number of columns
emp['Position Title'].str.split(' ', expand = True, n = 2)

Unnamed: 0,0,1,2
0,WATER,RATE,TAKER
1,POLICE,OFFICER,
2,POLICE,OFFICER,
3,CHIEF,CONTRACT,EXPEDITER
4,CIVIL,ENGINEER,IV
...,...,...,...
32057,FRM,OF,MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER,
32059,POLICE,OFFICER,
32060,POLICE,OFFICER,


In [113]:
emp['Position Title'].str.split(' ', expand = True, n = 1)
#here there will only be 1 split of the column so the return will be a new df with 2 columns

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
...,...,...
32057,FRM,OF MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER
32059,POLICE,OFFICER
32060,POLICE,OFFICER


In [115]:
emp[['First Title Word', 'Remaining Words']] = emp['Position Title'].str.split(' ', expand = True, n = 1)
emp.head()
#assigned the values of the split to new columns at the far right of the df.

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
