<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_working_with_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with text

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# load data: read the Chicago employee CSV straight from GitHub
chicago_csv_url = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
chicago = pd.read_csv(chicago_csv_url)

# preview the first five rows
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
# number of rows and columns, returned as a (rows, columns) tuple
chicago.shape

(32063, 4)

In [4]:
# data types, non-null counts, and memory usage
# note: the index has 32063 entries but every column has only 32062 non-null values
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [5]:
# number of unique values for each column/Series
chicago.nunique()

# 'Department' has only 35 unique values, so the next cell converts it to category

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [6]:
# store 'Department' as a category to save memory (few distinct values, many rows)
chicago = chicago.astype({'Department': 'category'})

# memory savings
chicago.info()       # from 1002 kb to 784 kb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [7]:
# import data: one chained pipeline instead of three separate statements
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

## Common string methods
- .lower() method: convert all characters to lower case
- .upper() method: convert all characters to upper case
- .title() method: capitalize the first letter of each word and lower-case the rest; a new word starts after any non-letter character (space, hyphen, apostrophe, ...)
- len() function or df['col'].str.len(): count number of characters in a string
- for a series, need .str pre-fix
  - df['col'].str.len()
  - df['col'].str.upper()
  - df['col'].str.lower()
  - df['col'].str.title()


In [8]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [9]:
# regular python strings
sentence = 'Some Characters are in UPPER CASE others are in lower case'

# every character converted to lower case
sentence.lower()

'some characters are in upper case others are in lower case'

In [10]:
# every character converted to upper case
sentence = 'Some Characters are in UPPER CASE others are in lower case'
sentence.upper()

'SOME CHARACTERS ARE IN UPPER CASE OTHERS ARE IN LOWER CASE'

In [11]:
# title case: first letter of each word upper case, remaining letters lower case
# (a "word" is a run of letters, so it also restarts after punctuation, not only after spaces)
sentence = 'Some Characters are in UPPER CASE others are in lower case'
sentence.title()

'Some Characters Are In Upper Case Others Are In Lower Case'

In [12]:
# number of characters; len() is a built-in function, not a string method
word = 'this'
len(word)

4

In [13]:
# lower-case every value in the 'Name' column
# string methods on a Series are reached through the .str accessor
names = chicago['Name']
names.str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [14]:
# title-case 'Name' (the .str accessor is required on a Series) and write it back
title_cased_names = chicago['Name'].str.title()
chicago['Name'] = title_cased_names

# examine
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [15]:
# number of characters in each name: call the built-in len() on every value
names = chicago['Name']
names.apply(len)

0        15
1        17
2        14
3        19
4        19
         ..
32057    19
32058    18
32059    17
32060    19
32061    19
Name: Name, Length: 32062, dtype: int64

In [16]:
# same result as .apply(len), using the vectorized string accessor
names = chicago['Name']
names.str.len()

0        15
1        17
2        14
3        19
4        19
         ..
32057    19
32058    18
32059    17
32060    19
32061    19
Name: Name, Length: 32062, dtype: int64

## .str.replace() method

In [17]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [18]:
# swap every lowercase 'l' for an uppercase 'L'
greeting = 'hello world'
greeting.replace('l', 'L')

'heLLo worLd'

In [19]:
# remove 'h' by replacing it with the empty string '' (nothing, not a space)
greeting = 'hello world'
greeting.replace('h', '')

'ello world'

In [20]:
# in 'Department', replace 'MGMNT' with 'MANAGEMENT'
# .str.replace() returns plain strings, so the result is cast back to category
department_spelled_out = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT')
chicago['Department'] = department_spelled_out.astype('category')

# examine
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [21]:
# convert 'Employee Annual Salary' from strings such as '$90744.00' to float
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
      .str.replace('$', '', regex = False)   # remove the literal '$'; regex = False because '$' is a regex metacharacter (end-of-string anchor)
      .astype('float')                       # convert to float
)

chicago.head()

  chicago['Employee Annual Salary']


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [22]:
# check that 'Employee Annual Salary' is now a float column
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  float64 
dtypes: category(1), float64(1), object(2)
memory usage: 1.0+ MB


In [23]:
# total spend on all employees: sum of the numeric salary column
salaries = chicago['Employee Annual Salary']
salaries.sum()

2571506375.36

In [24]:
# average employee salary: mean of the numeric salary column
salaries = chicago['Employee Annual Salary']
salaries.mean()

80204.17863389682

In [25]:
# 10 largest salaries, highest first; the index labels identify the rows
salaries = chicago['Employee Annual Salary']
salaries.nlargest(n = 10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

In [26]:
# who's making 300k?
# 8184 is the index *label* reported by .nlargest(), so look the row up by label with .loc;
# .iloc is positional and only returns the same row while labels happen to equal positions
chicago.loc[8184]

Name                              EVANS,  GINGER S
Position Title            COMMISSIONER OF AVIATION
Department                                AVIATION
Employee Annual Salary                    300000.0
Name: 8184, dtype: object

## filtering with string methods
- .str.contains()
- .str.startswith()
- .str.endswith()
- best practice to precede with .str.lower() so the match is case-insensitive
  - equivalent to SQL LOWER(string) LIKE '%looking for%'

In [27]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [28]:
# preview the data before filtering 'Position Title' for 'WATER' in the next cell
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [29]:
# extract rows where 'Position Title' contains 'WATER'
# boolean mask: lower-case first (best practice) so the match ignores case
has_water = chicago['Position Title'].str.lower().str.contains('water')

# keep only the rows where the mask is True
chicago[has_water]

# equivalent to SQL LOWER(string) LIKE '%water%'

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [30]:
# rows whose 'Position Title' begins with 'water' (case-insensitive)
# boolean mask: lower-case the title, then test the prefix
starts_with_water = chicago['Position Title'].str.lower().str.startswith('water')

chicago[starts_with_water]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [31]:
# rows whose 'Position Title' ends with 'ist' (case-insensitive)
# boolean mask: True when the lower-cased title has the suffix, False otherwise
ends_with_ist = chicago['Position Title'].str.lower().str.endswith('ist')

chicago[ends_with_ist]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


## .strip(), .lstrip(), .rstrip()
- str.lstrip() removes whitespace from the left, i.e., leading
- str.rstrip() removes whitespace from the right, i.e., trailing
- str.strip() removes whitespace from both the left and the right

In [32]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [33]:
# four spaces of padding on the left (leading) and on the right (trailing)
padding = ' ' * 4
with_spaces = padding + 'hello world' + padding

In [34]:
# remove leading/left spaces only; the trailing spaces are kept
with_spaces.lstrip()

'hello world    '

In [35]:
# remove trailing/right spaces only; the leading spaces are kept
with_spaces.rstrip()

'    hello world'

In [36]:
# remove both leading and trailing white space; the space between the words is kept
with_spaces.strip()

'hello world'

In [37]:
# the same string methods applied to a Series in a DataFrame
lower_names = chicago['Name'].str.lower()   # convert to lower case
lower_names.str.strip()                     # remove trailing and leading white spaces

# the result can be assigned back to chicago['Name'] to replace the column/Series

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

## Invoking string methods on index and columns of a DataFrame
- same methods as above, but on:
  - df.index
  - df.columns

In [38]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [39]:
# preview the reloaded data before changing its index and column names
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [40]:
# move the 'Name' column into the index
chicago = chicago.set_index('Name')

chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [41]:
# snake_case the column names: lower case first, then underscores instead of spaces
lower_columns = chicago.columns.str.lower()
chicago.columns = lower_columns.str.replace(' ', '_')

chicago.head()

Unnamed: 0_level_0,position_title,department,employee_annual_salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [42]:
# change index labels to title case (first character of each word capitalized)
stripped_index = chicago.index.str.strip()    # remove leading and trailing white space
chicago.index = stripped_index.str.title()    # convert to title case

chicago.head()

Unnamed: 0_level_0,position_title,department,employee_annual_salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## .str.split() method
- Use in conjunction with .str.get(), where the parameter is the index position of the item to extract

In [44]:
# import data: reload a fresh copy so this section starts from the raw file
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
    .astype({'Department': 'category'})   # convert 'Department' to category for memory savings
    .dropna(how = 'all')                  # drop rows where all values are missing
)

In [46]:
# split string at white space
# with no argument, .split() splits on any run of white space (not just a single ' ')
intro = "Hello, my name is Jack"
intro.split()

['Hello,', 'my', 'name', 'is', 'Jack']

In [47]:
# preview the data; in 'Name' a comma separates the last name from the rest
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [48]:
# most common last name?

In [55]:
# last name = the part of 'Name' before the comma, in title case
last_names = (
    chicago['Name']
      .str.split(',')           # Series of lists; the comma separates last name from the rest
      .str.get(0)               # first item of each list, index position 0
      .str.title()              # capitalize first letter, lower case other letters
)

# count each last name and keep the 10 most common
last_names.value_counts().head(10)

# Williams

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
Rodriguez    171
Jackson      136
Garcia       130
Davis        127
Hernandez    110
Name: Name, dtype: int64

In [62]:
# most common first word in 'Position Title' column/Series?
first_words = (
    chicago['Position Title']
      .str.split()                # split at white space; each value becomes a list of words
      .str.get(0)                 # word at index position 0 of each list
      .str.lower()                # lower case so the counts ignore case
)

# count each first word and keep the 10 most common
first_words.value_counts().head(10)

# police

police             10856
firefighter-emt     1509
sergeant            1186
pool                 918
firefighter          810
crossing             775
motor                721
sanitation           715
paramedic            641
asst                 606
Name: Position Title, dtype: int64

## More .str.split() practice

In [71]:
# what is the most common first name?
# start by looking at the last rows to see how 'Name' is formatted
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [91]:
# everything after the comma: first name and (optional) middle initial
given_names = (
    chicago['Name']
      .str.strip()          # remove trailing and leading white space
      .str.split(',')       # index position 0 is last name; index position 1 is first name and middle initial
      .str.get(1)           # first name and middle initial, separated by white space
      .str.strip()          # remove white space around first name and middle initial
)

# first name = first white-space-separated word, lower-cased
first_names = (
    given_names
      .str.split()          # index position 0 is first name; index position 1 is middle initial, if it exists
      .str.get(0)           # get first name
      .str.lower()          # convert to lower case
)

# count each first name and keep the 10 most common
first_names.value_counts().head(10)

michael    1153
john        899
james       676
robert      622
joseph      537
david       506
thomas      490
daniel      472
william     397
anthony     385
Name: Name, dtype: int64

In [None]:
# NOTE(review): expected an error when the white-space clean-up around the first name is
# left out, but could not reproduce one -- the next cell omits .str.strip() and still works,
# presumably because .str.split() with no separator ignores leading/trailing white space

In [124]:
# same first-name extraction on the first 10 rows, without any .str.strip() calls
first_ten_names = chicago['Name'][:10]
after_comma = first_ten_names.str.split(',').str.get(1)   # text after the comma
after_comma.str.split().str.get(0)                        # first white-space-separated word

0          ELVIA
1        JEFFERY
2         KARINA
3      KIMBERLEI
4        VICENTE
5         ANABEL
6       EMMANUEL
7          REECE
8    CHRISTOPHER
9         ROBERT
Name: Name, dtype: object

## *expand* and *n* parameters of str.split()
- by default, expand = False; when expand = True, a DataFrame is returned
- n refers to number of splits; can limit this to just 1 to split at first delimiter, which is white space by default
  - e.g., 'water meter taker' would be 'water' and 'meter taker'

In [125]:
# preview the data before splitting 'Name' and 'Position Title' into separate columns
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [126]:
# expand defaults to False: the result is a Series of lists, not a DataFrame
names = chicago['Name']
names.str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [127]:
# expand = True: each piece of the split becomes its own column of a DataFrame
names = chicago['Name']
names.str.split(',', expand = True)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


In [128]:
# assign split strings back to DataFrame as two new columns
name_parts = (
    chicago['Name']
      .str.split(',', n = 1, expand = True)    # n = 1: split only at the first comma, so there are never more than 2 columns
      .apply(lambda part: part.str.strip())    # drop the white space left around the comma
)
chicago[['last_name', 'first_name_middle_initial']] = name_parts

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,last_name,first_name_middle_initial
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [131]:
# splitting on white space with expand = True gives one column per word
title_words = chicago['Position Title'].str.split(expand = True)
title_words

# entries of 'Position Title' have different numbers of spaces/delimiters,
# so the frame is as wide as the longest title: max is 9 "words"

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [133]:
# n = 1: split only once, at the first space -> first word | rest of the title
position_titles = chicago['Position Title']
position_titles.str.split(n = 1, expand = True)

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
...,...,...
32057,FRM,OF MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER
32059,POLICE,OFFICER
32060,POLICE,OFFICER


In [None]:
## end of this section