In [1]:
import pandas as pd

In [3]:
# all public employees in the city of Chicago 
# we can see that the name is stored in one cell, the first and last name is stored as one 
# the position title is in all CAPS 
# and the salary is sotred as text because of the $ which prevents us from during numeric operations 

chicago = pd.read_csv("chicago.csv")
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [4]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [11]:
chicago["Position Title"].value_counts()
chicago["Department"].nunique()

35

In [13]:
# we can see that the Department column only has 35 uniwue values which we then can convert to a category 

chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [15]:
# converting to a category 

chicago["Department"] = chicago["Department"].astype("category")

In [16]:
# reduced the amount of storage 

chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.4+ KB


In [17]:
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00
32062,,,,


## Common String Methods - .lower(), .upper(), .title(), and .len() 

In [3]:
# we are removing the NaN values at the end and converting the department into a category 

chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [4]:
# converts all the characters to lower case 

"HELLO WORLD".lower()

'hello world'

In [5]:
# capitalizes hello world 

"hello world".upper()

'HELLO WORLD'

In [7]:
# characters in a string 

len("Hello World")

11

In [11]:
# using it on columns 

chicago["Name"].str.lower()

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [12]:
# title is going to capitalize the first letter of each separate word so it makes it look proper 

chicago["Name"].str.title()

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32057    Zygadlo,  Michael J
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
Name: Name, Length: 32062, dtype: object

In [15]:
# much presentable format 

chicago["Position Title"] = chicago["Position Title"].str.title()
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [16]:
# number of characters in every single value 

chicago["Department"].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

## The .str.replace() Method 

In [17]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [18]:
# this replaces all the l's with a !'s

"Hello World".replace("l", "!")

'He!!o Wor!d'

In [19]:
# with all the MGMNT we will replace with management but we have to make sure to attach str to the column 
# that we want to replace with 

chicago["Department"].str.replace("MGMNT", "MANAGEMENT")

0        WATER MANAGEMENT
1                  POLICE
2                  POLICE
3        GENERAL SERVICES
4        WATER MANAGEMENT
               ...       
32057    GENERAL SERVICES
32058              POLICE
32059              POLICE
32060              POLICE
32061                DoIT
Name: Department, Length: 32062, dtype: object

In [23]:
# we want to get rid of the $ in the string and convert it into a float 

chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", '').astype("float")

In [24]:
# we can see that we have converted the employee annual salary into a float so now we can perform 
# mathematical functions to it 

chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0


In [25]:
# top salary 

chicago["Employee Annual Salary"].max()

300000.0

In [26]:
chicago["Employee Annual Salary"].min()

0.96

In [28]:
# top 10 salaries 

chicago["Employee Annual Salary"].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

In [29]:
# 5 lowest salaries 

chicago["Employee Annual Salary"].nsmallest(5)

15102       0.96
12       2756.00
27       2756.00
47       2756.00
295      2756.00
Name: Employee Annual Salary, dtype: float64

## Filtering with String Methods

In [2]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [6]:
# we want to extract any rows where the position title contains the word 'water'
# all the matches we will want to find are case sensitive 
# the normalized way that we can use to find the word we are looking for is by applying the lower() 


# we will want to do some method chaning here because we want to use the contains method to find 'water'
mask = chicago["Position Title"].str.lower().str.contains("water")

# this will show us the dataframe with all the titles containing water 
chicago[mask]


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [8]:
# this will show us what position title STARTS WITH 'water'

mask1 = chicago['Position Title'].str.lower().str.startswith("water")
chicago[mask1]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [11]:
# nothing will return because nothings in position title ends with water 

mask2 = chicago["Position Title"].str.lower().str.endswith("water")
chicago[mask2]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary


In [12]:
# all the position titles that end with 'ist'
# we are returned the dataframe

mask3 = chicago["Position Title"].str.lower().str.endswith("ist")
chicago[mask3]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


## More String Methods - .strip(), .lstrip(), and .rstrip()

####  the methods above are used to remove whitespace 

In [3]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [14]:
# all the whitespace from the left is removed 

"    Hello World  ".lstrip(" ")

'Hello World  '

In [16]:
# all the whitespace from the right is being removed 

"    Hello World  ".rstrip(" ")

'    Hello World'

In [17]:
# this removes all the whitespace from both the left and right 

"    Hello World  ".strip(" ")

'Hello World'

In [11]:
# or you can just use the regular .strip() 
# this method does not overwrite and therefore will need to assign it back 

chicago["Name"] = chicago["Name"].str.rstrip().str.lstrip()
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [14]:
chicago['Position Title'] = chicago['Position Title'].str.strip()
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


## String Methods on Index and Columns  

In [16]:
# we will set the name as the index column 

chicago = pd.read_csv("chicago.csv", index_col = 'Name').dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [20]:
# strip the whitespace and include the title so that it cleans up the way that the names look 

chicago.index = chicago.index.str.strip().str.title()

In [21]:
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [25]:
chicago.columns = chicago.columns.str.upper()
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


## Split Strings by Characters with .str.split() Method 

In [32]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [27]:
# will return a list with all the words in the string 
# the space we specified will determine where we want to split so that we can make the list 
# the space is the delimiter showing us that we can split the words up when the string comes accross a space 

"Hello my name is Boris".split(" ")

['Hello', 'my', 'name', 'is', 'Boris']

In [39]:
# the get method in this situation will return the item in the 0th position also the first item which is also the 
# last name and then we use value_counts to see the most common last names 

chicago["Name"].str.split(",").str.get(0).str.title().value_counts()

Williams    293
Johnson     244
Smith       241
Brown       185
Jones       183
           ... 
Osagboro      1
Miggins       1
Leano         1
Lydon         1
Klasek        1
Name: Name, Length: 13829, dtype: int64

In [45]:
# will split at the point of a space and create a list 

chicago["Position Title"].str.split(" ").str.get(0).value_counts()

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
READER                 1
PRINTER                1
PRODUCTION             1
DEVELOPMENT            1
DECK                   1
Name: Position Title, Length: 320, dtype: int64

## More Practice with Splits

In [46]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [57]:
# most common first name and this has its caveats because the first names might have the middle initials in them 
# we are getting the second item on index 1 
# to separate the First name and the middle we are going to have to strip it again because it has whitespace 
# then we will split again 

chicago["Name"].str.split(",").str.get(1).str.strip().str.split(" ").str.get(0).value_counts().head(5)

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

## The expand and n Parameters of the str.split() Method

In [58]:
chicago = pd.read_csv("chicago.csv").dropna(how = 'all')
chicago["Department"] = chicago["Department"].astype("category")
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [63]:
# the expand will give us a dataframe 

chicago[["First Name", "Last Name"]] = chicago["Name"].str.split(",", expand = True)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [65]:
# we need to limit the splits 

chicago["Position Title"].str.split(" ", expand = True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [68]:
# we have the n parameter that can limit the amount of splits that we want to do within the string 
# we are indicating that we want to do 1 total split by a space 
# the first word is split and then the rest is put into the second column 

chicago[["First Title Word", "Remaining Words"]] = chicago["Position Title"].str.split(" ", expand = True, n = 1)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
