## Setup

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import utils_06 as utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 01 Dataset

In [2]:
# Read the Chicago employee CSV into a DataFrame and preview its first rows.
file_name = 'data/chicago.csv'
chicago = pd.read_csv(filepath_or_buffer=file_name)
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


## 02 String methods

In [None]:
# Title-case the first few Position Title values.
# Note that str.title capitalizes only the first letter of each word, so "IV" becomes "Iv".
chicago['Position Title'].head().str.title()

0            Water Rate Taker
1              Police Officer
2              Police Officer
3    Chief Contract Expediter
4           Civil Engineer Iv
Name: Position Title, dtype: object

In [7]:
# Expand the abbreviation "MGMNT" to "Management" in Department.
# The pattern is a plain substring, so it is matched literally rather than as a regex.
chicago['Department'].str.replace(pat='MGMNT', repl='Management', regex=False)

0        WATER Management
1                  POLICE
2                  POLICE
3        GENERAL SERVICES
4        WATER Management
               ...       
32058              POLICE
32059              POLICE
32060              POLICE
32061                DoIT
32062                 NaN
Name: Department, Length: 32063, dtype: object

In [9]:
# Select the rows whose Position Title contains "water", ignoring case.
# Rows with a missing Position Title are removed first; this rebinds `chicago`,
# so every later cell sees the frame without those rows.
chicago = chicago.dropna(subset=['Position Title'])
mask = chicago['Position Title'].str.contains('water', case=False)
chicago.loc[mask].head()


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [10]:
# Select the rows whose Position Title starts with "civil" (case-insensitive).
# na=False treats a missing title as "no match", so the mask is purely boolean
# and this cell works even if rows with a NaN Position Title are still present.
mask = chicago['Position Title'].str.lower().str.startswith('civil', na=False)
chicago[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
25,"ABDULSATTAR, MUDHAR",CIVIL ENGINEER II,WATER MGMNT,$58536.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
55,"ABUTALEB, AHMAD H",CIVIL ENGINEER II,WATER MGMNT,$89676.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00


In [11]:
# Select the rows whose Position Title ends with "IV" (case-insensitive).
# na=False treats a missing title as "no match", so the mask is purely boolean
# and this cell works even if rows with a NaN Position Title are still present.
mask = chicago['Position Title'].str.lower().str.endswith('iv', na=False)
chicago[mask].head()


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
145,"ADAMS, SHERYLL A",LIBRARIAN IV,PUBLIC LIBRARY,$97812.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
166,"ADENI, MOHAMED K",ACCOUNTANT IV,FINANCE,$97812.00


In [4]:
# Break each Position Title into a list of words, splitting on single spaces.
chicago['Position Title'].str.split(pat=' ')

0                [WATER, RATE, TAKER]
1                   [POLICE, OFFICER]
2                   [POLICE, OFFICER]
3        [CHIEF, CONTRACT, EXPEDITER]
4               [CIVIL, ENGINEER, IV]
                     ...             
32058               [POLICE, OFFICER]
32059               [POLICE, OFFICER]
32060               [POLICE, OFFICER]
32061    [CHIEF, DATA, BASE, ANALYST]
32062                             NaN
Name: Position Title, Length: 32063, dtype: object

In [5]:
# First word of each Position Title: split on spaces, then index into the resulting lists.
chicago['Position Title'].str.split(' ').str[0]

0         WATER
1        POLICE
2        POLICE
3         CHIEF
4         CIVIL
          ...  
32058    POLICE
32059    POLICE
32060    POLICE
32061     CHIEF
32062       NaN
Name: Position Title, Length: 32063, dtype: object

### Find the most common first name

In [14]:
# Peek at the first ten Name values to see the "LAST,  FIRST M" layout.
chicago['Name'].iloc[:10]

0          AARON,  ELVIA J
1        AARON,  JEFFERY M
2           AARON,  KARINA
3      AARON,  KIMBERLEI R
4      ABAD JR,  VICENTE M
5          ABARCA,  ANABEL
6        ABARCA,  EMMANUEL
7        ABASCAL,  REECE E
8     ABBASI,  CHRISTOPHER
9    ABBATACOLA,  ROBERT J
Name: Name, dtype: object

In [12]:
# Ten most common first names.
# A Name looks like "LAST,  FIRST M": keep the text after the comma, then take
# its first whitespace-separated token.
(chicago['Name']
 .str.split(',')
 .str.get(1)
 .str.split()      # no-argument split already ignores leading/trailing whitespace
 .str.get(0)
 .value_counts()   # result is already ordered by count, highest first
 .head(10))

Name
MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
DAVID       506
THOMAS      490
DANIEL      472
WILLIAM     397
ANTHONY     385
Name: count, dtype: int64

In [17]:
# Show the first ten raw Name values again for reference.
chicago['Name'].iloc[:10]

0          AARON,  ELVIA J
1        AARON,  JEFFERY M
2           AARON,  KARINA
3      AARON,  KIMBERLEI R
4      ABAD JR,  VICENTE M
5          ABARCA,  ANABEL
6        ABARCA,  EMMANUEL
7        ABASCAL,  REECE E
8     ABBASI,  CHRISTOPHER
9    ABBATACOLA,  ROBERT J
Name: Name, dtype: object

In [None]:
# How many Name values are missing?
pd.isna(chicago['Name']).sum()

np.int64(1)

In [21]:
# Remove rows with a missing Name and rebind `chicago` to the result.
chicago = chicago.dropna(subset=['Name'])

In [22]:
# Re-check the number of missing Name values after the drop.
pd.isna(chicago['Name']).sum()

np.int64(0)

In [28]:
# Build first names with the helper class from utils_06 (its implementation is not shown here).
extractor = utils.FirstNameExtractor()
# Call the extractor on every Name value and store the results in a new 'First Name' column.
chicago['First Name'] = chicago['Name'].apply(extractor.extract_first_name)
# Ten most frequent first names, highest count first.
chicago['First Name'].value_counts().sort_values(ascending=False).head(10)

First Name
Michael    1153
John        899
James       676
Robert      622
Joseph      537
David       506
Thomas      490
Daniel      472
William     397
Anthony     385
Name: count, dtype: int64

## 03 Coding challenge

In [30]:
# Read the customers CSV into a DataFrame and preview its first rows.
file_name = "data/customers.csv"
customers = pd.read_csv(filepath_or_buffer=file_name)
customers.head(n=5)

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."


In [None]:
# Our customers data set includes an Address column. Each address consists of a street,
# a city, a state, and a zip code. Your challenge is to separate these four values; assign
# them to new Street, City, State, and Zip columns; and then remove the Address
# column.

In [31]:
# Split each Address into its four comma-separated parts and store them as
# Street, City, State and Zip columns, then drop the original Address column.
columns = ['Street', 'City', 'State', 'Zip']
# The parts are separated by ", ", so the delimiter pattern also consumes the
# whitespace around each comma; splitting on a bare ',' would leave a leading
# space on City, State and Zip. The pieces stay strings (Zip is not converted).
customers[columns] = customers['Address'].str.split(r'\s*,\s*', expand=True, regex=True)
customers = customers.drop(columns=['Address'])
customers.head()

Unnamed: 0,Name,Street,City,State,Zip
0,Frank Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252
