<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_working_with_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with text

In [2]:
# libraries needed
import numpy as np
import pandas as pd

In [3]:
# load data: chicago.csv from the python_learning repo on GitHub
# (employee names, position titles, departments, and annual salaries)
chicago = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
)

# preview the first five rows
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
# number of rows and columns, returned as a (rows, columns) tuple
chicago.shape

(32063, 4)

In [5]:
# data types, non-null counts, and memory usage of each column
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [6]:
# number of unique values for each column/Series
chicago.nunique()

# a column with few distinct values (e.g. 'Department') is a good candidate for the category dtype

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [7]:
# 'Department' has few distinct values, so store it as a category to save memory
chicago = chicago.astype({'Department': 'category'})

# memory savings: compare "memory usage" with the previous .info() output
chicago.info()       # from ~1002 KB down to ~784 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [8]:
# import data, then in one chain:
#   - store 'Department' as a category (memory savings)
#   - drop rows where all values are missing
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
      .astype({'Department': 'category'})
      .dropna(how = 'all')
)

## Common string methods
- .lower() method: convert all characters to lower case
- .upper() method: convert all characters to upper case
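- .title() method: capitalize the first letter of each word and lower-case the rest; a "word" is any run of letters, so apostrophes and hyphens (not only spaces) also start a new word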
- len() function or df['col'].str.len(): count number of characters in a string
- for a Series, these methods are reached through the .str accessor (prefix)
  - df['col'].str.len()
  - df['col'].str.upper()
  - df['col'].str.lower()
  - df['col'].str.title()


In [9]:
# import data, then in one chain:
#   - store 'Department' as a category (memory savings)
#   - drop rows where all values are missing
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
      .astype({'Department': 'category'})
      .dropna(how = 'all')
)

In [42]:
# regular Python strings

# convert all characters to lower case
'Some Characters are in UPPER CASE others are in lower case'.lower()

'some characters are in upper case others are in lower case'

In [33]:
# convert all characters to upper case
'Some Characters are in UPPER CASE others are in lower case'.upper()

'SOME CHARACTERS ARE IN UPPER CASE OTHERS ARE IN LOWER CASE'

In [34]:
# capitalize the first letter of each word and lower-case the rest
# (note 'UPPER CASE' becomes 'Upper Case'); words are runs of letters, not just space-separated tokens
'Some Characters are in UPPER CASE others are in lower case'.title()

'Some Characters Are In Upper Case Others Are In Lower Case'

In [36]:
# number of characters; len() is a built-in function, not a string method
len('this')

4

In [41]:
# convert 'Name' Series/column to lowercase (returns a new Series; 'chicago' itself is not modified)
chicago['Name'].str.lower()                      # string methods on a Series are reached through the .str accessor

0            aaron,  elvia j
1          aaron,  jeffery m
2             aaron,  karina
3        aaron,  kimberlei r
4        abad jr,  vicente m
                ...         
32057    zygadlo,  michael j
32058     zygowicz,  peter j
32059      zymantas,  mark e
32060    zyrkowski,  carlo e
32061    zyskowski,  dariusz
Name: Name, Length: 32062, dtype: object

In [46]:
# title-case every name: first letter of each word upper case, the rest lower case
chicago = chicago.assign(
    Name = chicago['Name'].str.title()       # string methods on a Series need the .str accessor
)

# examine
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [44]:
# number of characters in each name: apply the built-in len() to every element
# (len() raises TypeError on a missing value, so this relies on the all-missing row having been dropped)
chicago['Name'].apply(len)

0        15
1        17
2        14
3        19
4        19
         ..
32057    19
32058    18
32059    17
32060    19
32061    19
Name: Name, Length: 32062, dtype: int64

In [47]:
# above is equivalent to the vectorized .str.len()
chicago['Name'].str.len()

0        15
1        17
2        14
3        19
4        19
         ..
32057    19
32058    18
32059    17
32060    19
32061    19
Name: Name, Length: 32062, dtype: int64

## .str.replace() method

In [67]:
# import data, then in one chain:
#   - store 'Department' as a category (memory savings)
#   - drop rows where all values are missing
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
      .astype({'Department': 'category'})
      .dropna(how = 'all')
)

In [68]:
# replace every lowercase 'l' with uppercase 'L'
'hello world'.replace('l', 'L')

'heLLo worLd'

In [69]:
# replace 'h' with the empty string '' (not a space), i.e., remove it
'hello world'.replace('h', '')

'ello world'

In [70]:
# in 'Department', replace 'MGMNT' with 'MANAGEMENT'
chicago['Department'] = (
    chicago['Department']
      .str.replace('MGMNT', 'MANAGEMENT')    # .str.replace() turns it back to a string ...
      .astype('category')                    # ... so turn it into a category again
)

# examine
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [71]:
# convert 'Employee Annual Salary' from strings such as '$90744.00' to float
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
      .str.replace('$', '', regex = False)   # remove the literal '$'; regex = False because '$' is an
                                             # end-of-string anchor when the pattern is read as a regex,
                                             # and the default for `regex` differs between pandas versions
      .astype('float')                       # convert to float
)

chicago.head()

  chicago['Employee Annual Salary']


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [72]:
# check that 'Employee Annual Salary' is now a float column
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  float64 
dtypes: category(1), float64(1), object(2)
memory usage: 1.0+ MB


In [73]:
# total annual salary spend across all employees
chicago['Employee Annual Salary'].sum()

2571506375.36

In [74]:
# average (mean) employee salary
chicago['Employee Annual Salary'].mean()

80204.17863389682

In [76]:
# 10 largest salaries; the left-hand values in the result are the rows' index labels
chicago['Employee Annual Salary'].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

In [77]:
# who's making 300k?
# 8184 is the index *label* reported by .nlargest(), so look the row up by label with .loc;
# .iloc is positional and gives the same row only while labels and positions happen to coincide
chicago.loc[8184]

Name                              EVANS,  GINGER S
Position Title            COMMISSIONER OF AVIATION
Department                                AVIATION
Employee Annual Salary                    300000.0
Name: 8184, dtype: object

## filtering with string methods
- .str.contains()
- .str.startswith()
- .str.endswith()
- best practice: lower-case the column first with .str.lower() and search for a lower-case pattern, so the match is case-insensitive
  - equivalent to SQL LOWER(string) LIKE '%looking for%'

In [78]:
# import data, then in one chain:
#   - store 'Department' as a category (memory savings)
#   - drop rows where all values are missing
chicago = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
    )
      .astype({'Department': 'category'})
      .dropna(how = 'all')
)

In [79]:
# preview the data before filtering on 'Position Title'
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [87]:
# keep only the rows whose 'Position Title' contains 'water', ignoring case;
# equivalent to SQL LOWER(string) LIKE '%water%'
chicago.loc[
    lambda df: (
        df['Position Title']
          .str.lower()                     # lower-case first so the match ignores case
          .str.contains('water')           # boolean Series: True where the title contains 'water'
    )
]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [89]:
# keep only the rows whose 'Position Title' begins with 'water', ignoring case
chicago.loc[
    lambda df: (
        df['Position Title']
          .str.lower()                  # lower-case first so the match ignores case
          .str.startswith('water')      # boolean Series: True where the title starts with 'water'
    )
]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [91]:
# keep only the rows whose 'Position Title' ends with 'ist', ignoring case
chicago.loc[
    lambda df: (
        df['Position Title']
          .str.lower()                   # lower-case first so the match ignores case
          .str.endswith('ist')           # boolean Series: True where the title ends with 'ist'
    )
]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


## .strip(), .lstrip(), .rstrip()