<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_working_with_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with text

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# load data
chicago = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
)

chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
# number of rows and columns
chicago.shape

(32063, 4)

In [5]:
# data types, non-null counts, and memory usage
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [7]:
# number of unique values for each column/Series
chicago.nunique()

# convert 'Department' to category

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [8]:
# convert 'Department' to category for memory optimization
chicago['Department'] = chicago['Department'].astype('category')

# memory savings
chicago.info()       # from 1002 kb to 784 kb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


In [10]:
# import data
chicago = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
)

# convert 'Department' to category for memory savings
chicago['Department'] = chicago['Department'].astype('category')

# drop rows where all values are missing
chicago = chicago.dropna(how = 'all')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


## Common string methods
- .lower()
- .upper()
- title -- what is this?
- len() -- may need to adjust syntax

In [11]:
# import data
chicago = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/chicago.csv'
)

# convert 'Department' to category for memory savings
chicago['Department'] = chicago['Department'].astype('category')

# drop rows where all values are missing
chicago = chicago.dropna(how = 'all')