In [1]:
import pandas as pd

In [3]:
chicago = pd.read_csv("chicago.csv")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [5]:
chicago["Department"].nunique()

35

In [6]:
chicago["Department"].count()

32062

In [7]:
# Only 35 distinct values occur across 32062 entries, so store the column as a category.
chicago["Department"] = chicago["Department"].astype("category")

In [8]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


# Common String Methods .lower() .upper() .title() .len()

In [9]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [10]:
chicago["Name"].str.lower().head(3)

0      aaron,  elvia j
1    aaron,  jeffery m
2       aaron,  karina
Name: Name, dtype: object

In [12]:
chicago["Name"].str.lower().str.upper().head(3)

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

In [13]:
chicago["Name"].str.title().head(3)

0      Aaron,  Elvia J
1    Aaron,  Jeffery M
2       Aaron,  Karina
Name: Name, dtype: object

In [14]:
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [15]:
# Character count of each department name.
# NOTE(review): the result is float64 rather than int — presumably because the
# column has one missing entry (32062 non-null of 32063 rows); confirm.
chicago["Department"].str.len().head(3)

0    11.0
1     6.0
2     6.0
Name: Department, dtype: float64

# .str.replace()

In [16]:
chicago["Department"].str.replace("MGMNT", "MANAGEMENT").head(3)

0    WATER MANAGEMENT
1              POLICE
2              POLICE
Name: Department, dtype: object

In [17]:
# Persist the expanded spelling. The result of .str.replace() is an object
# column (see the info() output that follows), so the category dtype is lost here.
chicago["Department"] = chicago["Department"].str.replace("MGMNT", "MANAGEMENT")

In [18]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [19]:
# Convert Department back to a category after the string replacement.
chicago["Department"] = chicago["Department"].astype("category")

In [20]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


In [21]:
# Strip the "$" prefix and convert the salary strings to floats.
# regex=False makes "$" a literal character; read as a regular expression it is
# the end-of-string anchor, and the default for `regex` differs between pandas
# versions, so the intent is stated explicitly.
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"]
    .str.replace("$", "", regex=False)
    .astype(float)
)

In [22]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null float64
dtypes: category(1), float64(1), object(2)
memory usage: 784.4+ KB


# Filtering with String Methods

In [23]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",Police Officer,POLICE,84450.0
2,"AARON, KARINA",Police Officer,POLICE,84450.0


In [27]:
# Case-insensitive "contains" test: lower-case the titles, then look for "water".
# The result is object dtype and holds NaN for the last row, whose Position Title
# is missing (see mask.tail(3) below), so it is not yet a clean boolean mask.
mask = chicago["Position Title"].str.lower().str.contains("water")

In [30]:
mask.tail(3)

32060    False
32061    False
32062      NaN
Name: Position Title, dtype: object

In [34]:
# Drop rows in which every column is missing. Rebinding the name instead of
# using inplace=True keeps the operation chainable and avoids mutating a frame
# that earlier cells have already displayed.
chicago = chicago.dropna(how="all")

In [37]:
# Case-insensitive search: lower-case the titles first, flag every title that
# contains "water" anywhere, then count the matching rows per column.
titles_lower = chicago["Position Title"].str.lower()
mask = titles_lower.str.contains("water")
chicago[mask].count()

Name                      111
Position Title            111
Department                111
Employee Annual Salary    111
dtype: int64

In [38]:
# Keep only the titles that begin with "water" (compared in lower case) and
# count the matching rows per column.
titles_lower = chicago["Position Title"].str.lower()
mask = titles_lower.str.startswith("water")
chicago[mask].count()

Name                      75
Position Title            75
Department                75
Employee Annual Salary    75
dtype: int64

# .strip(), .lstrip(), .rstrip()

In [39]:
# Remove leading whitespace, then trailing whitespace, from every name.
chicago["Name"] = chicago["Name"].str.lstrip().str.rstrip()

In [40]:
chicago["Position Title"] = chicago["Position Title"].str.strip()

# String methods in index and columns

In [43]:
# Reload the file with the names as the index, discard fully-empty rows, and
# store Department as a categorical column.
chicago = (
    pd.read_csv("chicago.csv", index_col="Name")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [45]:
chicago.index = chicago.index.str.strip().str.title()

In [46]:
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [47]:
chicago.columns = chicago.columns.str.upper()

In [48]:
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


# Splitting

In [49]:
# Start again from the raw file: discard fully-empty rows and store Department
# as a categorical column.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [57]:
# Text before the comma, with surrounding whitespace stripped.
# NOTE(review): for "AARON, ELVIA J" this yields "AARON", which looks like the
# surname even though the column is labelled "F Name" — confirm the intended naming.
chicago["F Name"] = chicago["Name"].str.split(",").str.get(0).str.strip()

In [58]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,F Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON


In [59]:
# Take the text after the comma, trim it, and keep its first word.
after_comma = chicago["Name"].str.split(",").str.get(1).str.strip()
chicago["L Name"] = after_comma.str.split(" ").str.get(0)

In [60]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,F Name,L Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [61]:
# Second word of the text after the comma; missing when that text has only one word.
after_comma = chicago["Name"].str.split(",").str.get(1).str.strip()
chicago["M Name"] = after_comma.str.split(" ").str.get(1)

In [64]:
# The combined Name column is redundant once its parts have their own columns.
# Rebinding instead of inplace=True avoids mutating the frame in place and keeps
# the operation chainable.
chicago = chicago.drop("Name", axis=1)

In [65]:
chicago.head(3)

Unnamed: 0,Position Title,Department,Employee Annual Salary,F Name,L Name,M Name
0,WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA,J
1,POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY,M
2,POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,


In [66]:
chicago["Position Title"].str.split(" ").str.get(0).value_counts()

POLICE                   10856
FIREFIGHTER-EMT           1509
SERGEANT                  1186
POOL                       918
FIREFIGHTER                810
CROSSING                   775
MOTOR                      721
SANITATION                 715
PARAMEDIC                  641
ASST                       606
TRAFFIC                    512
FIRE                       512
SENIOR                     470
CONSTRUCTION               452
LIEUTENANT-EMT             394
ADMINISTRATIVE             375
LIBRARY                    365
LIBRARIAN                  335
LIEUTENANT                 332
OPERATING                  324
ELECTRICAL                 313
AVIATION                   309
FIREFIGHTER/PARAMEDIC      259
GENERAL                    257
STAFF                      250
CLERK                      242
FOREMAN                    237
HOISTING                   214
DEPUTY                     213
LABORER                    210
                         ...  
EQUAL                        1
TELECOMM

# More splits, expand and n parameters

In [70]:
# Fresh copy of the data: fully-empty rows removed, Department stored as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [74]:
# Split each name at the first comma only (n=1) so the result never has more
# than two columns; an unbounded split would produce a third column — and make
# this two-column assignment fail — if any name contained a second comma.
chicago[["F Name", "Last Name"]] = chicago["Name"].str.split(",", n=1, expand=True)

In [75]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,F Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [78]:
# Split every title on spaces into separate columns and keep the first one.
title_parts = chicago["Position Title"].str.split(" ", expand=True)
chicago["Position Start"] = title_parts[0]

In [79]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,F Name,Last Name,Position Start
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE


In [83]:
chicago["Position Title"].str.split(" ", expand=True, n=1).head(3)

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER


In [84]:
chicago[["Position Start", "Position rest"]] = chicago["Position Title"].str.split(" ", expand=True, n=1)

In [85]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,F Name,Last Name,Position Start,Position rest
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
