In [42]:
import pandas as pd

In [43]:
df = pd.read_csv('chicago.csv')

In [44]:
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [46]:
# Drop every row that contains a missing value, modifying df in place
# (df.info() above reports 32063 entries but only 32062 non-null values per column).
df.dropna(inplace=True)

In [47]:
df['Department'].nunique()

35

In [48]:
df['Department'].count()

32062

* The **Department** column holds only 35 unique values across roughly 32,000 records,   
 so we can save memory by converting it to the `category` dtype.

In [49]:
# Re-encode the repetitive department labels with the categorical dtype.
department_categories = df['Department'].astype('category')
df['Department'] = department_categories

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 1.0+ MB


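The conversion shrinks the **Department** column itself (roughly 1 byte per row instead of an 8-byte object pointer, about 220 KB in total). The overall figure reported above does not drop because `dropna` replaced the compact `RangeIndex` with an `Int64Index`, which adds about 250 KB.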

In [51]:
df['Department'].str.lower().head()

0         water mgmnt
1              police
2              police
3    general services
4         water mgmnt
Name: Department, dtype: object

In [52]:
df['Department'].str.title().head()

0         Water Mgmnt
1              Police
2              Police
3    General Services
4         Water Mgmnt
Name: Department, dtype: object

In [53]:
df['Department'].str.len().head()

0    11
1     6
2     6
3    16
4    11
Name: Department, dtype: int64

In [54]:
df['Department'].str.replace('MGMNT','MANAGEMENT').head()

0    WATER MANAGEMENT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4    WATER MANAGEMENT
Name: Department, dtype: object

In [55]:
df['Employee Annual Salary'].dtype

dtype('O')

We can observe that **Employee Annual Salary** has been parsed as a string (`object` dtype) because of the **$** sign at the beginning of each value.

In [56]:
# '$' is also the regex end-of-string anchor, so request a plain literal match explicitly;
# once the currency symbol is gone the remaining text converts cleanly to float.
df['Employee Annual Salary'] = df['Employee Annual Salary'].str.replace('$', '', regex = False).astype('float')

In [57]:
df['Employee Annual Salary'].dtype

dtype('float64')

In [58]:
df['Employee Annual Salary'].nlargest(3)

8184     300000.0
7954     216210.0
25532    202728.0
Name: Employee Annual Salary, dtype: float64

In [59]:
mask = df['Position Title'].str.contains("WATER")

In [60]:
df[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,102440.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,82044.0
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,109272.0
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,102440.0


In [61]:
df[df['Position Title'].str.lower().str.startswith('water')].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,82044.0
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,82044.0
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,82044.0
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,53172.0


In [62]:
df[df['Position Title'].str.lower().str.endswith('ist')].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,99840.0
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,81948.0
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,89880.0
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,94328.0
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,91476.0


In [63]:
st = '  I AM BATMAN  '

In [64]:
st.lstrip()

'I AM BATMAN  '

In [65]:
st.rstrip()

'  I AM BATMAN'

In [66]:
st.strip()

'I AM BATMAN'

In [67]:
df['Position Title'] = df['Position Title'].str.strip()

In [68]:
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [69]:
# Use the title-cased, whitespace-stripped employee names as the row index.
df.index = df['Name'].str.title().str.strip()

In [70]:
# The names now live in the index, so remove the redundant 'Name' column in place.
df.drop('Name', axis = 1, inplace= True)

In [71]:
df.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,90744.0
"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
"Aaron, Karina",POLICE OFFICER,POLICE,84450.0
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [72]:
df.columns = df.columns.str.strip().str.upper()

In [73]:
df.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,90744.0
"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
"Aaron, Karina",POLICE OFFICER,POLICE,84450.0
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [74]:
df.reset_index(inplace= True)

In [75]:
df.insert(loc = 0, column = 'FIRST NAME', value = df['Name'].str.split(',').str.get(0))

In [76]:
df.insert(loc = 1, column = 'LAST NAME', value = df['Name'].str.split(',').str.get(1).str.strip().str.split(" ").str.get(0))

In [78]:
df.head()

Unnamed: 0,FIRST NAME,LAST NAME,Name,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
0,Aaron,Elvia,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,90744.0
1,Aaron,Jeffery,"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
2,Aaron,Karina,"Aaron, Karina",POLICE OFFICER,POLICE,84450.0
3,Aaron,Kimberlei,"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,Abad Jr,Vicente,"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


An alternative way to split the names, which could then be added to the DataFrame, is:

In [79]:
df['Name'].str.split(',', expand = True).head()

Unnamed: 0,0,1
0,Aaron,Elvia J
1,Aaron,Jeffery M
2,Aaron,Karina
3,Aaron,Kimberlei R
4,Abad Jr,Vicente M


The **expand** parameter returns a DataFrame rather than a Series of lists.

In [80]:
df.drop('Name',axis  ='columns', inplace = True)

In [81]:
df.head()

Unnamed: 0,FIRST NAME,LAST NAME,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
0,Aaron,Elvia,WATER RATE TAKER,WATER MGMNT,90744.0
1,Aaron,Jeffery,POLICE OFFICER,POLICE,84450.0
2,Aaron,Karina,POLICE OFFICER,POLICE,84450.0
3,Aaron,Kimberlei,CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,Abad Jr,Vicente,CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [85]:
df['POSITION TITLE'].str.split(' ', expand = True).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
5,ASST,TO,THE,ALDERMAN,,,,,
6,GENERAL,LABORER,-,DSS,,,,,
7,TRAFFIC,CONTROL,AIDE-HOURLY,,,,,,
8,STAFF,ASST,TO,THE,ALDERMAN,,,,
9,ELECTRICAL,MECHANIC,,,,,,,


In [86]:
# n = 4 caps the number of splits, so at most 5 columns are produced;
# preview the first rows instead of rendering every record.
df['POSITION TITLE'].str.split(' ', expand =True, n = 4).head(10)

Unnamed: 0,0,1,2,3,4
0,WATER,RATE,TAKER,,
1,POLICE,OFFICER,,,
2,POLICE,OFFICER,,,
3,CHIEF,CONTRACT,EXPEDITER,,
4,CIVIL,ENGINEER,IV,,
5,ASST,TO,THE,ALDERMAN,
6,GENERAL,LABORER,-,DSS,
7,TRAFFIC,CONTROL,AIDE-HOURLY,,
8,STAFF,ASST,TO,THE,ALDERMAN
9,ELECTRICAL,MECHANIC,,,
