# Pandas (Part 2) - Cleaning Data - Exercise Solutions

In [None]:
import pandas as pd

# Notebook-wide display settings
pd.set_option('display.float_format', '{:,.2f}'.format)  # floats: thousands separator, 2 decimal places
pd.options.display.max_columns = None  # never truncate the column list
pd.options.display.max_rows = None  # never truncate the row list

#### Exercise 1

First, run the cell below to import the **violations.csv** file into the `viol` DataFrame. This is the same data set we worked with in the exercises during our last class period.

In [None]:
# Parse `date` as a datetime; keep zip, penalty and year as raw strings
viol = pd.read_csv(
    'violations.csv',
    parse_dates=['date'],
    dtype={'zip': str, 'penalty': str, 'year': str},
)
viol.head()

1. Drop the `company` column from `viol`. Print the first five rows of `viol` to ensure it was properly dropped.

In [None]:
# 1
# Remove the company column, then preview the result
viol = viol.drop('company', axis='columns')
viol.head()

2. Create a new DataFrame called `new_viol` that includes only the `ticker`, `date`, and `penalty` columns from `viol`. Print the first five rows of `new_viol`.

In [None]:
# 2
# Take an explicit copy so new_viol is an independent DataFrame; later
# assignments to it will not trigger SettingWithCopyWarning or touch viol.
new_viol = viol[['ticker', 'date', 'penalty']].copy()
new_viol.head()

#### Exercise 2

Run the cell below to re-import the **violations.csv** file into the `viol` DataFrame.

In [None]:
# Parse `date` as a datetime; keep zip, penalty and year as raw strings
viol = pd.read_csv(
    'violations.csv',
    parse_dates=['date'],
    dtype={'zip': str, 'penalty': str, 'year': str},
)

1. Create a new DataFrame containing all duplicate rows in the `viol` DataFrame. How many rows are duplicates in our data?

In [None]:
# 1
# keep=False flags every copy of a repeated row (the first occurrence included),
# so dup_rows holds all rows that have at least one exact duplicate.
dup_rows = viol[viol.duplicated(keep=False)]
dup_rows.info()  # info() prints its own report and returns None, so it is not wrapped in print()
dup_rows.head()

# 9,596 rows are duplicates in our data

2. Drop duplicate rows from the `viol` DataFrame, treating two rows as duplicates when they have the same `ticker` and `date` values. Keep only the first row from each set of duplicates.

In [None]:
# 2
# Rows sharing both ticker and date count as duplicates; the first one is kept
viol = viol.drop_duplicates(subset=['ticker', 'date'], keep='first')
viol.info()

#### Exercise 3

Run the cell below to re-import the **violations.csv** file into the `viol` DataFrame.

In [None]:
# Parse `date` as a datetime; keep zip, penalty and year as raw strings
viol = pd.read_csv(
    'violations.csv',
    parse_dates=['date'],
    dtype={'zip': str, 'penalty': str, 'year': str},
)

1. Fill in the `zip` column in the `viol` DataFrame with a blank string (i.e., `''`) if the value is missing.

In [None]:
# 1
# Replace missing zip codes with an empty string; other columns are left untouched
viol = viol.fillna({'zip': ''})
viol.head()

2. Drop all observations in the `viol` DataFrame in which the `ticker` is missing.

In [None]:
# 2
# Keep only the observations that have a ticker
viol = viol[viol['ticker'].notna()]
viol.info()

#### Exercise 4

Run the cell below to re-import the **violations.csv** file into the `viol` DataFrame.

In [None]:
# Parse `date` as a datetime; keep zip, penalty and year as raw strings
viol = pd.read_csv(
    'violations.csv',
    parse_dates=['date'],
    dtype={'zip': str, 'penalty': str, 'year': str},
)

1. Convert the `year` column in the `viol` DataFrame to an integer using the `astype()` function.

In [None]:
# 1
# Cast the year column (read in as strings) to integers
viol = viol.astype({'year': int})

2. Try to convert the `penalty` column in the `viol` DataFrame to a float using the `astype()` function. What is the error you receive?

In [None]:
# 2
try:
    viol['penalty'] = viol['penalty'].astype(float)
except ValueError as e:
    # Catch only the conversion failure being demonstrated; any other
    # exception would be a real problem and should surface normally.
    print(e)

# The error says "could not convert string to float: '$45,000 '"

3. Create a function that converts a `penalty` value to a float, then apply it to the `penalty` column.

In [None]:
# 3
def convert_penalty(val):
    """Convert a penalty value such as '$45,000 ' to a float.

    Strings have dollar signs, thousands separators and surrounding
    whitespace removed before conversion. Missing values (None/NaN) are
    returned as NaN instead of raising, and values that are already
    numeric are simply cast to float.

    Raises:
        ValueError: if the cleaned string is not a valid number.
    """
    # With dtype=str, missing cells arrive as NaN (a float), which has no
    # .replace() method -- pass them through as NaN.
    if pd.isna(val):
        return float('nan')
    if isinstance(val, str):
        val = val.replace('$', '').replace(',', '').strip()
    return float(val)

# Run the converter on every penalty value
viol['penalty'] = viol['penalty'].apply(convert_penalty)
viol.info()  # info() prints its own report and returns None, so it is not wrapped in print()
viol.head()

#### Exercise 5

You are provided a simple DataFrame called `df` that includes names, genders, heights, and marital statuses for four people.

In [None]:
# One record per person: (person, gender, height, marital_status)
df = pd.DataFrame(
    [
        ('Robby', 'M', 5.6, 'married'),
        ('Karen', 'F', 5.5, 'married'),
        ('Lydia', 'F', 5.3, 'married'),
        ('Greg', 'M', 6.2, 'single'),
    ],
    columns=['person', 'gender', 'height', 'marital_status'],
)
df.head()

1. Create a new column called `sentence` with the sentence "Robby is married." for each row, filling in the appropriate name and marital status.

In [None]:
# 1

# Join name and marital status with ' is ' and finish with a period
df['sentence'] = df['person'].str.cat(df['marital_status'], sep=' is ') + '.'
df.head()

2. Create a new column that includes only the first letter of the `marital_status` column. Extra bonus points if you can get that letter to be upper case.

In [None]:
# 2

# First character of each marital status
df['marital_status1'] = df['marital_status'].str.get(0)

# Bonus points! Upper-case the letter extracted above
df['marital_status1'] = df['marital_status1'].str.upper()

df.head()

3. Create a new column that multiplies `height` by 100.

In [None]:
# 3
# Scale every height by a factor of 100
df['height100'] = df['height'].mul(100)
df.head()