In [16]:
# Dependencies
import pandas as pd
import numpy as np

In [17]:
# Name of the CSV file
file = 'Resources/donors2008.csv'

In [18]:
# The correct encoding must be used to read the CSV in pandas
df = pd.read_csv(file, encoding="ISO-8859-1")

In [19]:
# Preview of the DataFrame
# Note that FIELD8 is likely a meaningless column
df.head()

Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount,FIELD8
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0,
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0,
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0,
3,Adams,Lorraine,Self,New York,NY,10026,200.0,
4,Adams,Marion,,Exeter,NH,3833,100.0,


In [20]:
# Delete extraneous column
del df['FIELD8']
df.head()

Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0
3,Adams,Lorraine,Self,New York,NY,10026,200.0
4,Adams,Marion,,Exeter,NH,3833,100.0


In [21]:
# Identify incomplete rows
df.count()

LastName     1776
FirstName    1776
Employer     1743
City         1776
State        1776
Zip          1776
Amount       1776
dtype: int64

In [22]:
# Drop all rows with missing information
df = df.dropna(how='any')

In [23]:
# Verify dropped rows
df.count()

LastName     1743
FirstName    1743
Employer     1743
City         1743
State        1743
Zip          1743
Amount       1743
dtype: int64

In [24]:
# The Amount column is the wrong data type. It should be numeric.
df.dtypes

LastName      object
FirstName     object
Employer      object
City          object
State         object
Zip           object
Amount       float64
dtype: object

In [25]:
# Use pd.to_numeric() method to convert the datatype of the Amount column
df['Zip'] = pd.to_numeric(df['Zip'])

ValueError: Unable to parse string "66402-9001" at position 22

In [None]:
# Verify that the Amount column datatype has been made numeric
df['Amount'].dtype

In [26]:
# Display an overview of the Employers column
df['Employer'].value_counts()

None                             249
Self                             241
Retired                          126
Self Employed                     39
Self-Employed                     34
                                ... 
Fox                                1
Puma Springs Vineyards             1
South Brooklyn Legal Services      1
Greene & Seaver, PC                1
JDS Uniphase                       1
Name: Employer, Length: 1011, dtype: int64

In [27]:
# Clean up Employer category. Replace 'Self Employed' and 'Self' with 'Self-Employed'
df['Employer'] = df['Employer'].replace(
    {'Self Employed': 'Self-Employed', 'Self': 'Self-Employed'})

In [28]:
# Verify clean-up.
df['Employer'].value_counts()

Self-Employed                    314
None                             249
Retired                          126
Google                             6
Not Employed                       4
                                ... 
Fox                                1
Puma Springs Vineyards             1
South Brooklyn Legal Services      1
Greene & Seaver, PC                1
JDS Uniphase                       1
Name: Employer, Length: 1009, dtype: int64

In [None]:
df['Employer'] = df['Employer'].replace({'Not Employed': 'Unemployed', 'None': 'Unemployed'})
df['Employer'].value_counts()

In [None]:
# Display a statistical overview
# We can infer the maximum allowable individual contribution from 'max'
df.describe()