In [2]:
# Dependencies
import pandas as pd
import numpy as np

In [3]:
# Relative path of the donor CSV file that is loaded into pandas below.
# NOTE(review): the path is relative, so the notebook presumably has to be run
# from the directory that contains the Resources/ folder - confirm.
file = 'Resources/donors2008.csv'

In [23]:
# Load the CSV into a DataFrame.
# The encoding is passed explicitly as ISO-8859-1; presumably the file contains
# bytes that are not valid under pandas' default decoding - TODO confirm.
df = pd.read_csv(file, encoding="ISO-8859-1")

In [24]:
# Preview of the DataFrame
# df.info() prints its summary itself, so it runs first; df.head() is kept as
# the final expression of the cell because only the last expression of a cell
# is rendered automatically (previously the head() result was discarded).
# Note that FIELD8 is likely a meaningless column (check its non-null count
# in the info() summary).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1776 entries, 0 to 1775
Data columns (total 8 columns):
LastName     1776 non-null object
FirstName    1776 non-null object
Employer     1743 non-null object
City         1776 non-null object
State        1776 non-null object
Zip          1776 non-null object
Amount       1776 non-null float64
FIELD8       0 non-null float64
dtypes: float64(2), object(6)
memory usage: 111.1+ KB


In [6]:
# Delete extraneous column
# drop(..., errors='ignore') is used instead of `del df['FIELD8']` so that the
# cell can be re-run safely: `del` raises KeyError once the column is gone.
df = df.drop(columns=['FIELD8'], errors='ignore')
df.head()

Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0
3,Adams,Lorraine,Self,New York,NY,10026,200.0
4,Adams,Marion,,Exeter,NH,3833,100.0


In [7]:
# Identify incomplete rows: count() reports the number of non-null values in
# each column, so a column with a lower count than the others has missing entries.
df.count()

LastName     1776
FirstName    1776
Employer     1743
City         1776
State        1776
Zip          1776
Amount       1776
dtype: int64

In [8]:
# Remove every row that has at least one missing (NaN) value in any column.
# Only real nulls are affected; ordinary strings such as the literal 'None'
# are regular values and are kept.
df = df.dropna(axis='index', how='any')

In [9]:
# Verify dropped rows: after dropna() every column should report the same
# non-null count.
df.count()

LastName     1743
FirstName    1743
Employer     1743
City         1743
State        1743
Zip          1743
Amount       1743
dtype: int64

In [9]:
# Inspect the column data types. Amount must be numeric for the statistics
# computed later in the notebook.
# NOTE(review): the recorded output of this cell shows Amount as float64, i.e.
# it appears to be numeric already - confirm against a fresh run.
df.dtypes

LastName      object
FirstName     object
Employer      object
City          object
State         object
Zip           object
Amount       float64
dtype: object

In [10]:
# Make sure the Amount column holds numeric values by passing it through the
# pd.to_numeric() function and storing the converted column back on the frame.
df = df.assign(Amount=pd.to_numeric(df['Amount']))

In [11]:
# Verify that the Amount column has a numeric datatype after the conversion
df['Amount'].dtype

dtype('float64')

In [12]:
# Display an overview of the Employer column: the number of rows for each
# distinct employer value, most frequent first.
df['Employer'].value_counts()

None                        249
Self                        241
Retired                     126
Self Employed                39
Self-Employed                34
                           ... 
Hurtado, S.C.                 1
Casey Ciklin, Et Al           1
Lincolnland Properties        1
Taconic Capital Advisors      1
University Of Oklahoma        1
Name: Employer, Length: 1011, dtype: int64

In [15]:
# Normalise the spelling variants of self-employment in the Employer column:
# both 'Self Employed' and 'Self' are rewritten to 'Self-Employed'.
# replace() matches whole cell values here, not substrings.
df['Employer'] = df['Employer'].replace(
    ['Self Employed', 'Self'], 'Self-Employed')

In [16]:
# Verify clean-up: the self-employment variants should now be merged into a
# single 'Self-Employed' entry.
df['Employer'].value_counts()

Self-Employed              314
None                       249
Retired                    126
Google                       6
Not Employed                 4
                          ... 
Juniper Networks/Intuit      1
Hurtado, S.C.                1
Casey Ciklin, Et Al          1
Lincolnland Properties       1
University Of Oklahoma       1
Name: Employer, Length: 1009, dtype: int64

In [17]:
# Fold the 'Not Employed' label into 'Unemployed' so that both spellings are
# counted as one category, then re-check the distribution.
df['Employer'] = df['Employer'].replace('Not Employed', 'Unemployed')
df['Employer'].value_counts()

Self-Employed              314
None                       249
Retired                    126
Unemployed                   8
Google                       6
                          ... 
Juniper Networks/Intuit      1
Hurtado, S.C.                1
Casey Ciklin, Et Al          1
Lincolnland Properties       1
University Of Oklahoma       1
Name: Employer, Length: 1008, dtype: int64

In [18]:
# Display a statistical overview of the numeric columns (only Amount here).
# 'max' is the largest single contribution recorded in this dataset.
# NOTE(review): treating it as the maximum allowable individual contribution
# is an inference from the data, not a verified limit - confirm separately.
df.describe()

Unnamed: 0,Amount
count,1743.0
mean,640.12475
std,1242.343265
min,5.0
25%,200.0
50%,250.0
75%,500.0
max,5000.0


In [22]:
# Final summary of the cleaned DataFrame: row count, columns, non-null counts
# and dtypes after all the clean-up steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1743 entries, 0 to 1775
Data columns (total 7 columns):
LastName     1743 non-null object
FirstName    1743 non-null object
Employer     1743 non-null object
City         1743 non-null object
State        1743 non-null object
Zip          1743 non-null object
Amount       1743 non-null float64
dtypes: float64(1), object(6)
memory usage: 108.9+ KB
