In [1]:
# Import libraries 
import pandas as pd
from pathlib import Path

In [2]:
# Load the raw applicant records from adult_data.csv into the adult_df dataframe
# and preview the first five rows.
file = Path("adult_data.csv")
adult_df = pd.read_csv(filepath_or_buffer=file)
adult_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Remove the columns that are not needed from the original csv file:
# ' fnlwgt', ' capital-gain' and ' capital-loss' (each label starts with a space).
adult_df.drop([' fnlwgt', ' capital-gain', ' capital-loss'], axis=1, inplace=True)
adult_df.head(n=5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [4]:
# Replace the column labels with capitalised names. The labels are assigned by
# position, so the order of this list must match the dataframe's column order.
columns = [
    'Age',
    'Workclass',
    'Education',
    'Education-num',
    'Marital-status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Hours-per-week',
    'Native-country',
    'Salary',
]
adult_df.columns = columns
adult_df.head(n=5)

Unnamed: 0,Age,Workclass,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Hours-per-week,Native-country,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [5]:
# Count the null values in each column; a non-zero count would mean there are
# rows that still need to be dropped.
adult_df.isna().sum()

Age               0
Workclass         0
Education         0
Education-num     0
Marital-status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Hours-per-week    0
Native-country    0
Salary            0
dtype: int64

In [6]:
# Count the rows that are an exact repeat of an earlier row
adult_df.duplicated(keep='first').sum()

4243

In [7]:
# Keep only the first occurrence of every repeated row, then recount the
# duplicates; the count below should be '0' once none remain.
adult_df = adult_df.loc[~adult_df.duplicated()].copy()
adult_df.duplicated().sum()

0

In [8]:
# Drop applicants that did not fill out parts of their application. An
# unanswered field is recorded as ' ?' in the data; a bare '?' is accepted as
# well so this cell gives the same result if the leading spaces have already
# been stripped from the text columns.
# TODO: add an explanation of this step to the README file.
missing_markers = [' ?', '?']

# isin() is an exact-match membership test applied to every cell, so one
# row-wise mask replaces a per-column loop and needs no guard condition.
incomplete = adult_df.isin(missing_markers).any(axis=1)
adult_df = adult_df.loc[~incomplete].copy()

# The total number of unique applicants who have filled out the application completely can be found by running the code below
adult_df.count()

  res_values = method(rvalues)


Age               26142
Workclass         26142
Education         26142
Education-num     26142
Marital-status    26142
Occupation        26142
Relationship      26142
Race              26142
Sex               26142
Hours-per-week    26142
Native-country    26142
Salary            26142
dtype: int64

In [11]:
# Clean the text fields that have a " " before the entry. Only the whitespace
# around each value is stripped, so a space inside a value is left intact
# (replacing every " " would also glue multi-word entries together).
for column_name in adult_df:
    # Only object-dtype columns hold text; numeric columns are skipped.
    if adult_df[column_name].dtype == object:
        adult_df[column_name] = adult_df[column_name].str.strip()

# Preview the cleaned dataframe
adult_df.head()


Unnamed: 0,Age,Workclass,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Hours-per-week,Native-country,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [12]:
# Build a separate dataframe with the applicants ordered by 'Age' (ascending)
adult_age_df = adult_df.sort_values(by='Age', ascending=True)
adult_age_df

Unnamed: 0,Age,Workclass,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Hours-per-week,Native-country,Salary
26807,17,Private,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,20,United-States,<=50K
21196,17,Private,10th,6,Never-married,Other-service,Own-child,White,Female,14,United-States,<=50K
31131,17,Private,11th,7,Never-married,Sales,Own-child,White,Male,5,United-States,<=50K
26793,17,Private,10th,6,Never-married,Sales,Other-relative,White,Male,30,United-States,<=50K
14785,17,Private,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,15,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
18725,90,Local-gov,HS-grad,9,Married-civ-spouse,Other-service,Husband,White,Male,40,United-States,<=50K
4070,90,Private,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,40,United-States,<=50K
6624,90,Private,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,<=50K
2303,90,Private,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,35,United-States,<=50K


In [13]:
# Build a separate dataframe with the applicants ordered by 'Education-num' (ascending)
adult_edu_num_df = adult_df.sort_values(by='Education-num', ascending=True)
adult_edu_num_df

Unnamed: 0,Age,Workclass,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Hours-per-week,Native-country,Salary
2884,71,Private,Preschool,1,Widowed,Craft-repair,Unmarried,Black,Male,10,United-States,<=50K
21370,61,Private,Preschool,1,Married-spouse-absent,Other-service,Not-in-family,Asian-Pac-Islander,Male,40,China,<=50K
32432,36,Private,Preschool,1,Divorced,Other-service,Not-in-family,Other,Male,72,Mexico,<=50K
7894,26,Private,Preschool,1,Married-spouse-absent,Machine-op-inspct,Not-in-family,White,Male,40,Mexico,<=50K
6864,54,Private,Preschool,1,Married-civ-spouse,Farming-fishing,Husband,White,Male,60,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
4805,51,Federal-gov,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Female,70,United-States,>50K
25831,31,Private,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,99,United-States,>50K
28082,33,State-gov,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,50,United-States,>50K
21430,52,Local-gov,Doctorate,16,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,>50K


In [14]:
# Build a separate dataframe with the applicants ordered by 'Hours-per-week' (ascending)
adult_hpw_df = adult_df.sort_values(by='Hours-per-week', ascending=True)
adult_hpw_df

Unnamed: 0,Age,Workclass,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Hours-per-week,Native-country,Salary
11451,27,Private,HS-grad,9,Never-married,Machine-op-inspct,Other-relative,White,Male,1,United-States,<=50K
25078,74,Private,10th,6,Divorced,Other-service,Not-in-family,White,Female,1,United-States,<=50K
19750,23,Private,HS-grad,9,Never-married,Craft-repair,Own-child,Asian-Pac-Islander,Male,1,Vietnam,<=50K
24284,57,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,1,United-States,<=50K
22960,21,Private,HS-grad,9,Never-married,Machine-op-inspct,Own-child,Black,Male,1,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
12788,24,State-gov,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Female,99,England,<=50K
12625,51,Private,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,99,South,<=50K
23830,45,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Other-service,Wife,White,Female,99,United-States,<=50K
15356,90,Private,HS-grad,9,Widowed,Transport-moving,Unmarried,White,Male,99,United-States,<=50K


In [16]:
# Write the cleaned adult_df dataframe to a new csv file, "applicants.csv",
# leaving the dataframe index out of the file.
adult_df.to_csv(path_or_buf='applicants.csv', index=False)