In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

In [3]:
# Load the census income records from the CSV next to the notebook
data = pd.read_csv(filepath_or_buffer='adult.csv')

In [4]:
# Preview the first five records
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# Take a look at the class balance of the outcome column
data.loc[:, 'income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [6]:
# Encode the outcome as 0 if income <=50K and 1 otherwise (i.e. >50K).
# The dtype guard makes this cell safe to re-run: once the column holds
# integers, comparing them against '<=50K' again would be False for every row
# and would silently turn every label into 1.
if not pd.api.types.is_numeric_dtype(data['income']):
    data['income'] = (data['income'] != '<=50K').astype('int64')

# X is the dataframe of features, y the series holding the encoded outcome
X = data.drop(columns='income')
y = data['income']

In [7]:
# Dimensions of the feature frame, followed by its first five rows
print(X.shape)
X.head(n=5)

(48842, 14)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States


In [8]:
# Length of the outcome series, followed by its first five labels
print(y.shape)
y.head(n=5)

(48842,)


0    0
1    0
2    1
3    1
4    0
Name: income, dtype: int64

## Data Cleaning

### Dealing with data types
Types of data.
- Numeric, e.g. income, age
- Categorical, e.g. gender, nationality
- Ordinal, e.g. low, med, high

Models can only handle numeric features

We must convert categorical and ordinal features into numeric features
- Creating dummy features
- Transform a categorical feature into a set of dummy features, each representing one unique category
- In the set of dummy features, 1 indicates that the observation belongs to that category

In [9]:
# 'education' is a categorical feature: count the observations per level
print(X.loc[:, 'education'].value_counts())

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64


In [10]:
# One-hot encode 'education' with pandas' get_dummies and preview five rows
# (scikit-learn's OneHotEncoder is another option)
print(X['education'].pipe(pd.get_dummies).head(n=5))

   10th  11th  12th  1st-4th  5th-6th  7th-8th  9th  Assoc-acdm  Assoc-voc  \
0     0     1     0        0        0        0    0           0          0   
1     0     0     0        0        0        0    0           0          0   
2     0     0     0        0        0        0    0           1          0   
3     0     0     0        0        0        0    0           0          0   
4     0     0     0        0        0        0    0           0          0   

   Bachelors  Doctorate  HS-grad  Masters  Preschool  Prof-school  \
0          0          0        0        0          0            0   
1          0          0        1        0          0            0   
2          0          0        0        0          0            0   
3          0          0        0        0          0            0   
4          0          0        0        0          0            0   

   Some-college  
0             0  
1             0  
2             0  
3             1  
4             1  


In [11]:
# Report how many distinct categories each object-typed column holds, to help
# decide which categorical variables to use in the model
for col_name in X.columns:
    if X[col_name].dtypes != 'object':
        continue  # skip non-object columns
    unique_cat = X[col_name].unique().size
    print("Feature '{col_name}' has {unique_cat} unique categories".format(
        col_name=col_name, unique_cat=unique_cat))

Feature 'workclass' has 9 unique categories
Feature 'education' has 16 unique categories
Feature 'marital-status' has 7 unique categories
Feature 'occupation' has 15 unique categories
Feature 'relationship' has 6 unique categories
Feature 'race' has 5 unique categories
Feature 'gender' has 2 unique categories
Feature 'native-country' has 42 unique categories


In [12]:
# Although 'native-country' has a lot of unique categories, most of them
# have only a few observations: show the five most frequent ones
(
    X['native-country']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
)

United-States    43832
Mexico             951
?                  857
Philippines        295
Germany            206
Name: native-country, dtype: int64

In [13]:
# Keep 'United-States' and bucket every other value (the '?' placeholder
# included) as 'Other'
X['native-country'] = X['native-country'].where(
    X['native-country'] == 'United-States', 'Other')

print(X['native-country'].value_counts().sort_values(ascending=False))

United-States    43832
Other             5010
Name: native-country, dtype: int64


In [14]:
# Categorical columns to one-hot encode. Every name must match a column of X
# exactly; the column is spelled 'marital-status' (hyphenated), so the earlier
# 'maritial_status' entry could never be found in the frame.
todummy_list = ['workclass', 'education', 'marital-status', 'occupation',
                'relationship', 'race', 'gender', 'native-country']

In [15]:
# Count the distinct values held by every object-typed column
for col_name in X:
    if X[col_name].dtype != 'object':
        continue  # only object columns are reported
    unique_count = X[col_name].unique().shape[0]
    print("Feature '{col_name}' has {unique_count} unique values".format(
        col_name=col_name, unique_count=unique_count))

Feature 'workclass' has 9 unique values
Feature 'education' has 16 unique values
Feature 'marital-status' has 7 unique values
Feature 'occupation' has 15 unique values
Feature 'relationship' has 6 unique values
Feature 'race' has 5 unique values
Feature 'gender' has 2 unique values
Feature 'native-country' has 2 unique values


In [16]:
# Function to dummy all the categorical variables used for modeling
def dummy_data(df, todummy_list):
    """Return a new frame in which the listed columns are one-hot encoded.

    Each column named in ``todummy_list`` is removed and replaced by its
    dummy columns, named ``<column>_<category>``. The untouched columns keep
    their original order and come first; the dummy columns follow in the
    order given by ``todummy_list``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    todummy_list : iterable of str
        Names of the columns to replace by dummy columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Raises
    ------
    KeyError
        If any requested name is not a column of ``df``.
    """
    todummy_list = list(todummy_list)

    # Fail up front with the full list of bad names, so a misspelled column
    # is easy to spot
    missing = [col for col in todummy_list if col not in df.columns]
    if missing:
        raise KeyError("Columns not found in df: {}".format(missing))

    dummy_frames = [
        pd.get_dummies(df[col], prefix=col, dummy_na=False)
        for col in todummy_list
    ]
    # `columns=` replaces the positional axis argument of drop(x, 1), which
    # recent pandas no longer accepts; one concat replaces one per column
    return pd.concat([df.drop(columns=todummy_list)] + dummy_frames, axis=1)

In [17]:
# Kept disabled. Before enabling: every name in todummy_list must be a column
# of X (dummy_data looks each one up in the frame), and because the result is
# assigned back to X the cell cannot be run twice in a row -- the encoded
# columns are no longer in X after the first run.
# X = dummy_data(X, todummy_list)
# print (X.head())