Load Dataset

In [1]:
import pandas as pd
import numpy as np

dataframe = pd.read_csv("adult.data")

Check DF dimensions

In [2]:
dataframe.shape

(32560, 15)

Name columns and preprocess some of the data inside the DF

Preprocessing Key:

age: continuous.
<br>
workclass: Private : 0 , Self-emp-not-inc : 1 , Self-emp-inc : 2 , Federal-gov : 3, Local-gov : 4, State-gov : 5, Without-pay : 6, Never-worked : 7.
<br>
fnlwgt: continuous.
<br>
education: Bachelors : 0, Some-college : 1, 11th : 2, HS-grad : 3, Prof-school : 4, Assoc-acdm : 5, Assoc-voc : 6, 9th : 7, 7th-8th : 8, 12th : 9, Masters : 10, 1st-4th : 11, 10th : 12, Doctorate : 13, 5th-6th : 14, Preschool : 15.
<br>
education-num: continuous.
<br>
marital-status: Married-civ-spouse : 0, Divorced : 1, Never-married : 2, Separated : 3, Widowed : 4, Married-spouse-absent : 5, Married-AF-spouse : 6.
<br>
occupation: Tech-support : 0, Craft-repair : 1, Other-service : 2, Sales : 3, Exec-managerial : 4, Prof-specialty : 5, Handlers-cleaners : 6, Machine-op-inspct : 7, Adm-clerical : 8, Farming-fishing : 9, Transport-moving : 10, Priv-house-serv : 11, Protective-serv : 12, Armed-Forces : 13.
<br>
relationship: Wife : 0 , Own-child : 1 , Husband : 2, Not-in-family : 3, Other-relative : 4, Unmarried : 5.
<br>
race: White : 0, Asian-Pac-Islander : 1, Amer-Indian-Eskimo : 2, Other : 3, Black : 4.
<br>
sex: Female : 0, Male : 1.
<br>
capital-gain: continuous.
<br>
capital-loss: continuous.
<br>
hours-per-week: continuous.
<br>
native-country: United-States : 0, Cambodia : 1, England : 2, Puerto-Rico : 3, Canada : 4, Germany : 5, Outlying-US(Guam-USVI-etc) : 6, India : 7, Japan : 8, Greece : 9, South : 10, China : 11, Cuba : 12, Iran : 13, Honduras : 14, Philippines : 15, Italy : 16, Poland : 17, Jamaica : 18, Vietnam : 19, Mexico : 20, Portugal : 21, Ireland : 22, France : 23, Dominican-Republic : 24, Laos : 25, Ecuador : 26, Taiwan : 27, Haiti : 28, Columbia : 29, Hungary : 30, Guatemala : 31, Nicaragua : 32, Scotland : 33, Thailand : 34, Yugoslavia : 35, El-Salvador : 36, Trinadad&Tobago : 37, Peru : 38, Hong : 39, Holand-Netherlands : 40.
<br>

In [3]:
colnames = [
    "Age",
    "Workclass",
    "Fnlwgt",
    "Education",
    "Education Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours Per Week",
    "Native Country",
    "Income"
]

dataframe.columns = colnames

workclass_repl = {
    " Private" : 0, 
    " Self-emp-not-inc" : 1, 
    " Self-emp-inc" : 2,
    " Federal-gov" : 3, 
    " Local-gov" : 4,
    " State-gov" : 5,
    " Without-pay" : 6,
    " Never-worked" : 7,
    " ?" : np.nan
}

dataframe["Workclass"] = [workclass_repl[item] for item in dataframe["Workclass"]]

education_repl = { " Bachelors" : 0, " Some-college" : 1, " 11th" : 2, " HS-grad" : 3, " Prof-school" : 4, " Assoc-acdm" : 5, " Assoc-voc" : 6, " 9th" : 7, " 7th-8th" : 8, " 12th" : 9, " Masters" : 10, " 1st-4th" : 11, " 10th" : 12, " Doctorate" : 13, " 5th-6th" : 14, " Preschool" : 15 }
dataframe["Education"] = [education_repl[item] for item in dataframe["Education"]]

marital_repl = {" Married-civ-spouse" : 0, " Divorced" : 1, " Never-married" : 2, " Separated" : 3, " Widowed" : 4, " Married-spouse-absent" : 5, " Married-AF-spouse" : 6 }
dataframe["Marital Status"] = [marital_repl[item] for item in dataframe["Marital Status"]]

occupation_repl = { " ?" : np.nan, " Tech-support" : 0, " Craft-repair" : 1, " Other-service" : 2, " Sales" : 3, " Exec-managerial" : 4, " Prof-specialty" : 5, " Handlers-cleaners" : 6, " Machine-op-inspct" : 7, " Adm-clerical" : 8, " Farming-fishing" : 9, " Transport-moving" : 10, " Priv-house-serv" : 11, " Protective-serv" : 12, " Armed-Forces" : 13 }
dataframe["Occupation"] = [occupation_repl[item] for item in dataframe["Occupation"]]

relationship_repl = { " ?" : np.nan, " Wife" : 0 , " Own-child" : 1 , " Husband" : 2, " Not-in-family" : 3, " Other-relative" : 4, " Unmarried" : 5 }
dataframe["Relationship"] = [relationship_repl[item] for item in dataframe["Relationship"]]

race_repl = { " ?" : np.nan, " White" : 0, " Asian-Pac-Islander" : 1, " Amer-Indian-Eskimo" : 2, " Other" : 3, " Black" : 4 }
dataframe["Race"] = [race_repl[item] for item in dataframe["Race"]]

sex_repl = { " ?" : np.nan, " Female" : 0, " Male" : 1 }
dataframe["Sex"] = [sex_repl[item] for item in dataframe["Sex"]]

country_repl = { " ?" : np.nan, " United-States" : 0, " Cambodia" : 1, " England" : 2, " Puerto-Rico" : 3, " Canada" : 4, " Germany" : 5, " Outlying-US(Guam-USVI-etc)" : 6, " India" : 7, " Japan" : 8, " Greece" : 9, " South" : 10, " China" : 11, " Cuba" : 12, " Iran" : 13, " Honduras" : 14, " Philippines" : 15, " Italy" : 16, " Poland" : 17, " Jamaica" : 18, " Vietnam" : 19, " Mexico" : 20, " Portugal" : 21, " Ireland" : 22, " France" : 23, " Dominican-Republic" : 24, " Laos" : 25, " Ecuador" : 26, " Taiwan" : 27, " Haiti" : 28, " Columbia" : 29, " Hungary" : 30, " Guatemala" : 31, " Nicaragua" : 32, " Scotland" : 33, " Thailand" : 34, " Yugoslavia" : 35, " El-Salvador" : 36, " Trinadad&Tobago" : 37, " Peru" : 38, " Hong" : 39, " Holand-Netherlands" : 40 }
dataframe["Native Country"] = [country_repl[item] for item in dataframe["Native Country"]]

income_repl = { " ?" : np.nan, " <=50K" : 0, " >50K" : 1 }
dataframe["Income"] = [income_repl[item] for item in dataframe["Income"]]

#First drop NA values
dataframe = dataframe.dropna()
dataframe

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours Per Week,Native Country,Income
0,50,1.0,83311,0,13,0,4.0,2,0,1,0,0,13,0.0,0
1,38,0.0,215646,3,9,1,6.0,3,0,1,0,0,40,0.0,0
2,53,0.0,234721,2,7,0,6.0,2,4,1,0,0,40,0.0,0
3,28,0.0,338409,0,13,0,5.0,0,4,0,0,0,40,12.0,0
4,37,0.0,284582,10,14,0,4.0,0,0,0,0,0,40,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,0.0,257302,5,12,0,0.0,0,0,0,0,0,38,0.0,0
32556,40,0.0,154374,3,9,0,7.0,2,0,1,0,0,40,0.0,1
32557,58,0.0,151910,3,9,4,8.0,5,0,0,0,0,40,0.0,0
32558,22,0.0,201490,3,9,2,8.0,1,0,1,0,0,20,0.0,0


Generate Test/Train split

In [4]:
size = dataframe.values.shape[1] - 1

#attributes not including prediction label
X = dataframe.values[:, :size]
#prediction label
Y = dataframe.values[:, size:]

x_train = X[:20000, :] #20,000/10,161 split
x_test  = X[20000:, :]
y_train = Y[:20000, :]
y_test  = Y[20000:, :]

y_train = y_train.ravel()
y_test  = y_test.ravel()

1. A method to select rows in the dataset where a feature we specify equals a value we pass to the function.
2. A method to compute error given a set of predicted labels and a set of true labels

In [20]:
def split_on_val_eq(X_test, Y_test, column, val):
    ## TYPE ANSWER HERE 
    X0_test = []
    X1_test = []
    Y0_test = []
    Y1_test = []
    
    for ind in range(0, X_test.shape[0]):
        if X_test[ind][column] == val:
            X0_test.append(X_test[ind])
            Y0_test.append(Y_test[ind])
        else:
            X1_test.append(X_test[ind])
            Y1_test.append(Y_test[ind])
    
    return (X0_test, X1_test, Y0_test, Y1_test)

def error(y, y_hat):
    count = 0
    total_instances = len(y)
    
    def check(elem):
        nonlocal count
        cond = elem != y_hat[count]
        count += 1
        return cond
    
    total_error = len(list(filter(check, y)))
    
    return total_error / total_instances

## INPUTS:
# y - true labels
# y_hat - predicted labels
def fp_error(y, y_hat):
    fp_errors = [np.maximum(y_hat[i] - y[i], 0) for i in range(len(y))]
    return np.mean(fp_errors)

## INPUTS:
# y - true labels
# y_hat - predicted labels
def fn_error(y, y_hat):
    fn_errors = [np.maximum(y[i] - y_hat[i], 0) for i in range(len(y))]
    return np.mean(fn_errors)

Split on rows where sex = male/female and compute error

In [29]:
from sklearn import linear_model, tree

#dt = tree.DecisionTreeClassifier(max_depth=5)
logreg = linear_model.LogisticRegression(solver='liblinear')

model = logreg.fit(x_train, y_train)

(femaleX, _, femaleY, _) = split_on_val_eq(x_test, y_test, 9, 1)
(maleX, _, maleY, _) = split_on_val_eq(x_test, y_test, 9, 0)

pred_male = model.predict(maleX)
pred_female = model.predict(femaleX)

err_male = error(maleY, pred_male)
err_female = error(femaleY, pred_female)

print("Total female: " + str(len(femaleX)))
print("Total male: " + str(len(maleX)))

print("Error when sex = male: " + str(err_male))
print("Error when sex = female: " + str(err_female))

Total female: 6870
Total male: 3291
Error when sex = male: 0.11030082041932543
Error when sex = female: 0.2576419213973799


Above shows that there is more than twice the amount of error when predicting income of female individuals. Our dataset is showing an inherent bias toward females?