In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

This data set is from the census(https://archive.ics.uci.edu/ml/datasets/Census+Income) listing attributes(age, workclass, education, marital status, occupation, relationship, race, sex, capital gain, capital loss, hours per week and native country). We want to predict if the person makes over 50k a year given these features

In [2]:
#Importing the test and train data
TrainData = pd.read_csv("adult_data.csv") #read in training csv file
TestData = pd.read_csv("adult_data_test.csv") #read in the testing csv file

Printing the first three instances in the test data frame

In [13]:
TestData.head(50)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income-Level
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


Printing the first three instances in the traindata frame

In [4]:
TrainData.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income-Level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


The test data has an extra period in the income column so we get rid of it before combining

In [5]:
TestData = TestData.replace({' >50K.': ' >50K'})
TestData = TestData.replace({' <=50K.': ' <=50K'})

Since age is semi-continuous variable, we need to bin it due to the lack of data points. We describe the numberical data to find the min-max values. We do the same for the hours per week, capital loss and capital gain

In [10]:
def GetRefinedData(Data):  
    AgeBins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    HoursPerWeekBins = [i*10 for i in range(100)]
    CapGainBins = [-100.+i*1000. for i in range(1000)]
    CapLossBins = [-100.+j*100. for j in range(600)]

    #Binning data
    Data['BinnedAge'] = pd.cut(Data['age'], bins = AgeBins, labels=False)
    Data['BinnedHPW'] = pd.cut(Data['hours-per-week'], bins = HoursPerWeekBins, labels=False)
    Data['BinnedCapGain'] = pd.cut(Data['capital-gain'], bins = CapGainBins, labels=False)
    Data['BinnedCapLoss'] = pd.cut(Data['capital-loss'], bins = CapLossBins, labels=False)

    #Dropping non-binned counterparts
    Data = Data.drop(['age'], axis=1)
    Data = Data.drop(['education'], axis=1) #education already taken into account by education-num
    Data = Data.drop(['hours-per-week'], axis=1) 
    Data = Data.drop(['capital-loss'], axis=1) 
    Data = Data.drop(['capital-gain'], axis=1)
    
    #Dropping the weight column
    Data = Data.drop(['fnlwgt'], axis=1)
    
    #Creating dummy indices. We don't want the network valuing the following in numerical order so we introduce orthogonal
    #one hot encoding
 
    Data = pd.get_dummies(Data, columns = ['sex'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['workclass'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['marital-status'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['occupation'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['race'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['native-country'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['relationship'], drop_first = True)
    Data = pd.get_dummies(Data, columns = ['Income-Level'], drop_first = True)
    
    return Data

#We need to combine the test and train data to make sure each has the same dummy variables since the test and train
#values won't, in general, contain the same information such as country
def CombineTrainTestSplit(Training, Testing):
    NumberOfTraining = Training.size
    Data = pd.concat((Training, Testing), keys=['train','test'])
    RefinedData = GetRefinedData(Data)
    Train=RefinedData.loc['train',:]
    Test=RefinedData.loc['test',:]
    return Train, Test

In [11]:
RefinedTrainData, RefinedTestData = CombineTrainTestSplit(TrainData, TestData)
RefinedTrainData.head(10)

Unnamed: 0,education-num,BinnedAge,BinnedHPW,BinnedCapGain,BinnedCapLoss,sex_ Male,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,Income-Level_ >50K
0,13,2,3,2,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,13,3,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,9,2,3,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
3,7,4,3,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,13,1,3,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5,14,2,3,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
6,5,3,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
7,9,4,4,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
8,14,2,4,14,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,1
9,13,3,3,5,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
