In [1]:
import os

import numpy as np
import pandas as pd

#### Columns
age: continuous. <br>
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. <br>
fnlwgt: continuous. <br>
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. <br>
education-num: continuous. <br>
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. <br>
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. <br>
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. <br>
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. <br>
sex: Female, Male. <br>
capital-gain: continuous. <br>
capital-loss: continuous. <br>
hours-per-week: continuous. <br>
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.


In [2]:
def setColumnNames(df):
    """Label the columns of a raw Adult frame and hand the same frame back.

    The 15 names are assigned positionally, so ``df`` must have exactly 15
    columns (pandas raises ``ValueError`` otherwise). ``df`` is modified in
    place; the return value is that same object, for convenient chaining.
    """
    df.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'income',
    ]
    return df

In [3]:
def getData(path):
    """Load the Adult training and test splits from one directory.

    Parameters
    ----------
    path : str
        Directory that contains ``adult.data`` and ``adult.test``. A trailing
        path separator is optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each with its columns labelled by
        ``setColumnNames``.
    """
    # os.path.join only inserts a separator when one is needed, so both
    # '../Data' and '../Data/' resolve to the same files (plain string
    # concatenation silently produced '../Dataadult.data' for the former).

    # Training data: the file has no header row.
    data = pd.read_csv(os.path.join(path, 'adult.data'), header = None)
    data = setColumnNames(data)

    # Testing data: same layout, except that the first line of the file is
    # skipped (presumably a non-data banner line; verify against the file).
    test = pd.read_csv(os.path.join(path, 'adult.test'), skiprows = 1, header = None)
    test = setColumnNames(test)

    # NOTE: string values are kept exactly as read, including the leading
    # space after each comma; later cells compare against ' ?' accordingly.
    return data, test
    

In [4]:
# Load both splits from the relative '../Data/' directory; getData supplies
# the 'adult.data' / 'adult.test' file names.
trainData, testData = getData(path = '../Data/')

In [5]:
# Preview the first rows of the training split.
trainData.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Preview the first rows of the test split (note the ' ?' placeholders).
testData.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [7]:
# m = number of rows, k = number of columns (the 'income' label is one of them).
m, k = trainData.shape

In [8]:
print("Number of training data points = ", m)
# k - 1: one of the k columns is the 'income' label rather than a feature.
print("Number of features = ", k - 1)

Number of training data points =  32561
Number of features =  14


In [9]:
def countIsNa(df, colList):
    """Print one ``<column> <NaN count>`` line for every name in ``colList``.

    Only real NaN/None entries are counted (``Series.isna``); placeholder
    strings are not. Nothing is returned.
    """
    # Lazy pairs, so each column is still counted and printed in turn.
    naCounts = ((name, df[name].isna().sum()) for name in colList)
    for name, nMissing in naCounts:
        print(name, nMissing)

In [10]:
# All 15 column names, in the order assigned by setColumnNames.
colList = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']
# Report the NaN count of every column (the former stand-alone
# trainData['age'] check was a discarded expression and is covered here).
countIsNa(trainData, colList)

age 0
workclass 0
fnlwgt 0
education 0
educationNum 0
maritalStatus 0
occupation 0
relationship 0
race 0
sex 0
capitalGain 0
capitalLoss 0
hoursPerWeek 0
nativeCountry 0
income 0


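There are no NaN entries in the dataset. Missing values are instead stored as the string `" ?"` (with a leading space), which `isna()` does not detect; they are handled in the next section.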

#### Exploring data - Removing NaN

In [11]:
# Columns that hold string categories (these are the ones one-hot encoded later).
categoricalColList = ['workclass', 'education', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex', 'nativeCountry']

In [12]:
# Rows whose occupation is the ' ?' placeholder (values carry a leading space).
trainData.loc[trainData['occupation'] == " ?"]

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [13]:
# Keep only rows with a known workclass. .copy() makes the result an
# independent frame, so the in-place drops performed later on trainData do
# not operate on a slice of the original frame (SettingWithCopyWarning).
trainData = trainData.loc[trainData['workclass'] != " ?"].copy()

In [14]:
# Row/column count after removing the rows with a ' ?' workclass.
trainData.shape

(30725, 15)

In [15]:
# Remaining workclass levels; ' ?' should no longer be among them.
trainData.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked'],
      dtype=object)

In [16]:
def removeNAN(df, colList, missingMarker = " ?"):
    """Drop rows containing the missing-value marker and return the dropped rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified IN PLACE: every row whose value in one
        of ``colList`` equals ``missingMarker`` is dropped from it.
    colList : list of str
        Columns to scan, processed in order.
    missingMarker : str, optional
        Placeholder that denotes a missing value. Defaults to ``" ?"``
        (with the leading space the values in this notebook carry).

    Returns
    -------
    pandas.DataFrame
        The removed rows, grouped by the column in which the marker was first
        found. Empty (but with ``df``'s columns) when nothing was removed.
    """
    removedParts = []

    for col in colList:
        subset = df[df[col] == missingMarker]
        if not subset.empty:
            removedParts.append(subset)
        # Dropping right away guarantees a row with several markers is
        # collected only once (under the first matching column).
        df.drop(subset.index, inplace = True)

    if not removedParts:
        return pd.DataFrame(columns = df.columns)
    # A single concat replaces the repeated DataFrame.append calls, which
    # were quadratic and no longer exist in pandas 2.0.
    return pd.concat(removedParts)
    

In [17]:
# Drops every remaining ' ?' row from trainData in place. The returned frame
# holds the removed *training* rows, despite the "test" in the variable name.
testDataWithNan = removeNAN(trainData, colList)

In [18]:
# Same clean-up for the test split: testData is modified in place; the
# removed rows are only displayed, not stored.
removeNAN(testData, colList)

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
13,58,?,299831,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,35,United-States,<=50K
22,72,?,132015,7th-8th,4,Divorced,?,Not-in-family,White,Female,0,0,6,United-States,<=50K
35,65,?,191846,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15684,46,Private,31411,11th,7,Married-civ-spouse,Other-service,Wife,White,Female,0,0,40,?,<=50K
15711,24,Private,127159,Some-college,10,Never-married,Other-service,Other-relative,White,Female,0,0,24,?,<=50K
16043,24,State-gov,161783,Bachelors,13,Never-married,Transport-moving,Not-in-family,Black,Male,0,0,40,?,<=50K
16090,34,Private,143776,Masters,14,Never-married,Prof-specialty,Not-in-family,Black,Male,0,0,45,?,>50K


In [19]:
# Level frequencies of every categorical column in the training split, plus
# the number of distinct levels per column and overall.
total = 0
for category in categoricalColList:
    print("\nCategory = ", category)
    levelCounts = trainData[category].value_counts()
    nLevels = levelCounts.count()
    print(levelCounts)
    print("Total categories = ", nLevels)
    total += nLevels
print("total categories = ", total)


Category =  workclass
 Private             22286
 Self-emp-not-inc     2499
 Local-gov            2067
 State-gov            1279
 Self-emp-inc         1074
 Federal-gov           943
 Without-pay            14
Name: workclass, dtype: int64
Total categories =  7

Category =  education
 HS-grad         9840
 Some-college    6678
 Bachelors       5044
 Masters         1627
 Assoc-voc       1307
 11th            1048
 Assoc-acdm      1008
 10th             820
 7th-8th          557
 Prof-school      542
 9th              455
 12th             377
 Doctorate        375
 5th-6th          288
 1st-4th          151
 Preschool         45
Name: education, dtype: int64
Total categories =  16

Category =  maritalStatus
 Married-civ-spouse       14065
 Never-married             9726
 Divorced                  4214
 Separated                  939
 Widowed                    827
 Married-spouse-absent      370
 Married-AF-spouse           21
Name: maritalStatus, dtype: int64
Total categories =  7



In [20]:
# Level frequencies of every categorical column in the test split, plus the
# number of distinct levels per column and overall.
total = 0
for category in categoricalColList:
    print("\nCategory = ", category)
    levelCounts = testData[category].value_counts()
    nLevels = levelCounts.count()
    print(levelCounts)
    print("Total categories = ", nLevels)
    total += nLevels
print("total categories = ", total)


Category =  workclass
 Private             11021
 Self-emp-not-inc     1297
 Local-gov            1033
 State-gov             667
 Self-emp-inc          572
 Federal-gov           463
 Without-pay             7
Name: workclass, dtype: int64
Total categories =  7

Category =  education
 HS-grad         4943
 Some-college    3221
 Bachelors       2526
 Masters          887
 Assoc-voc        652
 11th             571
 Assoc-acdm       499
 10th             403
 7th-8th          266
 Prof-school      243
 9th              221
 12th             200
 Doctorate        169
 5th-6th          161
 1st-4th           71
 Preschool         27
Name: education, dtype: int64
Total categories =  16

Category =  maritalStatus
 Married-civ-spouse       6990
 Never-married            4872
 Divorced                 2083
 Separated                 472
 Widowed                   450
 Married-spouse-absent     182
 Married-AF-spouse          11
Name: maritalStatus, dtype: int64
Total categories =  7

Categor

In [26]:
# Largest capitalGain in the training split.
# NOTE(review): 99999 looks like a cap or sentinel value; confirm before scaling.
np.max(trainData["capitalGain"])


99999

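Holand-Netherlands is missing from the test data (it is the only column reported by the train/test column check further below), so that feature needs to be added to the test set manually.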

#### One hot encoding

In [21]:
# Trial run of one-hot encoding on a single column: 'sex' becomes one
# indicator column per level, named 'Is_<level>'.
df = trainData
y = pd.get_dummies(df['sex'], prefix='Is')
print(df.head())
y.shape

   age          workclass  fnlwgt   education  educationNum  \
0   39          State-gov   77516   Bachelors            13   
1   50   Self-emp-not-inc   83311   Bachelors            13   
2   38            Private  215646     HS-grad             9   
3   53            Private  234721        11th             7   
4   28            Private  338409   Bachelors            13   

         maritalStatus          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capitalGain  capitalLoss  hoursPerWeek   nativeCountry  income  
0         2174            0            40   United-States   <=50K  
1         

(30162, 2)

In [22]:
# Row-wise stacking of the indicator columns under the data: this is NOT
# what we want (each half is padded with NaN), it only illustrates why a
# column-wise concat is needed. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0; the result is the same.
pd.concat([df, y])

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income,Is_ Female,Is_ Male
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,,
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,,
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,,
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,,
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,,,,,,,,,,,,,,,,1.0,0.0
32557,,,,,,,,,,,,,,,,0.0,1.0
32558,,,,,,,,,,,,,,,,1.0,0.0
32559,,,,,,,,,,,,,,,,0.0,1.0


In [23]:
# Column-wise concatenation (axis=1) aligns on the index, so every row keeps
# its data and gains its two 'sex' indicator columns.
pd.concat([df, y], axis=1)

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,income,Is_ Female,Is_ Male
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,1,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,1,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0,1


In [24]:
def oneHotEncoding(df, columns = None):
    """Replace categorical columns by one indicator column per level.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of str, optional
        Columns to encode, processed in order. Defaults to the notebook-level
        ``categoricalColList``.

    Returns
    -------
    pandas.DataFrame
        New frame in which each encoded column is removed and its indicator
        columns, named ``Is_<column>_<level>``, are appended on the right.
        Only levels present in ``df`` get a column, so two frames encoded
        separately may end up with different column sets.
    """
    if columns is None:
        columns = categoricalColList

    encoded = df
    for column in columns:
        dummies = pd.get_dummies(encoded[column], prefix='Is_' + column)
        # drop() returns a new frame, so the caller's frame is never touched.
        encoded = pd.concat([encoded.drop(column, axis = 1), dummies], axis = 1)
    return encoded

In [25]:
# Encode both splits. Each split is encoded on its own, so the resulting
# column sets can differ when a level occurs in only one of them.
# NOTE: not re-runnable as-is; after this cell the categorical columns no
# longer exist in trainData/testData, so a second run would fail to find them.
trainData = oneHotEncoding(trainData)
testData = oneHotEncoding(testData)
testData.shape

(15060, 104)

In [26]:
# Report the columns that exist only in the training split ...
for col in trainData.columns:
    if col not in testData.columns:
        print(col)
# ... then give testData exactly the training columns, in the training
# ORDER, filling the missing ones with 0. Merely appending a missing column
# at the end would leave the two splits with different column positions.
# (Any column present only in testData would be dropped by this step.)
testData = testData.reindex(columns = trainData.columns, fill_value = 0)
testData.shape

Is_nativeCountry_ Holand-Netherlands


(15060, 105)

In [27]:
# Encoded training split: numeric columns and 'income' first, indicators after.
trainData.head()

Unnamed: 0,age,fnlwgt,educationNum,capitalGain,capitalLoss,hoursPerWeek,income,Is_workclass_ Federal-gov,Is_workclass_ Local-gov,Is_workclass_ Private,...,Is_nativeCountry_ Portugal,Is_nativeCountry_ Puerto-Rico,Is_nativeCountry_ Scotland,Is_nativeCountry_ South,Is_nativeCountry_ Taiwan,Is_nativeCountry_ Thailand,Is_nativeCountry_ Trinadad&Tobago,Is_nativeCountry_ United-States,Is_nativeCountry_ Vietnam,Is_nativeCountry_ Yugoslavia
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Separate the 'income' label from the feature columns in both splits.
xTrain = trainData.drop(columns = "income")
yTrain = trainData['income']
xTest = testData.drop(columns = "income")
yTest = testData['income']

In [29]:
# Feature matrix of the training split (no 'income' column).
xTrain.head()

Unnamed: 0,age,fnlwgt,educationNum,capitalGain,capitalLoss,hoursPerWeek,Is_workclass_ Federal-gov,Is_workclass_ Local-gov,Is_workclass_ Private,Is_workclass_ Self-emp-inc,...,Is_nativeCountry_ Portugal,Is_nativeCountry_ Puerto-Rico,Is_nativeCountry_ Scotland,Is_nativeCountry_ South,Is_nativeCountry_ Taiwan,Is_nativeCountry_ Thailand,Is_nativeCountry_ Trinadad&Tobago,Is_nativeCountry_ United-States,Is_nativeCountry_ Vietnam,Is_nativeCountry_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# Everything in colList that is not categorical ...
continuousColList = [name for name in colList if name not in categoricalColList]
# ... minus the 'income' label, which is a target, not a continuous feature.
continuousColList.remove('income')
continuousColList

['age', 'fnlwgt', 'educationNum', 'capitalGain', 'capitalLoss', 'hoursPerWeek']

In [108]:
# Simulate missing features. For every training row, between 0 and
# nMissingMax of its features are hidden: the values are set to 0 and the
# matching entries of isFeatureReal are set to 0 (1 = value is real).
# Hiding a categorical feature hides its whole group of indicator columns.
#
# NOTE: df is an alias of xTrain, so xTrain itself is masked in place.
# NOTE: the random draws are unseeded, so every run masks different cells.
df = xTrain
m, k = df.shape
nMissingMax = 3

# Not read anywhere in this cell; kept in case a later cell refers to it.
featureMissing = [[1] * 14]* m
isFeatureReal = pd.DataFrame(1, columns = df.columns, index = df.index)

for idx in range(m):
    # Work on a copy of the row and write it back once at the end.
    row = df.iloc[idx].copy()

    # Candidates for masking: every non-zero entry (i.e. the active indicator
    # of each categorical group and the non-zero continuous values), plus the
    # continuous columns whose value happens to be 0.
    candidates = list(row[row != 0].index)
    candidates += [column for column in continuousColList if column not in candidates]

    nMissing = np.random.randint(0, nMissingMax + 1)
    x = np.random.choice(np.array(candidates, dtype = object), nMissing, replace = False)

    for col in x:
        if col in continuousColList:
            maskedCols = [col]
        else:
            # Indicator columns are named 'Is_<category>_<level>'. Match on
            # the exact 'Is_<category>_' prefix: a plain substring test on
            # the category name would also hit the continuous column
            # 'educationNum' whenever an 'education' indicator is picked.
            groupPrefix = 'Is_' + col.split('_')[1] + '_'
            maskedCols = [c for c in df.columns if c.startswith(groupPrefix)]

        # One positional write instead of chained isFeatureReal[c].iloc[idx],
        # which may assign to a temporary copy.
        isFeatureReal.iloc[idx, df.columns.get_indexer(maskedCols)] = 0
        row[maskedCols] = 0

    df.iloc[idx] = row

# From here on df and isFeatureReal are plain NumPy arrays.
df = df.to_numpy()
print(df)
print(df.shape)

isFeatureReal = isFeatureReal.to_numpy()
print(isFeatureReal)

print(isFeatureReal.shape)

print("Total missing = ", np.sum(isFeatureReal==0))


[[     0      0      0 ...      0      0      0]
 [     0      0      0 ...      0      0      0]
 [     0      0      0 ...      1      0      0]
 ...
 [    58      0      9 ...      1      0      0]
 [    22 201490      0 ...      1      0      0]
 [     0 287927      9 ...      1      0      0]]
(30162, 104)
[[0 0 1 ... 1 1 1]
 [0 0 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 0 1 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [0 1 1 ... 1 1 1]]
(30162, 104)
Total missing =  338814


In [30]:
# trainData still holds the unmasked values: the masking loop wrote into
# xTrain, which is a separate frame.
trainData.head()

Unnamed: 0,age,fnlwgt,educationNum,capitalGain,capitalLoss,hoursPerWeek,income,Is_workclass_ Federal-gov,Is_workclass_ Local-gov,Is_workclass_ Private,...,Is_nativeCountry_ Portugal,Is_nativeCountry_ Puerto-Rico,Is_nativeCountry_ Scotland,Is_nativeCountry_ South,Is_nativeCountry_ Taiwan,Is_nativeCountry_ Thailand,Is_nativeCountry_ Trinadad&Tobago,Is_nativeCountry_ United-States,Is_nativeCountry_ Vietnam,Is_nativeCountry_ Yugoslavia
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [54]:
import re

# For every categorical feature, collect the positions of its indicator
# columns ('Is_<category>_<level>') within trainData.columns.
# NOTE: the positions are relative to trainData, which still contains the
# 'income' column; they are therefore shifted by one with respect to xTrain.
categoryGroupingIdx = []
trainColumns = list(trainData.columns)
for categoryColumn in categoricalColList:
    print(categoryColumn)
    groupPattern = re.compile("Is_.*" + categoryColumn)
    groupMatches = [
        (position, name)
        for position, name in enumerate(trainColumns)
        if groupPattern.match(name)
    ]
    for position, name in groupMatches:
        print(name, position)
    categoryGroupingIdx.append([position for position, _ in groupMatches])

workclass
Is_workclass_ Federal-gov 7
Is_workclass_ Local-gov 8
Is_workclass_ Private 9
Is_workclass_ Self-emp-inc 10
Is_workclass_ Self-emp-not-inc 11
Is_workclass_ State-gov 12
Is_workclass_ Without-pay 13
education
Is_education_ 10th 14
Is_education_ 11th 15
Is_education_ 12th 16
Is_education_ 1st-4th 17
Is_education_ 5th-6th 18
Is_education_ 7th-8th 19
Is_education_ 9th 20
Is_education_ Assoc-acdm 21
Is_education_ Assoc-voc 22
Is_education_ Bachelors 23
Is_education_ Doctorate 24
Is_education_ HS-grad 25
Is_education_ Masters 26
Is_education_ Preschool 27
Is_education_ Prof-school 28
Is_education_ Some-college 29
maritalStatus
Is_maritalStatus_ Divorced 30
Is_maritalStatus_ Married-AF-spouse 31
Is_maritalStatus_ Married-civ-spouse 32
Is_maritalStatus_ Married-spouse-absent 33
Is_maritalStatus_ Never-married 34
Is_maritalStatus_ Separated 35
Is_maritalStatus_ Widowed 36
occupation
Is_occupation_ Adm-clerical 37
Is_occupation_ Armed-Forces 38
Is_occupation_ Craft-repair 39
Is_occupat

In [55]:
# One list of trainData column positions per categorical feature, in
# categoricalColList order.
categoryGroupingIdx

[[7, 8, 9, 10, 11, 12, 13],
 [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 [30, 31, 32, 33, 34, 35, 36],
 [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
 [51, 52, 53, 54, 55, 56],
 [57, 58, 59, 60, 61],
 [62, 63],
 [64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104]]

In [42]:
# Full column order of the encoded trainData ('income' sits at position 6).
list(trainData.columns)

['age',
 'fnlwgt',
 'educationNum',
 'capitalGain',
 'capitalLoss',
 'hoursPerWeek',
 'income',
 'Is_workclass_ Federal-gov',
 'Is_workclass_ Local-gov',
 'Is_workclass_ Private',
 'Is_workclass_ Self-emp-inc',
 'Is_workclass_ Self-emp-not-inc',
 'Is_workclass_ State-gov',
 'Is_workclass_ Without-pay',
 'Is_education_ 10th',
 'Is_education_ 11th',
 'Is_education_ 12th',
 'Is_education_ 1st-4th',
 'Is_education_ 5th-6th',
 'Is_education_ 7th-8th',
 'Is_education_ 9th',
 'Is_education_ Assoc-acdm',
 'Is_education_ Assoc-voc',
 'Is_education_ Bachelors',
 'Is_education_ Doctorate',
 'Is_education_ HS-grad',
 'Is_education_ Masters',
 'Is_education_ Preschool',
 'Is_education_ Prof-school',
 'Is_education_ Some-college',
 'Is_maritalStatus_ Divorced',
 'Is_maritalStatus_ Married-AF-spouse',
 'Is_maritalStatus_ Married-civ-spouse',
 'Is_maritalStatus_ Married-spouse-absent',
 'Is_maritalStatus_ Never-married',
 'Is_maritalStatus_ Separated',
 'Is_maritalStatus_ Widowed',
 'Is_occupation_ Ad