# In this workbook we use the Adult dataset. The task is to predict whether a worker's income is over \$50,000 (`>50K`) or at most \$50,000 (`<=50K`)

The question of how best to represent your data for a particular application is known as feature engineering.

Load data

In [32]:
import pandas as pd
# adult.data ships without a header row, so the column labels are listed
# here and handed to read_csv through "names" (together with header=None).
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
adult_path = "C://data//adult.data"
# index_col=False keeps pandas from treating the first column as the index.
# NOTE: string fields come back with a leading space (e.g. ' Male' in the
# value counts further down), which is why the dummy columns created later
# are named like 'gender_ Male'.
data = pd.read_csv(adult_path, header=None, index_col=False,
                   names=adult_columns)
# Preview the first three rows.
data.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


Reduce the data to a few columns

In [22]:
# Narrow the frame to six predictor columns plus the "income" label.
kept_columns = ['age', 'workclass', 'education', 'gender', 'hours-per-week',
                'occupation', 'income']
data = data[kept_columns]
# Preview the first three rows of the reduced frame.
data.head(3)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K


Validate the data: check the per-column counts and the gender distribution


In [23]:
# Number of non-null entries in each column. In the recorded output every
# column reports the same total, i.e. there are no NaN cells.
# NOTE(review): entries recorded as ' ?' (see the 'workclass_ ?' dummy column
# further down) are ordinary strings and are counted here too -- presumably
# they mark missing values; confirm before relying on this check.
data.count()

age               32561
workclass         32561
education         32561
gender            32561
hours-per-week    32561
occupation        32561
income            32561
dtype: int64

In [24]:
# Tally how often each distinct gender label occurs.
data['gender'].value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

# Apply one-hot encoding (binarize) to the data using the pandas `get_dummies` function

In [25]:
# Column labels before encoding, as a plain Python list.
data.columns.tolist()

['age',
 'workclass',
 'education',
 'gender',
 'hours-per-week',
 'occupation',
 'income']

In [26]:
# One-hot encode every string column (workclass, education, gender,
# occupation, income); the numeric columns age and hours-per-week pass
# through unchanged.
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every
# pandas version: pandas >= 2.0 defaults to bool, which would turn the mixed
# feature matrix later taken from this frame into an object array and no
# longer match the 0/1 values shown in the outputs.
data_dummies = pd.get_dummies(data, dtype="uint8")

In [27]:
# Column labels after encoding: one "<column>_<value>" label per category.
data_dummies.columns.tolist()

['age',
 'hours-per-week',
 'workclass_ ?',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Never-worked',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'workclass_ Without-pay',
 'education_ 10th',
 'education_ 11th',
 'education_ 12th',
 'education_ 1st-4th',
 'education_ 5th-6th',
 'education_ 7th-8th',
 'education_ 9th',
 'education_ Assoc-acdm',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Preschool',
 'education_ Prof-school',
 'education_ Some-college',
 'gender_ Female',
 'gender_ Male',
 'occupation_ ?',
 'occupation_ Adm-clerical',
 'occupation_ Armed-Forces',
 'occupation_ Craft-repair',
 'occupation_ Exec-managerial',
 'occupation_ Farming-fishing',
 'occupation_ Handlers-cleaners',
 'occupation_ Machine-op-inspct',
 'occupation_ Other-service',
 'occupation_ Priv-house-serv',
 'occupation_ Prof-specialty',


In [28]:
# First three rows of the encoded frame: each original string value now has
# its own 0/1 indicator column.
data_dummies.head(3)

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Extracting actual feature data and label data

For the X value we extract only the columns containing features, i.e. all columns
from `age` to `occupation_ Transport-moving`. This range contains all the features but
not the target.

For the y value we extract the `income_ >50K` column, which is the target to be predicted.

In [29]:
# Label-based slice of every feature column. .loc includes both endpoints,
# so this runs from 'age' through 'occupation_ Transport-moving' and stops
# short of the two income_* columns that follow it.
# (.ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement and slices the same columns here.)
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# Extract NumPy arrays
X = features.values
# Binary target: 1 where the income label is ' >50K', 0 otherwise.
y = data_dummies['income_ >50K'].values
print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

X.shape: (32561, 44) y.shape: (32561,)


# Splitting training and testing data


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is the same on every run.
split_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=split_seed)
# Unfitted classifier with library-default settings; it is trained in a
# later cell.
logreg = LogisticRegression()


# Measure accuracy of the Classifier

In [36]:
# Held-out 0/1 labels that the classifier is scored against.
y_test

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [31]:
# Train on the training split, then score the fitted model on the held-out
# split.
logreg.fit(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
test_accuracy

0.80874585431765145

In [40]:
# Held-out feature matrix; the display is abbreviated ("...") because the
# array is large.
X_test

array([[27, 44,  0, ...,  0,  0,  0],
       [27, 40,  0, ...,  0,  0,  0],
       [25, 40,  0, ...,  1,  0,  0],
       ..., 
       [38, 40,  0, ...,  0,  0,  1],
       [63, 50,  0, ...,  0,  0,  0],
       [47, 38,  0, ...,  0,  0,  0]], dtype=int64)

In [38]:
# Predicted class (0 or 1) for each row of the held-out feature matrix.
logreg.predict(X_test)

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)