# Adult Income Data Set:

In [22]:
import json # for preprocessing
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for Training Model
from sklearn.ensemble import ExtraTreesClassifier # For training model
from sklearn.metrics import accuracy_score
import joblib # for saving trained model


## Loading Dataset:

In [8]:
# Download the Adult Income CSV; skipinitialspace drops the blank that follows each delimiter
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

In [9]:
# Every column except the 'income' label is used as a feature
x_cols = df.columns[df.columns != 'income'].tolist()

# Separate the target column from the input matrix
y = df['income']
X = df[x_cols]

# Preview the first rows of the full frame
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=420,
)

## Data PreProcessing:

In [11]:
# Fill missing data up front (per the original note, sklearn's RandomForest can't handle NaNs)
mode_rows = X_train.mode()  # one column per feature; row 0 holds its first mode
train_mode = dict(mode_rows.iloc[0])  # {column name: most frequent training value}
X_train = X_train.fillna(value=train_mode)  # replace NaNs column by column with that mode
train_mode

{'age': 36,
 'workclass': 'Private',
 'fnlwgt': 123011,
 'education': 'HS-grad',
 'education-num': 9,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Craft-repair',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'United-States'}

In [12]:
# Convert categorical features (e.g. workclass: 'Private') to integer codes,
# keeping the fitted encoder of every column in `encoders`
encoders = {}
for col in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']:
    # LabelEncoder maps the distinct values of a column to 0 .. n_classes-1
    encoders[col] = LabelEncoder().fit(X_train[col])
    X_train[col] = encoders[col].transform(X_train[col])

## Training the Algorithms:

In [13]:
%%time
# train the random forest model
rf = RandomForestClassifier(n_estimators=1000) # n_estimators is number of trees
rf = rf.fit(X_train, y_train)

Wall time: 27.5 s


In [14]:
%%time
# train the extra tree model
et = ExtraTreesClassifier(n_estimators=1000)
et = et.fit(X_train, y_train)

Wall time: 24.5 s


In [15]:
%%time
# Save Preprocessing and trained model artifacts
joblib.dump(train_mode, './train_mode.joblib', compress=True)
joblib.dump(encoders, './encoders.joblib', compress=True)
joblib.dump(rf, './random_forest.joblib', compress=True)
joblib.dump(et, './extra_trees.joblib', compress=True)


Wall time: 26 s


['./extra_trees.joblib']

## Prediction:

In [18]:
print(X_test.shape)
print(y_test.shape)

# Impute with the modes learned from the TRAINING split (`train_mode`), not with
# statistics of the test split: the held-out data must go through exactly the
# preprocessing that was fitted on the training data, otherwise test information
# leaks into the pipeline and the evaluation no longer matches inference.
X_test = X_test.fillna(train_mode)

# Apply the encoders fitted on the training split. LabelEncoder.transform raises
# ValueError if the test split contains a category that never occurred in training.
for column, encoder in encoders.items():
    X_test[column] = encoder.transform(X_test[column])

# NOTE: X_test now holds the encoded features, so run this cell once per kernel
# session; re-running would pass already-encoded integers to the encoders.
pred = rf.predict(X_test)

(6513, 14)
(6513,)


In [21]:
# Fraction of held-out rows whose predicted income bracket equals the true label
accuracy_score(y_true=y_test, y_pred=pred)

0.8550591125441425