In [3]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
import joblib 
import json

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv', skipinitialspace=True)
x_cols = [c for c in df.columns if c!= "income"]
X = df[x_cols]
y = df["income"]

In [8]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

In [11]:
print("X_train.shape = {}, X_test.shape = {}, y_train.shape = {}, y_test.shape = {}".format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))

X_train.shape = (22792, 14), X_test.shape = (9769, 14), y_train.shape = (22792,), y_test.shape = (9769,)


In [15]:
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
train_mode 

{'age': 36,
 'workclass': 'Private',
 'fnlwgt': 164190,
 'education': 'HS-grad',
 'education-num': 9,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Prof-specialty',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'United-States'}

In [17]:
encoders = {}
for column in ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert 
    


In [19]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
et = ExtraTreesClassifier(n_estimators=100)
et.fit(X_train, y_train)


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [20]:
# save preprocessing objects and ML algorithms 
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./randomforest.joblib", compress=True)
joblib.dump(et, "extratreesmodel.joblib", compress=True)

['extratreesmodel.joblib']