In [3]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects

In [4]:
# Read the Adult income CSV straight from GitHub.
# skipinitialspace=True tells pandas to skip the spaces that follow each delimiter.
df = pd.read_csv(
    'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

# Every column except the target is a model input.
x_cols = [name for name in df.columns if name != 'income']
X = df[x_cols]

# 'income' is the label to predict.
y = df['income']

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Summarize column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Count missing values per column.
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [7]:
# Distinct workclass values (nan in the result marks missing entries).
df.workclass.unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [8]:
# Distinct education values.
df.education.unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [18]:
# Give the two hyphenated columns identifier-friendly names so they can be read as df.<name>.
# X was extracted from df in an earlier cell and keeps the original hyphenated labels.
attribute_safe_names = {'marital-status': 'maritalstatus', 'native-country': 'nativecountry'}
df = df.rename(columns=attribute_safe_names)

# Distinct marital-status values.
df.maritalstatus.unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [10]:
# Distinct occupation values (nan in the result marks missing entries).
df.occupation.unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', nan, 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [11]:
# Distinct relationship values.
df.relationship.unique()

array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [12]:
# Distinct race values.
df.race.unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

In [13]:
# Distinct sex values.
df.sex.unique()

array(['Male', 'Female'], dtype=object)

In [19]:
# Distinct native-country values (nan marks missing entries).
# Relies on the column having been renamed to 'nativecountry' in an earlier cell.
df.nativecountry.unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', nan, 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [15]:
# Distinct values of the target column.
df.income.unique()

array(['<=50K', '>50K'], dtype=object)

## Conjuntos de entrenamiento y prueba (30 %)

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [21]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
29700,37,Private,34146,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,68,United-States
1529,37,Private,26898,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,0,0,12,United-States
27477,26,Private,190762,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,18,United-States
31950,35,Private,189092,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
4732,23,Private,260019,7th-8th,4,Never-married,Farming-fishing,Unmarried,Other,Male,0,0,36,Mexico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27852,24,Private,223367,11th,7,Never-married,Craft-repair,Unmarried,White,Male,0,0,40,United-States
23605,20,Private,127185,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,15,United-States
1318,41,State-gov,144928,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States
25299,40,Private,199303,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States


In [23]:
# Inspect the training labels produced by the split.
y_train

29700    <=50K
1529     <=50K
27477    <=50K
31950     >50K
4732     <=50K
         ...  
27852    <=50K
23605    <=50K
1318      >50K
25299    <=50K
27439     >50K
Name: income, Length: 22792, dtype: object

In [27]:
# Inspect the held-out test features.
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
1055,32,Private,87643,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
26305,27,Private,207352,Bachelors,13,Married-civ-spouse,Tech-support,Husband,Asian-Pac-Islander,Male,0,0,40,India
9568,73,State-gov,74040,7th-8th,4,Divorced,Other-service,Not-in-family,Asian-Pac-Islander,Female,0,0,40,United-States
15071,39,Private,174924,HS-grad,9,Separated,Exec-managerial,Not-in-family,White,Male,14344,0,40,United-States
2012,31,Private,264936,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19433,59,,367984,Bachelors,13,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States
12825,26,Private,144483,Assoc-voc,11,Divorced,Sales,Own-child,White,Female,594,0,35,United-States
23420,55,Private,98361,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,55,United-States
6259,24,Local-gov,150084,Some-college,10,Separated,Protective-serv,Not-in-family,White,Male,0,0,60,United-States


In [28]:
# Inspect the held-out test labels.
y_test

1055     <=50K
26305     >50K
9568     <=50K
15071     >50K
2012     <=50K
         ...  
19433    <=50K
12825    <=50K
23420     >50K
6259     <=50K
1439      >50K
Name: income, Length: 9769, dtype: object

## Preprocesamiento

In [29]:
# Impute gaps with the most frequent training value of each column.
# mode() can return several rows when a column has ties; row 0 is the one used here.
first_mode_row = X_train.mode().iloc[0]
train_mode = dict(first_mode_row)

# Apply the per-column fill values to the training features.
X_train = X_train.fillna(value=train_mode)

# Show the fill values that were applied.
print(train_mode)

{'age': 31.0, 'workclass': 'Private', 'fnlwgt': 121124, 'education': 'HS-grad', 'education-num': 9.0, 'marital-status': 'Married-civ-spouse', 'occupation': 'Prof-specialty', 'relationship': 'Husband', 'race': 'White', 'sex': 'Male', 'capital-gain': 0.0, 'capital-loss': 0.0, 'hours-per-week': 40.0, 'native-country': 'United-States'}


In [30]:
# convert categoricals
# Each categorical column gets its own LabelEncoder, fitted on the training data.
# The fitted encoders are kept in a dict keyed by column name.
categorical_columns = ['workclass', 'education', 'marital-status',
                       'occupation', 'relationship', 'race',
                       'sex', 'native-country']

# Guard against re-running this cell: X_train is overwritten below, so a second run
# would fit the encoders on integer codes and silently replace the string-to-code
# mappings stored in `encoders`.
already_encoded = [name for name in categorical_columns
                   if pd.api.types.is_numeric_dtype(X_train[name])]
if already_encoded:
    raise RuntimeError(
        f"Columns already encoded: {already_encoded}. "
        "Re-create X_train from the split and fill cells before running this cell again."
    )

encoders = {}
for column in categorical_columns:
    categorical_convert = LabelEncoder()
    # Replace the string labels with integer codes in place.
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

In [32]:
# Check dtypes and non-null counts of the training features after imputation and encoding.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 29700 to 27439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             22792 non-null  int64
 1   workclass       22792 non-null  int32
 2   fnlwgt          22792 non-null  int64
 3   education       22792 non-null  int32
 4   education-num   22792 non-null  int64
 5   marital-status  22792 non-null  int32
 6   occupation      22792 non-null  int32
 7   relationship    22792 non-null  int32
 8   race            22792 non-null  int32
 9   sex             22792 non-null  int32
 10  capital-gain    22792 non-null  int64
 11  capital-loss    22792 non-null  int64
 12  hours-per-week  22792 non-null  int64
 13  native-country  22792 non-null  int32
dtypes: int32(8), int64(6)
memory usage: 1.9 MB


In [33]:
# Check dtypes and non-null counts of the test features.
# NOTE(review): no cell above imputes or encodes X_test, so it still holds raw strings and gaps.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 1055 to 1439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9199 non-null   object
 2   fnlwgt          9769 non-null   int64 
 3   education       9769 non-null   object
 4   education-num   9769 non-null   int64 
 5   marital-status  9769 non-null   object
 6   occupation      9196 non-null   object
 7   relationship    9769 non-null   object
 8   race            9769 non-null   object
 9   sex             9769 non-null   object
 10  capital-gain    9769 non-null   int64 
 11  capital-loss    9769 non-null   int64 
 12  hours-per-week  9769 non-null   int64 
 13  native-country  9588 non-null   object
dtypes: int64(6), object(8)
memory usage: 1.1+ MB


## Entrenar algoritmos Random Forest (clasificador de bosque aleatorio) y Extra Trees

In [34]:
# train the Random Forest algorithm
# A fixed random_state makes the fitted forest repeatable across kernel restarts;
# without it every run produces a different model. 1234 is the same seed value
# passed to train_test_split.
rf = RandomForestClassifier(n_estimators = 100, random_state=1234)
rf = rf.fit(X_train, y_train)

In [35]:
# train the Extra Trees algorithm
# A fixed random_state makes the fitted ensemble repeatable across kernel restarts;
# without it every run produces a different model. 1234 is the same seed value
# passed to train_test_split.
et = ExtraTreesClassifier(n_estimators = 100, random_state=1234)
et = et.fit(X_train, y_train)

In [36]:
# save preprocessing objects and both trained models (Random Forest and Extra Trees)
# train_mode holds the per-column fill values; encoders holds the fitted LabelEncoders
# keyed by column name.
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
# joblib.dump returns the list of written filenames; this last call's value is the cell output.
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']