In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV

# 1. Análise de dados

In [2]:
# Load the dataset from the CSV file
df = pd.read_csv("../data/adult.csv")

In [3]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
df.shape

(48842, 15)

In [5]:
# workclass has missing values (encoded as '?'); we still need to decide how to handle them.
# One option is to let one-hot encoding treat '?' as its own category.
df['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [6]:
# education has no missing values
df['education'].value_counts()

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64

In [7]:
# marital-status has no missing values
df['marital-status'].value_counts()

Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: marital-status, dtype: int64

In [8]:
# occupation has missing values (encoded as "?")
df['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [9]:
# relationship has no missing values
df['relationship'].value_counts()

Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: relationship, dtype: int64

In [10]:
# race has no missing values
df['race'].value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

In [11]:
# Counts per income class
df['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [12]:
# Years of education
df['educational-num'].value_counts()

9     15784
10    10878
13     8025
14     2657
11     2061
7      1812
12     1601
6      1389
4       955
15      834
5       756
8       657
16      594
3       509
2       247
1        83
Name: educational-num, dtype: int64

In [13]:
# Missing values are represented as "?" in this df,
# so counting the "?" entries shows how many missing values each column has
df[df == '?'].count()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [14]:
# Number of rows and columns of the dataset
df.shape

(48842, 15)

In [15]:
# Histograms give a quick exploratory view of the numeric columns
# (the trailing ';' suppresses the repr of the returned axes)
df.hist(bins=50, figsize=(20, 20));

# 2. Limpeza dos dados

## 2.1 Transformação get_dummies das variáveis categóricas

In [16]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [17]:
# Columns that will be treated as categorical and one-hot encoded with get_dummies.
# NOTE(review): 'educational-num' has the same value counts as 'education' above, so it
# looks like a numeric code for the same information; encoding both is probably
# redundant - confirm before changing the feature set.
cat_cols = ['workclass', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 
            'race', 'gender', 'native-country', 'income']

In [18]:
def transformacao_get_dummies(df=df, cat_cols=cat_cols):
    '''
    One-hot encode the categorical columns of ``df`` with ``pd.get_dummies``.

    Every column listed in ``cat_cols`` is replaced by indicator columns named
    ``<column>__<value>`` (double underscore). The ``income__<=50K`` indicator
    is then dropped, because the same information is already carried by
    ``income__>50K``.

    Note: the defaults are bound to the notebook-level ``df`` and ``cat_cols``
    as they exist when this cell is executed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is never modified; a new frame is returned.
    cat_cols : list-like of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns followed by the indicator columns, in the
        order given by ``cat_cols``.

    Raises
    ------
    KeyError
        If a name in ``cat_cols`` is not a column of ``df``, or if the encoded
        frame has no ``income__<=50K`` column.
    '''
    # A single get_dummies call encodes all columns at once instead of
    # re-copying the whole frame once per column. prefix_sep="__" reproduces
    # the '<column>__<value>' naming used throughout the notebook.
    encoded = pd.get_dummies(df, columns=list(cat_cols), dummy_na=False, prefix_sep="__")
    # drop() returns a new frame, so the caller's frame is never mutated in place.
    return encoded.drop(columns=['income__<=50K'])

In [19]:
df = transformacao_get_dummies(df=df, cat_cols=cat_cols)

In [20]:
# After the get_dummies transformation the number of columns grows to 124
df.shape

(48842, 124)

In [21]:
df.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass__?,workclass__Federal-gov,workclass__Local-gov,workclass__Never-worked,workclass__Private,...,native-country__Puerto-Rico,native-country__Scotland,native-country__South,native-country__Taiwan,native-country__Thailand,native-country__Trinadad&Tobago,native-country__United-States,native-country__Vietnam,native-country__Yugoslavia,income__>50K
0,25,226802,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,38,89814,0,0,50,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,28,336951,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,44,160323,7688,0,40,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,18,103497,0,0,30,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# 3. Modelos de predição

### Random forest classifier

In [22]:
# Split the encoded frame into the target y and the feature matrix X.
# The target column is dropped by name rather than by position, so the target
# cannot leak into X if the column order of df ever changes.
y = df['income__>50K'].astype('category')
X = df.drop(columns=['income__>50K'])

In [23]:
# Split the data into training and validation sets (shuffled, with a fixed seed for reproducibility)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0, shuffle=True)

In [24]:
# Baseline random forest classifier with default hyperparameters.
# random_state is fixed (same seed as the train/validation split) so the fitted
# forest, and every metric computed from it, is reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(train_X, train_y)

RandomForestClassifier()

In [25]:
# Predictions of the baseline random forest on the validation set
pred_rf = rf.predict(val_X)
pred_rf

array([0, 1, 0, ..., 1, 1, 0], dtype=uint64)

In [26]:
# Accuracy of the baseline random forest on the validation set
accuracy_rf = accuracy_score(val_y, pred_rf)
print('Accuracy do modelo random forest: ', accuracy_rf)

Accuracy do modelo random forest:  0.8537384325608058


In [27]:
# Per-class metrics of the baseline (untuned) random forest on the validation set
print(classification_report(val_y,pred_rf))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      9273
           1       0.73      0.63      0.67      2938

    accuracy                           0.85     12211
   macro avg       0.81      0.78      0.79     12211
weighted avg       0.85      0.85      0.85     12211



In [None]:
# Cross-validation scores (5 folds) of the random forest on the full dataset
scores_rf = cross_val_score(rf, X, y, cv=5)
scores_rf

In [None]:
# Mean and standard deviation of the random forest cross-validation scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_rf.mean(), scores_rf.std()))

### Random forest tuned

In [None]:
# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 4)]

# Number of features to consider at every split.
# 'auto' is intentionally not used: for classifiers it was only an alias of
# 'sqrt' (so it duplicated a candidate) and recent scikit-learn releases no
# longer accept it.
max_features = ['sqrt', 'log2']

# Maximum number of levels in each tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 4)]
max_depth.append(None)

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid that the randomized search samples from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; it is seeded so that the search is reproducible
# (the search itself is already seeded below).
rf_t = RandomForestClassifier(random_state=42)
# Randomized search: sample 10 parameter combinations (n_iter) from random_grid,
# evaluate each with 3-fold cross validation, and use all available cores.
rf_tuned = RandomizedSearchCV(estimator = rf_t, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=10, random_state=42, n_jobs = -1)
# Fit the random search model
rf_tuned.fit(train_X, train_y)

In [None]:
# Predictions of the tuned random forest on the validation set
tuned_preds = rf_tuned.predict(val_X)

In [None]:
# Accuracy of the tuned random forest on the validation set
accuracy_tuned = accuracy_score(val_y, tuned_preds)
print('Accuracy do modelo random forest com grid search: ', accuracy_tuned)

In [None]:
# Per-class metrics of the tuned random forest on the validation set
print(classification_report(val_y,tuned_preds))

In [None]:
# Hyperparameters selected by the randomized search
rf_tuned.best_params_

### Logistic regression classifier 

In [None]:
# Logistic regression preceded by feature standardization, combined in one pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(train_X, train_y)  # the scaler is fitted on the training data only

In [None]:
pipe.score(val_X, val_y)  # score on the validation data; scaling reuses the statistics learned from the training data

In [None]:
# Predictions of the logistic regression pipeline on the validation set
lr_preds = pipe.predict(val_X)

In [None]:
# Accuracy of the logistic regression pipeline on the validation set
accuracy_lr = accuracy_score(val_y, lr_preds)
print('Accuracy do modelo logistic regression: ', accuracy_lr)

In [None]:
# Per-class metrics of the logistic regression pipeline on the validation set
print(classification_report(val_y,lr_preds))

## Comparação dos modelos

In [None]:
# Validation accuracy of the three models, side by side
print('Accuracy do modelo random forest: ', accuracy_rf)
print('Accuracy do modelo random forest com grid search: ', accuracy_tuned)
print('Accuracy do modelo logistic regression: ', accuracy_lr)

A acurácia dos três modelos foi bem parecida. Entretanto, o modelo com otimização de hiperparâmetros mostrou a maior acurácia.