In [1]:
# Primeiro passo è sempre importar ou instalar as bibliotecas necessárias

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Utilizaremos a biblioteca do 'pandas' para abrir os arquivos a serem analisados
# Dessa forma teremos uma primeira impressão do tipo de tratamento que deveremos fazer nas planilhas
# Cumprimento de linhas e colunas, variáveis dependentes e independentes, outliers do modelo, informações NOSQL, etc

validation_data = pd.read_csv('validation.csv')
validation_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Private,174043,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,50,United-States,<=50K
1,50,State-gov,159219,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,Canada,>50K
2,18,?,192399,Some-college,10,Never-married,?,Own-child,White,Male,0,0,60,United-States,<=50K
3,33,Federal-gov,193246,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,42,United-States,>50K
4,23,Private,61777,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,2580,0,40,United-States,<=50K


In [3]:
validation_data.shape

(7326, 15)

In [4]:
# Primeiramente iremos separar as variáveis independentes da variável dependente
# X representa toda a tabela exeto pela última coluna ('income'), enquanto y representa a mesma

X = validation_data.iloc[:,:-1]
y = validation_data.iloc[:,-1]

In [5]:
# Iremos verificar se dentro da tabela há alguma célula não preenchida com a seguinte fórmula

X.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
dtype: int64

In [6]:
# O mesmo se aplica para a variável y

y.isnull().sum()

np.int64(0)

In [7]:
# Com isso verificamos que a tabela está sequencial, ou seja, não possui atributos nulos
# Mas, ainda assim, pode ser que haja atributos 'sem valor' (?) nos hiperparametros de 'workclass', 'occupation' e 'native-country'

# Substituiremos '?', além de 'Never-worked' e 'Without-pay' devido a baixa espressividade, por 'Outros'

X.loc[X['workclass'] == '?', 'workclass'] = 'Outros'
X.loc[X['workclass'] == 'Never-worked', 'workclass'] = 'Outros'
X.loc[X['workclass'] == 'Without-pay', 'workclass'] = 'Outros'
agrupado_workclass = X.groupby(['workclass']).size()
agrupado_workclass

workclass
Federal-gov          211
Local-gov            469
Outros               407
Private             5084
Self-emp-inc         250
Self-emp-not-inc     591
State-gov            314
dtype: int64

In [8]:
# Substituiremos '?' por 'Outros'

X.loc[X['occupation'] == '?', 'occupation'] = 'Outros'
agrupado_occupation = X.groupby(['occupation']).size()
agrupado_occupation

occupation
Adm-clerical         880
Armed-Forces           2
Craft-repair         901
Exec-managerial      920
Farming-fishing      226
Handlers-cleaners    341
Machine-op-inspct    472
Other-service        709
Outros               405
Priv-house-serv       30
Prof-specialty       926
Protective-serv      148
Sales                771
Tech-support         252
Transport-moving     343
dtype: int64

In [9]:
# Substituiremos '?' por 'Outros'

X.loc[X['native-country'] == '?', 'native-country'] = 'Outros'

#for i in X['native-country']:
#  if X.groupby(['native-country']).size()[i] < 500:
#    X.loc[X['native-country'] == i, 'native-country'] = 'Outros'

agrupado_country = X.groupby(['native-country']).size()
agrupado_country

native-country
Cambodia                         2
Canada                          21
China                           13
Columbia                        11
Cuba                            30
Dominican-Republic              21
Ecuador                         10
El-Salvador                     27
England                         28
France                           5
Germany                         37
Greece                           6
Guatemala                       15
Haiti                            9
Honduras                         5
Hong                             6
Hungary                          3
India                           22
Iran                             4
Ireland                          7
Italy                           13
Jamaica                         14
Japan                           15
Laos                             4
Mexico                         139
Nicaragua                        4
Outlying-US(Guam-USVI-etc)       5
Outros                         138
Peru 

In [10]:
# Iremos também retirar do modelo a coluna 'educational-num', ja que representam o mesmo valor, em forma numérica, da coluna 'education'

X.drop('educational-num', axis=1, inplace=True)
X

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,22,Private,174043,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,0,0,50,United-States
1,50,State-gov,159219,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,Canada
2,18,Outros,192399,Some-college,Never-married,Outros,Own-child,White,Male,0,0,60,United-States
3,33,Federal-gov,193246,Bachelors,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,42,United-States
4,23,Private,61777,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,2580,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,47,Federal-gov,211123,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
7322,51,Local-gov,123011,Masters,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1977,35,United-States
7323,21,Private,195919,10th,Never-married,Handlers-cleaners,Not-in-family,Other,Male,0,0,40,Dominican-Republic
7324,56,Federal-gov,255386,Some-college,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,Male,0,0,40,Laos


In [11]:
# Agora iremos transformar todos os atributos categóricos em atributos numéricos utilizando o Label Encoder

labelencoder = LabelEncoder()

for column in X.select_dtypes(include=['object']).columns:
  X[column] = labelencoder.fit_transform(X[column])

X

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,22,3,174043,11,4,2,1,4,1,0,0,50,38
1,50,6,159219,9,2,3,0,4,1,0,0,40,1
2,18,2,192399,15,4,8,3,4,1,0,0,60,38
3,33,0,193246,9,2,0,0,4,1,0,0,42,38
4,23,3,61777,9,2,12,0,4,1,2580,0,40,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,47,0,211123,11,2,2,0,4,1,0,0,40,38
7322,51,1,123011,12,2,10,0,4,1,0,1977,35,38
7323,21,3,195919,0,4,5,1,3,1,0,0,40,5
7324,56,0,255386,15,2,0,0,1,1,0,0,40,23


In [12]:
# Iremos 'normalizar' os dados da coluna "fnlwgt", pois os mesmo, visualmente, destoam dos demais dados do DataFrame, e assim utilizaremos o StandScaler

sc = StandardScaler()
sc_tr = sc.fit_transform(X.iloc[:,2:3])
sc_tr

array([[-0.15050503],
       [-0.28918113],
       [ 0.02121235],
       ...,
       [ 0.05414138],
       [ 0.61044546],
       [-1.00712744]])

In [13]:
# Substituiremos o "fnlwgt" pelos nossos dados ajustados "fnlwgt_ajustado" e excluiremos primeiro do DataFrame

X_ajustado = pd.concat([X, pd.DataFrame(sc_tr, columns=['fnlwgt_ajustado'])], axis=1)
X_ajustado.drop(columns=["fnlwgt"])
X_ajustado.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,fnlwgt_ajustado
0,22,3,174043,11,4,2,1,4,1,0,0,50,38,-0.150505
1,50,6,159219,9,2,3,0,4,1,0,0,40,1,-0.289181
2,18,2,192399,15,4,8,3,4,1,0,0,60,38,0.021212
3,33,0,193246,9,2,0,0,4,1,0,0,42,38,0.029136
4,23,3,61777,9,2,12,0,4,1,2580,0,40,38,-1.200735


In [14]:
# Tambem iremos transformar a nossa variavel y com Label Encoder

y_encoded = labelencoder.fit_transform(y)
y_encoded

array([0, 1, 0, ..., 0, 0, 0])

In [15]:
# Agora que ja temos todo nosso DataFrame 'estruturado' da melhor forma, iremos treinar e testar os parâmetros

X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(X_ajustado,y_encoded,test_size=0.3, random_state=0)

In [20]:
# Após otimizarmos na medida do possível o tratamento de nosso DataFrame, tambem verificamos, segundo o TPOTClassifier, e de acordo com os parametros estabelecidos por nós,
# verificamos que o possível melhor modelo a ser utilizado é o "DecisionTreeClassifier", o qual utilizaremos a seguir

%pip install graphviz

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Criaremos o modelo de Decision Tree seguindo os parâmetros estabelecidos pelo TPOT

modelo = DecisionTreeClassifier(max_depth=8, min_samples_leaf=7, min_samples_split=14)
modelo.fit(X_treinamento,y_treinamento)

In [21]:
# Podemos tambem "imprimir" uma imagem de como seria a nossa árvore de decisão segundo o modelo

dot_data = export_graphviz(modelo, out_file=None, filled=True, feature_names=validation_data.columns[:-1], class_names=True, rounded=True)

graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png")

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [19]:
# Finalmente, iremos testar nosso modelo com dados específicos de teste e verificaremos os scores de assertividade

previsoes = modelo.predict(X_teste)
accuracy = accuracy_score(y_teste, previsoes)
f1 = f1_score(y_teste, previsoes, average='weighted')
print(f'Acuracia: {accuracy}, F1: {f1}')

Acuracia: 0.8589626933575978, F1: 0.8513378861805607
