In [None]:
# Primeiro passo é sempre importar ou instalar as bibliotecas necessárias

%pip install tpot

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from tpot import TPOTClassifier



In [None]:
# Load the training spreadsheet with pandas and preview its first rows.
# This gives a first impression of the preprocessing the table will need:
# number of rows and columns, dependent vs. independent variables, outliers,
# placeholder values, and so on.

train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,188767,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
1,64,Private,286732,HS-grad,9,Widowed,Sales,Not-in-family,White,Female,0,0,17,United-States,<=50K
2,29,Private,253801,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Ecuador,<=50K
3,28,Private,334032,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,<=50K
4,22,Private,173004,HS-grad,9,Never-married,Machine-op-inspct,Other-relative,Black,Male,0,0,1,United-States,<=50K


In [None]:
# (rows, columns) of the raw training table.
train_data.shape

(34189, 15)

In [None]:
# Separate the independent variables from the dependent variable.
# X holds every column except the last one ('income'); y holds that last column.
#
# X is taken as an explicit copy because later cells write into it (.loc
# replacements, column re-assignment, dropping a column). Writing into a bare
# slice of train_data triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether train_data itself is being modified.

X = train_data.iloc[:, :-1].copy()
y = train_data.iloc[:, -1]

In [None]:
# Count the missing (NaN) cells in each feature column.

X.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [None]:
# Same missing-value count for the target variable y.

y.isna().sum()

0

In [None]:
# The checks above found no NaN cells, but the columns 'workclass',
# 'occupation' and 'native-country' may still use the placeholder '?' to mean
# "no value".
#
# For 'workclass', '?' is folded into a single 'Outros' category together with
# 'Never-worked' and 'Without-pay', which have very few rows.

categorias_raras = ['?', 'Never-worked', 'Without-pay']
X.loc[X['workclass'].isin(categorias_raras), 'workclass'] = 'Outros'

# Row count per remaining category, to confirm the regrouping.
agrupado_workclass = X.groupby('workclass').size()
agrupado_workclass

Unnamed: 0_level_0,0
workclass,Unnamed: 1_level_1
Federal-gov,1009
Local-gov,2194
Outros,1994
Private,23719
Self-emp-inc,1197
Self-emp-not-inc,2687
State-gov,1389


In [None]:
# Replace the '?' placeholder in 'occupation' with the 'Outros' category.

X.loc[X['occupation'].eq('?'), 'occupation'] = 'Outros'

# Row count per occupation after the replacement.
agrupado_occupation = X.groupby('occupation').size()
agrupado_occupation

Unnamed: 0_level_0,0
occupation,Unnamed: 1_level_1
Adm-clerical,3918
Armed-Forces,11
Craft-repair,4282
Exec-managerial,4256
Farming-fishing,1030
Handlers-cleaners,1418
Machine-op-inspct,2137
Other-service,3459
Outros,1977
Priv-house-serv,180


In [None]:
# Replace the '?' placeholder in 'native-country' with the 'Outros' category.
#
# A disabled draft step (not executed) would additionally have folded every
# country with fewer than 500 rows into 'Outros'; only the '?' replacement is
# applied here.

X.loc[X['native-country'].eq('?'), 'native-country'] = 'Outros'

# Row count per country after the replacement.
agrupado_country = X.groupby('native-country').size()
agrupado_country

Unnamed: 0_level_0,0
native-country,Unnamed: 1_level_1
Cambodia,25
Canada,131
China,96
Columbia,55
Cuba,86
Dominican-Republic,69
Ecuador,31
El-Salvador,110
England,75
France,32


In [None]:
# Remove 'educational-num' from the features: it carries the same information
# as 'education', only in numeric form.
#
# The drop is written as a re-assignment with errors='ignore' (instead of
# inplace=True) so the cell can be re-run without raising KeyError once the
# column is already gone.

X = X.drop(columns=['educational-num'], errors='ignore')

# Preview only the first rows instead of dumping the whole frame.
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,188767,HS-grad,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
1,64,Private,286732,HS-grad,Widowed,Sales,Not-in-family,White,Female,0,0,17,United-States
2,29,Private,253801,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Ecuador
3,28,Private,334032,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
4,22,Private,173004,HS-grad,Never-married,Machine-op-inspct,Other-relative,Black,Male,0,0,1,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34184,36,Private,635913,HS-grad,Married-spouse-absent,Other-service,Not-in-family,Black,Male,0,0,40,United-States
34185,34,Private,107624,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
34186,28,Private,250135,Some-college,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States
34187,46,State-gov,96652,Assoc-voc,Separated,Adm-clerical,Unmarried,Black,Female,0,0,40,United-States


In [None]:
# Convert every categorical (object-dtype) feature into integer codes with
# LabelEncoder.
#
# One encoder is fitted per column and kept in `encoders`, keyed by column
# name. Re-fitting a single shared encoder would discard every mapping but the
# last one, making it impossible to apply the same codes to new data or to
# invert them back to the original labels.

# Kept as a standalone instance because the target variable is encoded with it
# later in the notebook.
labelencoder = LabelEncoder()

encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    encoders[column] = LabelEncoder()
    X[column] = encoders[column].fit_transform(X[column])

# Preview only the first rows instead of dumping the whole frame.
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,3,188767,11,4,2,3,4,1,0,0,40,39
1,64,3,286732,11,6,12,1,4,0,0,0,17,39
2,29,3,253801,11,2,2,0,4,1,0,0,40,6
3,28,3,334032,8,2,2,0,4,1,0,0,50,39
4,22,3,173004,11,4,6,2,2,1,0,0,1,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34184,36,3,635913,11,3,7,1,2,1,0,0,40,39
34185,34,3,107624,15,2,2,0,4,1,0,0,50,39
34186,28,3,250135,15,0,3,1,4,0,0,0,40,39
34187,46,6,96652,8,5,0,4,2,0,0,0,40,39


In [None]:
# Standardize the 'fnlwgt' column with StandardScaler, because its values are
# on a visibly different scale from the rest of the DataFrame.
#
# The column is selected by name (as a one-column DataFrame, which is the 2-D
# input the scaler expects) rather than by position, so the right column is
# scaled even if the column order changes.
#
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# that happens later in the notebook.

sc = StandardScaler()
sc_tr = sc.fit_transform(X[['fnlwgt']])
sc_tr

array([[-0.00752487],
       [ 0.92050237],
       [ 0.60854539],
       ...,
       [ 0.5738172 ],
       [-0.88013477],
       [-0.12734947]])

In [None]:
# Append the standardized values as a new column 'fnlwgt_ajustado' next to the
# existing columns of X, then preview the result.
#
# NOTE(review): DataFrame.drop returns a new frame and its result is not
# assigned here, so the raw 'fnlwgt' column is still present in X_ajustado
# (the frame holds both 'fnlwgt' and 'fnlwgt_ajustado'). Assigning the result
# would change the number of feature columns, so any code that depends on the
# column count or order has to be updated at the same time.

X_ajustado = pd.concat([X, pd.DataFrame(sc_tr, columns=['fnlwgt_ajustado'])], axis=1)
X_ajustado.drop(columns=["fnlwgt"])
X_ajustado.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,fnlwgt_ajustado
0,25,3,188767,11,4,2,3,4,1,0,0,40,39,-0.007525
1,64,3,286732,11,6,12,1,4,0,0,0,17,39,0.920502
2,29,3,253801,11,2,2,0,4,1,0,0,40,6,0.608545
3,28,3,334032,8,2,2,0,4,1,0,0,50,39,1.368578
4,22,3,173004,11,4,6,2,2,1,0,0,1,39,-0.156849


In [None]:
# Encode the target variable y as integers too, by refitting the existing
# LabelEncoder instance on y and transforming y with it.

y_encoded = labelencoder.fit(y).transform(y)
y_encoded

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# With the DataFrame prepared, split it into training and test sets:
# 30% of the rows are held out for testing, with a fixed seed so the split is
# reproducible.

X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    X_ajustado,
    y_encoded,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Use TPOTClassifier to search for the machine-learning pipeline that
# maximizes the F1 score on the prepared training data.

# Search settings, gathered in one place so they are easy to read and tune.
# NOTE(review): early_stop is passed as True; TPOT documents this parameter as
# an integer number of generations - confirm the intended value.
tpot_config = dict(
    generations=10,
    population_size=100,
    offspring_size=100,
    mutation_rate=0.9,
    crossover_rate=0.1,
    scoring='f1',
    max_time_mins=5,
    random_state=0,
    early_stop=True,
    verbosity=2,
    config_dict='TPOT light',
)

tpot = TPOTClassifier(**tpot_config)
tpot.fit(X_treinamento, y_treinamento)
print("Melhor Modelo: ", tpot.fitted_pipeline_)

is_classifier
is_regressor
is_classifier
is_regressor




is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_regressor


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6363099037194011

5.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: DecisionTreeClassifier(ZeroCount(input_matrix), criterion=gini, max_depth=8, min_samples_leaf=7, min_samples_split=14)
Melhor Modelo:  Pipeline(steps=[('zerocount', ZeroCount()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=8, min_samples_leaf=7,
                                        min_samples_split=14,
                                        random_state=0))])


In [None]:
# Evaluate the pipeline found by TPOT on the held-out test set, reporting
# accuracy and (mainly) the weighted F1 score.
#
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but the weighted F1 weights each class by its support in y_true,
# so passing the predictions first yields a different, incorrect score.

previsao = tpot.predict(X_teste)
acuracia = accuracy_score(y_teste, previsao)
f1 = f1_score(y_teste, previsao, average='weighted')
print(f"Acuracia: {acuracia}, F1: {f1}")

Acuracia: 0.8531734425270547, F1: 0.8633980983091533


In [None]:
# Após otimizarmos, na medida do possível, o tratamento de nosso DataFrame, verificamos, segundo o TPOTClassifier e de acordo com os parâmetros estabelecidos por nós,
# que o possível melhor modelo a ser utilizado é o "DecisionTreeClassifier", o qual utilizaremos a seguir

%pip install graphviz

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz



In [None]:
# Build the Decision Tree with the hyperparameters selected by TPOT and fit it
# on the training set.
#
# random_state=0 is set to match the pipeline TPOT reported and to make the
# fitted tree reproducible between runs.

modelo = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=7,
    min_samples_split=14,
    random_state=0,
)
modelo.fit(X_treinamento, y_treinamento)

In [None]:
# Render the fitted decision tree to an image file ('decision_tree.png').
#
# The feature names are taken from the frame the model was actually trained
# on. The raw train_data columns are not the same set (a column was dropped
# and a scaled column was appended during preprocessing), so using them would
# label the tree nodes with the wrong feature names.

dot_data = export_graphviz(
    modelo,
    out_file=None,
    filled=True,
    feature_names=list(X_treinamento.columns),
    class_names=True,
    rounded=True,
)

graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png")

'decision_tree.png'

In [None]:
# Finally, score the hand-built decision tree on the held-out test set and
# report its accuracy and weighted F1.

previsoes = modelo.predict(X_teste)
accuracy = accuracy_score(y_true=y_teste, y_pred=previsoes)
f1 = f1_score(y_true=y_teste, y_pred=previsoes, average='weighted')
print(f'Acuracia: {accuracy}, F1: {f1}')

Acuracia: 0.8531734425270547, F1: 0.842948786744956
