Titulo do trabalho e cabeçalho

In [1]:
# Bibliotecas usadas
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, SGDClassifier, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier


In [68]:
# Receber os dados

data = pd.read_csv("data/trabalho1/conjunto_de_treinamento.csv")

In [69]:
# Estudar as colunas não numéricas do dataframe

dados_problematicos = data.select_dtypes(exclude=[int, float, bool])

dados_problematicos.describe()

Unnamed: 0,forma_envio_solicitacao,sexo,estado_onde_nasceu,estado_onde_reside,possui_telefone_residencial,codigo_area_telefone_residencial,possui_telefone_celular,vinculo_formal_com_empresa,estado_onde_trabalha,possui_telefone_trabalho,codigo_area_telefone_trabalho
count,20000,20000,20000,20000,20000,20000.0,20000,20000,20000.0,20000,20000.0
unique,3,4,28,27,2,81.0,1,2,28.0,2,77.0
top,internet,F,BA,SP,Y,,N,N,,N,
freq,11264,12246,2351,3578,16474,3534.0,20000,11174,13573.0,14519,14525.0


In [70]:
def tratar_dados_nao_numericos(df: pd.DataFrame, coluna: str) -> pd.DataFrame:
    """ Transforma os dados em uma estrutura numérica. """

    # Verifico se os dados são binários e evito colinearidade para maior acurácia
    if len(df[coluna].unique()) == 2:
        df_coluna_tratada = pd.get_dummies(df[coluna], prefix=coluna, drop_first=True)
    else:
        df_coluna_tratada = pd.get_dummies(df[coluna], prefix=coluna)

    # axis = 1 seleciona que as operações sejam feitas nas colunas
    return pd.concat([df.drop(coluna, axis=1), df_coluna_tratada], axis=1)    

In [71]:
# Aplicar One-Hot Encoding para todas e usar drop_first em colunas binárias para evitar colinearidade

for coluna in dados_problematicos.columns:
    data = tratar_dados_nao_numericos(data, coluna)

In [72]:
# Verificar se restam dados problemáticos

data.select_dtypes(exclude=[int, float, bool]).empty

True

In [73]:
# Verificar porcentagem de NaN por coluna

nan_data = data.isna().mean() * 100
nan_data = nan_data[nan_data > 0]

# Mais dados problemáticos
nan_data

tipo_residencia                2.680
meses_na_residencia            7.250
profissao                     15.485
ocupacao                      14.890
profissao_companheiro         57.570
grau_instrucao_companheiro    64.300
dtype: float64

In [74]:
# Analisar os dados problematicos no dataframe original
data[nan_data.index].describe()

Unnamed: 0,tipo_residencia,meses_na_residencia,profissao,ocupacao,profissao_companheiro,grau_instrucao_companheiro
count,19464.0,18550.0,16903.0,17022.0,8486.0,7140.0
mean,1.261303,9.572453,8.045081,2.53331,3.708107,0.288095
std,0.88358,10.64958,3.21079,1.532765,5.181241,0.944339
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,9.0,1.0,0.0,0.0
50%,1.0,6.0,9.0,2.0,0.0,0.0
75%,1.0,15.0,9.0,4.0,11.0,0.0
max,5.0,228.0,17.0,5.0,17.0,5.0


In [75]:
data[nan_data.index].nunique()

#Para dados categóricos, preencher com nova categoria representando informação desconhecida
# Ja funciona assim com 0

#Para dados não categóricos, preencher com a média dos valores da coluna

data["meses_na_residencia"].fillna(data["meses_na_residencia"].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["meses_na_residencia"].fillna(data["meses_na_residencia"].mean(), inplace=True)


In [None]:
# Normalizar os dados
scaler = RobustScaler()

data_normalizada = scaler.fit_transform(data)

data = pd.DataFrame(data_normalizada, columns=data.columns, index=data.index)
data.describe()

Unnamed: 0,id_solicitante,produto_solicitado,dia_vencimento,tipo_endereco,idade,estado_civil,qtde_dependentes,grau_instrucao,nacionalidade,tipo_residencia,...,codigo_area_telefone_trabalho_81,codigo_area_telefone_trabalho_84,codigo_area_telefone_trabalho_85,codigo_area_telefone_trabalho_86,codigo_area_telefone_trabalho_9,codigo_area_telefone_trabalho_90,codigo_area_telefone_trabalho_91,codigo_area_telefone_trabalho_94,codigo_area_telefone_trabalho_96,codigo_area_telefone_trabalho_97
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,19464.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2.2737370000000003e-17,0.28435,0.314725,0.00635,0.112024,0.12085,0.6664,0.0,-0.0384,0.261303,...,0.0098,0.00655,0.0018,0.00485,0.0016,0.001,0.00085,0.0001,5e-05,0.0132
std,0.5773936,1.008239,0.674851,0.079435,0.710961,1.332004,1.236725,0.0,0.202305,0.88358,...,0.098511,0.080669,0.042389,0.069475,0.039969,0.031608,0.029143,0.01,0.007071,0.114133
min,-1.0,0.0,-0.9,0.0,-1.571429,-2.0,0.0,0.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.5,0.0,0.0,0.0,-0.428571,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,0.0,1.0,0.0,0.571429,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,6.0,1.5,1.0,3.142857,5.0,53.0,0.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Testar Varios parametros para o SGDClassifier

sgd_options = []
sgd_loss = ["hinge", "log_loss", "squared_hinge", "perceptron", "squared_error"]
sgd_shuffle = [True, False]
sgd_learning_rate = ["constant", "optimal", "invscaling", "adaptive"]

for loss in sgd_loss:
    for penalty in sgd_penalty:
        for intercept in sgd_intercept:
            for shuffle in sgd_shuffle:
                for learning_rate in sgd_learning_rate:
                    sgd_options.append(SGDClassifier(
                                            loss=loss, 
                                            penalty=penalty,
                                            fit_intercept=intercept,
                                            shuffle=shuffle,
                                            learning_rate=learning_rate))

# 160 tunning options


320

In [None]:
# Funções para o KNNClassifier

def inv_ranking(distancias):
    asw = []
    for i in range(distancias):
        asw.append(1/(i+1))
    return asw

def ranking(distancias):
    asw = []
    for i in range(distancias):
        asw.append(len(distancias)-i)
    return asw

In [None]:
# Testar varios parâmetros para o KNNClassifier

knn_options = []

knn_n_neighbours = []
n = 2
while(n < len(data)):
    knn_n_neighbours.append(n)
    n*=2

knn_wheighs = ["uniform", "distance", ranking, inv_ranking] # Talvez adicionar mais depois
metrics = ["minkowski", "cityblock", "cosine", "euclidean", "haversine", "l1", "l2", "manhattan", "nan_euclidean"]

for n_neighbours in knn_n_neighbours:
    for wheigh in knn_wheighs:
        for metric in metrics:
            knn_options.append(KNeighborsClassifier(
                n_neighbors=n_neighbours,
                weights=wheigh,
                metric=metric
            ))

# 504 Tunning options

504

In [None]:
# Testar LinearRegression e Logistic Regression

LinearRegression()
LogisticRegressionCV()