Importando as bibliotecas necessárias

In [1]:

import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import io
import os


In [2]:
# Caminho para o dataset
dataset_path = '../data/ACC_INTAKES_OUTCOMES.parquet'

# Carregando os dados
data = pd.read_parquet(dataset_path)

# Exibindo as primeiras linhas do dataset para verificar se está tudo certo
data.head()



Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
0,10 years,A047759,02/04/2004 00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",07/04/2014 15:12,...,"(7.5, 10.0]",02/04/2014 15:55,4,2014,2014-04,Wednesday,15,1,4 days 23:17:00.000000000,5.0
1,14 years,A197810,21/01/2000 00:00,Partner,Transfer,Spayed Female,5110,14.0,"(12.5, 15.0]",22/12/2014 15:23,...,"(12.5, 15.0]",08/12/2014 12:30,12,2014,2014-12,Monday,12,1,14 days 02:53:00.000000000,14.1
2,16 years,A200922,03/10/1997 00:00,Foster,Adoption,Neutered Male,5840,16.0,"(15.0, 17.5]",22/11/2013 09:44,...,"(15.0, 17.5]",03/10/2013 15:47,10,2013,2013-10,Thursday,15,1,49 days 17:57:00.000000000,49.7
3,14 years,A208755,27/04/2000 00:00,Suffering,Euthanasia,Neutered Male,5110,14.0,"(12.5, 15.0]",17/02/2015 16:25,...,"(12.5, 15.0]",17/02/2015 13:17,2,2015,2015-02,Tuesday,13,1,0 days 03:08:00.000000000,0.1
4,17 years,A210457,01/06/1999 00:00,Foster,Adoption,Neutered Male,6205,17.0,"(15.0, 17.5]",07/10/2016 12:34,...,"(15.0, 17.5]",28/09/2016 12:05,9,2016,2016-09,Wednesday,12,1,9 days 00:29:00.000000000,9.0


In [3]:
# Verificar informações básicas do dataset
data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36346 entries, 0 to 36345
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age_upon_outcome            36346 non-null  object 
 1   animal_id_outcome           36346 non-null  object 
 2   date_of_birth               36346 non-null  object 
 3   outcome_subtype             36346 non-null  object 
 4   outcome_type                36346 non-null  object 
 5   sex_upon_outcome            36346 non-null  object 
 6   age_upon_outcome_(days)     36346 non-null  int64  
 7   age_upon_outcome_(years)    36346 non-null  float64
 8   age_upon_outcome_age_group  36346 non-null  object 
 9   outcome_datetime            36346 non-null  object 
 10  outcome_month               36346 non-null  int64  
 11  outcome_year                36346 non-null  int64  
 12  outcome_monthyear           36346 non-null  object 
 13  outcome_weekday             363

In [4]:
# Exibir estatísticas descritivas do dataset
data.describe(include='all')


Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
count,36346,36346,36346,36346,36346,36346,36346.0,36346.0,36346,36346,...,36346,36346,36346.0,36346.0,36346,36346,36346.0,36346.0,36346,36346.0
unique,46,35986,4838,19,6,5,,,10,26037,...,10,25148,,,54,7,,,17156,
top,1 year,A695798,01/05/2016 00:00,Partner,Transfer,Neutered Male,,,"(-0.025, 2.5",18/04/2016 00:00,...,"(-0.025, 2.5",09/07/2014 12:58,,,2015-06,Monday,,,0 days 00:06:00.000000000,
freq,6440,4,74,19840,23799,8349,,,29355,39,...,29437,62,,,1246,5718,,,72,
mean,,,,,,,636.38106,1.745694,,,...,,,6.609696,2015.33481,,,13.43771,1.027568,,19.398798
std,,,,,,,997.432904,2.731384,,,...,,,3.214465,1.279775,,,3.108004,0.189007,,47.659005
min,,,,,,,0.0,0.0,,,...,,,1.0,2013.0,,,0.0,1.0,,0.0
25%,,,,,,,60.0,0.2,,,...,,,4.0,2014.0,,,11.0,1.0,,0.7
50%,,,,,,,330.0,0.9,,,...,,,6.0,2015.0,,,13.0,1.0,,4.1
75%,,,,,,,730.0,2.0,,,...,,,9.0,2016.0,,,16.0,1.0,,16.2


In [6]:
data_cleaned = data.copy()

# Converter colunas de datas e horas para o tipo datetime com formato específico
date_columns = ['date_of_birth', 'outcome_datetime', 'intake_datetime']

for col in date_columns:
    data_cleaned[col] = pd.to_datetime(data_cleaned[col], format='%d/%m/%Y %H:%M', errors='coerce')

# Converter colunas 'monthyear' para o formato mm/yyyy
monthyear_columns = ['dob_monthyear', 'outcome_monthyear', 'intake_monthyear']
for col in monthyear_columns:
    data_cleaned[col] = pd.to_datetime(data_cleaned[col], format='%Y-%m', errors='coerce').dt.strftime('%m/%Y')

# Tratar colunas de mês como números
month_columns = ['outcome_month', 'dob_month', 'intake_month']
for col in month_columns:
    data_cleaned[col] = pd.to_numeric(data_cleaned[col], errors='coerce')

# Converter colunas numéricas para tipos numéricos
numeric_columns = [
    'age_upon_outcome_(days)', 'age_upon_outcome_(years)', 'outcome_year', 'outcome_hour',
    'outcome_number', 'dob_year', 'age_upon_intake_(days)', 'age_upon_intake_(years)',
    'intake_year', 'intake_hour', 'intake_number', 'time_in_shelter_days'
]
for col in numeric_columns:
    data_cleaned[col] = pd.to_numeric(data_cleaned[col], errors='coerce')

# Exibir a nova estrutura do DataFrame após as conversões
data_cleaned.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36346 entries, 0 to 36345
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   age_upon_outcome            36346 non-null  object        
 1   animal_id_outcome           36346 non-null  object        
 2   date_of_birth               36346 non-null  datetime64[ns]
 3   outcome_subtype             36346 non-null  object        
 4   outcome_type                36346 non-null  object        
 5   sex_upon_outcome            36346 non-null  object        
 6   age_upon_outcome_(days)     36346 non-null  int64         
 7   age_upon_outcome_(years)    36346 non-null  float64       
 8   age_upon_outcome_age_group  36346 non-null  object        
 9   outcome_datetime            36346 non-null  datetime64[ns]
 10  outcome_month               36346 non-null  int64         
 11  outcome_year                36346 non-null  int64     

In [7]:
# Verificar valores ausentes
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
missing_values


Series([], dtype: int64)

In [8]:
# Verificar duplicatas
duplicate_count = data.duplicated().sum()
duplicate_count



15

In [9]:
# Remover duplicatas
data_cleaned = data.drop_duplicates()

# Exibir a contagem de registros após a remoção de duplicatas
data_cleaned.shape


(36331, 41)

In [10]:
# Função para identificar outliers
def identify_outliers(data):
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    outlier_columns = []
    for col in numeric_columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        if data[(data[col] < Q1 - outlier_step) | (data[col] > Q3 + outlier_step)].shape[0] > 0:
            outlier_columns.append(col)
    return outlier_columns

# Identificar as colunas com outliers
outlier_columns = identify_outliers(data_cleaned)

# Exibir as colunas que contêm outliers
if outlier_columns:
    print(f"Outliers identificados nas seguintes colunas: {', '.join(outlier_columns)}")
else:
    print("Não foram identificados outliers nas colunas numéricas.")


Outliers identificados nas seguintes colunas: age_upon_outcome_(days), age_upon_outcome_(years), outcome_hour, outcome_number, dob_year, age_upon_intake_(days), age_upon_intake_(years), intake_hour, intake_number, time_in_shelter_days


Como colunas relacionadas à idade e tempo no abrigo são necessárias para a analise de dados deste dataset, serão removidos apenas os outliers relacionados à horário de saída e entrada no abrigo pois são irrelevantes para nosso estudo.

In [14]:
# Função para remover outliers apenas nas colunas especificadas
def remove_outliers(data, columns):
    for col in columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        data = data[~((data[col] < (Q1 - outlier_step)) | (data[col] > (Q3 + outlier_step)))]
    return data

# Colunas das quais removeremos os outliers
columns_to_remove_outliers = ['outcome_hour', 'intake_hour']

# Remover outliers dessas colunas
data_cleaned = remove_outliers(data, columns_to_remove_outliers)


# Exibir o número de registros restantes após a remoção de outliers
print(f"Número de registros restantes após remoção de outliers: {len(data_cleaned)}")


Número de registros restantes após remoção de outliers: 34358


Escalonamento de dados

In [15]:
# Identificar todas as colunas numéricas no DataFrame
numeric_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Exibir as colunas numéricas para revisão
print("Colunas numéricas identificadas:")
print(numeric_columns)

Colunas numéricas identificadas:
Index(['age_upon_outcome_(days)', 'age_upon_outcome_(years)', 'outcome_month',
       'outcome_year', 'outcome_hour', 'outcome_number', 'dob_year',
       'dob_month', 'count', 'age_upon_intake_(days)',
       'age_upon_intake_(years)', 'intake_month', 'intake_year', 'intake_hour',
       'intake_number', 'time_in_shelter_days'],
      dtype='object')


Algumas colunas não serão escalonadas por serem irrelevantes para nosso estudo ou por já possuir outras colunas que cumprirão o proposito.

Neste caso, apenas as seguintes colunas serão escalonadas:

'dob_year' = Ano de nascimento do animal
'age_upon_intake_(days)' = Idade no momento de entrada em dias
'age_upon_intake_(years)' = Idade no momento de entrada em anos 
'intake_month', =  Mês de entrada
'intake_year', = Mês de saída 
'intake_number', = Número de vezes que já entrou no abrigo
'age_upon_outcome_(days)' = Idade no momento de saída em dias
'age_upon_outcome_(years)' = Idade no momento de saída em anos
'outcome_month' = Mês de saída
'outcome_year' = Ano de saída
'outcome_number' = Número de vezes que já saiu do abrigo
'time_in_shelter_days' = Quantidade de dias que passou no abrigo


Padronização dos dados (Standard Scaling):

Transforma os dados para que tenham média zero e desvio padrão igual a um. - É útil para algoritmos que assumem que os dados são distribuídos em uma forma normal (gaussiana).

In [16]:
from sklearn.preprocessing import StandardScaler

# Definir as colunas que precisam de escalonamento padronizado
columns_to_scale = [
'dob_year', 
'age_upon_intake_(days)',
'age_upon_intake_(years)',
'intake_month', 
'intake_year',  
'intake_number', 
'age_upon_outcome_(days)',
'age_upon_outcome_(years)',
'outcome_month',
'outcome_year',
'outcome_number', 
'time_in_shelter_days', 
]

# Inicializar o StandardScaler
scaler = StandardScaler()

# Aplicar o escalonamento padronizado e adicionar as colunas escalonadas ao DataFrame
data_scaled = data_cleaned.copy()  # Criar uma cópia para manter os dados originais
for col in columns_to_scale:
    data_scaled[f'{col}_scaled'] = scaler.fit_transform(data_cleaned[[col]])

# Verificar o resultado
data_scaled.head()


Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_(years)_scaled,intake_month_scaled,intake_year_scaled,intake_number_scaled,age_upon_outcome_(days)_scaled,age_upon_outcome_(years)_scaled,outcome_month_scaled,outcome_year_scaled,outcome_number_scaled,time_in_shelter_days_scaled
0,10 years,A047759,02/04/2004 00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",07/04/2014 15:12,...,3.03538,-0.813021,-1.009191,-0.145378,3.004984,3.005601,-0.829338,-1.036047,-0.145378,-0.303841
1,14 years,A197810,21/01/2000 00:00,Partner,Transfer,Spayed Female,5110,14.0,"(12.5, 15.0]",22/12/2014 15:23,...,4.499287,1.677259,-1.009191,-0.145378,4.46063,4.461925,1.614592,-1.036047,-0.145378,-0.113195
2,16 years,A200922,03/10/1997 00:00,Foster,Adoption,Neutered Male,5840,16.0,"(15.0, 17.5]",22/11/2013 09:44,...,5.231241,1.054689,-1.788185,-0.145378,5.188454,5.190087,1.309101,-1.809951,-0.145378,0.632628
3,14 years,A208755,27/04/2000 00:00,Suffering,Euthanasia,Neutered Male,5110,14.0,"(12.5, 15.0]",17/02/2015 16:25,...,4.499287,-1.435591,-0.230197,-0.145378,4.46063,4.461925,-1.44032,-0.262143,-0.145378,-0.406496
4,17 years,A210457,01/06/1999 00:00,Foster,Adoption,Neutered Male,6205,17.0,"(15.0, 17.5]",07/10/2016 12:34,...,5.597218,0.743404,0.548796,-0.145378,5.552365,5.554168,1.00361,0.511761,-0.145378,-0.220041


Agora, realizaremos a Normalização dos dados.

Normalização (Min-Max Scaling):
Transforma os dados para que todos os valores estejam dentro do intervalo [0, 1].
É útil para algoritmos que não fazem suposições sobre a distribuição dos dados, como redes neurais e métodos de distância (por exemplo, k-NN).

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Definir as colunas que precisam de normalização
columns_to_normalize = [
'dob_year', 
'age_upon_intake_(days)',
'age_upon_intake_(years)',
'intake_month', 
'intake_year',  
'intake_number', 
'age_upon_outcome_(days)',
'age_upon_outcome_(years)',
'outcome_month',
'outcome_year',
'outcome_number', 
'time_in_shelter_days', 
]

# Inicializar o MinMaxScaler
scaler = MinMaxScaler()

# Aplicar a normalização e adicionar as colunas normalizadas ao DataFrame
data_normalized = data_cleaned.copy()  # Criar uma cópia para manter os dados originais
for col in columns_to_normalize:
    data_normalized[f'{col}_normalized'] = scaler.fit_transform(data_cleaned[[col]])

# Verificar o resultado
data_normalized.head()


Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_(years)_normalized,intake_month_normalized,intake_year_normalized,intake_number_normalized,age_upon_outcome_(days)_normalized,age_upon_outcome_(years)_normalized,outcome_month_normalized,outcome_year_normalized,outcome_number_normalized,time_in_shelter_days_normalized
0,10 years,A047759,02/04/2004 00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",07/04/2014 15:12,...,0.454545,0.272727,0.2,0.0,0.454545,0.454545,0.272727,0.2,0.0,0.003984
1,14 years,A197810,21/01/2000 00:00,Partner,Transfer,Spayed Female,5110,14.0,"(12.5, 15.0]",22/12/2014 15:23,...,0.636364,1.0,0.2,0.0,0.636364,0.636364,1.0,0.2,0.0,0.011235
2,16 years,A200922,03/10/1997 00:00,Foster,Adoption,Neutered Male,5840,16.0,"(15.0, 17.5]",22/11/2013 09:44,...,0.727273,0.818182,0.0,0.0,0.727273,0.727273,0.909091,0.0,0.0,0.039602
3,14 years,A208755,27/04/2000 00:00,Suffering,Euthanasia,Neutered Male,5110,14.0,"(12.5, 15.0]",17/02/2015 16:25,...,0.636364,0.090909,0.4,0.0,0.636364,0.636364,0.090909,0.4,0.0,8e-05
4,17 years,A210457,01/06/1999 00:00,Foster,Adoption,Neutered Male,6205,17.0,"(15.0, 17.5]",07/10/2016 12:34,...,0.772727,0.727273,0.6,0.0,0.772727,0.772727,0.818182,0.6,0.0,0.007171
