In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Carregar os arquivos CSV
df_treinamento = pd.read_csv('../arquivos_csv/train.csv', low_memory=False)
df_teste = pd.read_csv('../arquivos_csv/test.csv', low_memory=False)

# Verificar valores nulos no arquivo de treino e de teste
valores_nulos_treino = df_treinamento.isnull().sum()
valores_nulos_teste = df_teste.isnull().sum()

# Concatenar valores nulos para visualização lado a lado
valores_nulos_comparacao = pd.concat([valores_nulos_treino, valores_nulos_teste], axis=1)
valores_nulos_comparacao.columns = ['Nulos_Treino', 'Nulos_Teste']

# Exibir os valores nulos lado a lado
print("\nComparação de valores nulos entre treino e teste:")
print(valores_nulos_comparacao)




Comparação de valores nulos entre treino e teste:
                          Nulos_Treino  Nulos_Teste
ID                                   0          0.0
Customer_ID                          0          0.0
Month                                0          0.0
Name                              9985       5015.0
Age                                  0          0.0
SSN                                  0          0.0
Occupation                           0          0.0
Annual_Income                        0          0.0
Monthly_Inhand_Salary            15002       7498.0
Num_Bank_Accounts                    0          0.0
Num_Credit_Card                      0          0.0
Interest_Rate                        0          0.0
Num_of_Loan                          0          0.0
Type_of_Loan                     11408       5704.0
Delay_from_due_date                  0          0.0
Num_of_Delayed_Payment            7002       3498.0
Changed_Credit_Limit                 0          0.0
Num_Credit_In

#### Remoção de Atributos Desnecessários

In [2]:
df_treinamento = df_treinamento.drop(columns=['Name', 'ID', 'Customer_ID', 'SSN', 'Occupation', 'Month'])

print("\nValores nulos no arquivo de treino:")
print(df_treinamento.isnull().sum())


Valores nulos no arquivo de treino:


Age                             0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64


In [3]:
df_teste = df_teste.drop(columns=['Name', 'ID', 'Customer_ID', 'SSN', 'Occupation', 'Month'])

print("\nValores nulos no arquivo de treino:")
print(df_teste.isnull().sum())


Valores nulos no arquivo de treino:
Age                            0
Annual_Income                  0
Monthly_Inhand_Salary       7498
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                5704
Delay_from_due_date            0
Num_of_Delayed_Payment      3498
Changed_Credit_Limit           0
Num_Credit_Inquiries        1035
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age          4470
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     2271
Payment_Behaviour              0
Monthly_Balance              562
dtype: int64


#### Removendo idades negativas e acima de 110 anos & valores do tipo string

In [4]:
df_treinamento['Age'] = pd.to_numeric(df_treinamento['Age'], errors='coerce')

df_treinamento['Age'].fillna(0)

df_treinamento.loc[(df_treinamento['Age'] < 0) | (df_treinamento['Age'] > 110), 'Age'] = pd.NA

valores_unicos_age = sorted(df_treinamento['Age'].unique())

print(valores_unicos_age)

[np.float64(23.0), np.float64(nan), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(95.0), np.float64(99.0), np.float64(100.0), np.float64(102.0), np.float64(109.0)]


In [5]:
df_teste['Age'] = pd.to_numeric(df_teste['Age'], errors='coerce')

df_teste['Age'].fillna(0)

df_teste.loc[(df_teste['Age'] < 0) | (df_teste['Age'] > 110), 'Age'] = pd.NA

valores_unicos_age = sorted(df_teste['Age'].unique())

print(valores_unicos_age)

[np.float64(14.0), np.float64(16.0), np.float64(17.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(nan), np.float64(15.0), np.float64(18.0), np.float64(19.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(95.0)]


In [6]:
total_vazios_age = df_treinamento['Age'].isnull().sum()

print(f"Total de instâncias vazias no atributo 'Age': {total_vazios_age}")

Total de instâncias vazias no atributo 'Age': 7628


In [7]:
total_vazios_age = df_teste['Age'].isnull().sum()
print(f"Total de instâncias vazias no atributo 'Age': {total_vazios_age}")

Total de instâncias vazias no atributo 'Age': 3836


### Salvando os dados em treinamento2.csv

In [8]:
df_treinamento.to_csv('../arquivos_csv/treinamento2.csv')
df_teste.to_csv('../arquivos_csv/teste2.csv')
