In [52]:
import mysql.connector

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [53]:
# Load the CSV dataset into `df_bank`, which the cells below read from.
try:
    # Path to the CSV file.
    # NOTE(review): absolute, machine-specific path; the notebook only runs
    # where this exact directory layout exists. Consider a configurable
    # data directory.
    ruta_csv = r"/Users/barbarajunqueira/ProjecteData/Equip_16/Data/banca_06.10_dataset.csv"

    # Read the CSV into a DataFrame (comma-separated, UTF-8 encoded).
    df_bank = pd.read_csv(ruta_csv, sep=',', encoding='utf-8')

    print("✅ CSV cargado correctamente.")
    print(df_bank.head())

except Exception as e:
    print(f"❌ Error al trabajar con el CSV: {e}")
    # Re-raise so execution stops here: swallowing the error would leave
    # `df_bank` undefined and the following cells would fail with a
    # NameError that hides the real cause.
    raise


✅ CSV cargado correctamente.
   id   age         job  marital  education default  balance housing loan  \
0   1  59.0      admin.  married  secondary      no     2343     yes   no   
1   2  56.0      admin.  married  secondary      no       45      no   no   
2   3  41.0  technician  married  secondary      no     1270     yes   no   
3   4  55.0    services  married  secondary      no     2476     yes   no   
4   5  54.0      admin.  married   tertiary      no      184      no   no   

   contact  ...  campaign pdays  previous  poutcome  deposit  Clase de edad  \
0  unknown  ...         1    -1         0   unknown      yes          55-64   
1  unknown  ...         1    -1         0   unknown      yes          55-64   
2  unknown  ...         1    -1         0   unknown      yes          35-44   
3  unknown  ...         1    -1         0   unknown      yes          55-64   
4  unknown  ...         2    -1         0   unknown      yes          45-54   

            Perfil month #       

In [54]:
import numpy as np

# Cut points for the balance tiers: the median and the 75th percentile of
# 'balance' (used as thresholds by balance_group).
# The nan-aware reducers skip missing balances. With np.median/np.percentile
# a single NaN would turn both thresholds into NaN, and every non-negative
# balance would then fall through to 'Alto' in balance_group.
mediana = np.nanmedian(df_bank['balance'])
percentil_75 = np.nanpercentile(df_bank['balance'], 75)

print("📌 Mediana:", mediana)
print("📌 Percentil 75:", percentil_75)


📌 Mediana: 551.5
📌 Percentil 75: 1716.0


In [55]:
def balance_group(balance, median_cut=None, p75_cut=None):
    """Classify a single account balance into a named tier.

    Args:
        balance: Numeric balance of one row.
        median_cut: Inclusive upper bound of the 'Bajo' tier. When omitted,
            the notebook-level ``mediana`` is used.
        p75_cut: Inclusive upper bound of the 'Medio' tier. When omitted,
            the notebook-level ``percentil_75`` is used.

    Returns:
        'Indeterminado' when the balance is missing, 'Negativo' when it is
        below zero, 'Bajo' up to ``median_cut``, 'Medio' up to ``p75_cut``,
        and 'Alto' otherwise.
    """
    # A missing balance fails every comparison below and would otherwise be
    # silently labelled 'Alto'.
    if pd.isna(balance):
        return 'Indeterminado'
    if balance < 0:
        return 'Negativo'

    # Fall back to the notebook globals only when no explicit cut is given,
    # so the function can also be called without that hidden state.
    if median_cut is None:
        median_cut = mediana
    if p75_cut is None:
        p75_cut = percentil_75

    if balance <= median_cut:
        return 'Bajo'
    if balance <= p75_cut:
        return 'Medio'
    return 'Alto'

# Label every row with the tier returned by balance_group for its balance.
df_bank['BalanceGroup'] = df_bank['balance'].map(balance_group)


In [56]:
def balance_category(group):
    """Collapse a balance tier into a two-way category.

    Args:
        group: Tier label such as 'Negativo', 'Bajo', 'Medio' or 'Alto'.

    Returns:
        'Vulnerables' for 'Negativo'/'Bajo', 'Estables' for 'Medio'/'Alto',
        and 'Indeterminado' for any other value.
    """
    # Each entry pairs a category with the tier labels that belong to it.
    category_members = (
        ('Vulnerables', ('Negativo', 'Bajo')),
        ('Estables', ('Medio', 'Alto')),
    )
    for category, members in category_members:
        if group in members:
            return category
    return 'Indeterminado'

# Derive the two-way category from each row's balance tier.
df_bank['BalanceCategory'] = df_bank['BalanceGroup'].map(balance_category)


In [57]:
# Show how many rows fall into each tier and each category.
for column in ('BalanceGroup', 'BalanceCategory'):
    print(df_bank[column].value_counts())


BalanceGroup
Bajo        11449
Medio        6546
Alto         6542
Negativo     1639
Name: count, dtype: int64
BalanceCategory
Estables       13088
Vulnerables    13088
Name: count, dtype: int64


In [58]:
import numpy as np

# Drop missing balances before computing the cut points.
balance_series = df_bank['balance'].dropna()

# Tier boundaries: median and 75th percentile of the non-null balances.
mediana, percentil_75 = np.median(balance_series), np.percentile(balance_series, q=75)

# Report both cut points.
print(
    f"📌 Mediana (corte entre 'Bajo' y 'Medio'): {mediana}",
    f"📌 Percentil 75 (corte entre 'Medio' y 'Alto'): {percentil_75}",
    sep="\n",
)


📌 Mediana (corte entre 'Bajo' y 'Medio'): 551.5
📌 Percentil 75 (corte entre 'Medio' y 'Alto'): 1716.0


In [59]:
# Split 'balance' into four equal-frequency bins with a single qcut call.
# retbins=True also returns the bin edges, so the stored column and the
# printed limits come from the same computation instead of two separate
# calls with duplicated arguments.
quartis, bins = pd.qcut(df_bank['balance'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'], retbins=True)
df_bank['balance_quartile'] = quartis

print("📌 Limites dos quartis de 'balance':")
print(bins)

📌 Limites dos quartis de 'balance':
[-6847.    123.    551.5  1716.  81204. ]


In [60]:
# Print the three quartile boundaries of 'balance'.
for label, fraction in zip(("Q1:", "Q2:", "Q3:"), (0.25, 0.50, 0.75)):
    print(label, df_bank['balance'].quantile(fraction))

Q1: 123.0
Q2: 551.5
Q3: 1716.0


In [61]:
# Contingency table: balance quartile (rows) against default flag (columns).
tabela = pd.crosstab(index=df_bank['balance_quartile'], columns=df_bank['default'])
print("📊 Tabela de Contingência:", tabela, sep="\n")

📊 Tabela de Contingência:
default             no  yes
balance_quartile           
Q1                6215  328
Q2                6495   39
Q3                6517   25
Q4                6539    3


In [62]:
from scipy.stats import chi2_contingency

# Chi-square test of independence on the contingency table.
chi2, p, dof, expected = chi2_contingency(tabela)

# Report the statistic, p-value and degrees of freedom, then the expected
# frequencies labelled with the same rows/columns as the observed table.
print(
    "\n📈 Resultado do Teste Chi-cuadrado:",
    f"Chi²: {chi2:.4f}",
    f"Valor p (científico): {p:.2e}",
    f"Graus de liberdade: {dof}",
    "Frequências esperadas:",
    sep="\n",
)
print(pd.DataFrame(expected, index=tabela.index, columns=tabela.columns))

# Interpretation: compare the p-value against the significance level.
alpha = 0.05
print(
    "\n✅ Há uma relação significativa entre o quartil de saldo e o default."
    if p < alpha
    else "\n❌ Não há evidência suficiente de relação entre o quartil de saldo e o default."
)



📈 Resultado do Teste Chi-cuadrado:
Chi²: 726.8101
Valor p (científico): 3.22e-157
Graus de liberdade: 3
Frequências esperadas:
default                    no        yes
balance_quartile                        
Q1                6444.208478  98.791522
Q2                6435.344368  98.655632
Q3                6443.223577  98.776423
Q4                6443.223577  98.776423

✅ Há uma relação significativa entre o quartil de saldo e o default.
