# Prueba de independencia

### Modelos no paramétricos y de regresión
##### Por: Jorge Iván Reyes Hernández

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2

In [2]:
class IndependenceTest:
    """Pearson chi-square test of independence for a two-way contingency table.

    The table is a ``pandas.DataFrame`` of observed counts. On construction
    the row totals, column totals, grand total and the matrix of expected
    counts under independence are computed once and cached.

    Args:
        table: Observed counts, one row/column per category.

    Raises:
        TypeError: If ``table`` is not a ``pandas.DataFrame``.
    """

    def __init__(self, table):
        # Explicit raise instead of ``assert``: asserts vanish under ``-O``.
        if not isinstance(table, pd.DataFrame):
            raise TypeError(
                f"table must be a pandas DataFrame, got {type(table).__name__}"
            )
        self.__table = table
        self.__shape = table.shape
        self.__n_dot_i = table.sum(axis=0)  # column totals
        self.__n_i_dot = table.sum(axis=1)  # row totals
        self.__n = self.__n_dot_i.sum()     # grand total
        self.__expected_values = np.zeros(shape=self.__shape)
        self.__compute_expected_values()

    def __compute_expected_values(self):
        """Fill the expected-count matrix: E[i, j] = row_i * col_j / n."""
        # Work on plain arrays: integer keys on a label-indexed Series rely on
        # a deprecated positional fallback in pandas.
        row_totals = self.__n_i_dot.to_numpy(dtype=float)
        col_totals = self.__n_dot_i.to_numpy(dtype=float)
        self.__expected_values = np.outer(row_totals, col_totals) / self.__n

    def __str__(self):
        """Return a header with the table dimensions followed by the table."""
        text = f"Tabla de contingencia de {self.__shape[0]} x {self.__shape[1]}\n\n"
        text += str(self.__table)
        return text

    def __observed_chi(self):
        """Return ``(statistic, df)`` for the Pearson chi-square statistic.

        The statistic is the sum over all cells of
        ``(observed - expected) ** 2 / expected`` and the degrees of freedom
        are ``(rows - 1) * (columns - 1)``.
        """
        observed = self.__table.to_numpy(dtype=float)
        expected = self.__expected_values
        jiji = ((observed - expected) ** 2 / expected).sum()

        df = (self.__shape[0] - 1) * (self.__shape[1] - 1)

        return jiji, df

    def __measures_of_association(self):
        """Return a printable summary of chi-square based association measures.

        Reports Pearson's contingency coefficient ``sqrt(X2 / (X2 + n))`` and
        Cramér's V ``sqrt(X2 / (n * min(rows - 1, columns - 1)))``.
        """
        jiji, _ = self.__observed_chi()
        n = float(self.__n)
        contingency = np.sqrt(jiji / (jiji + n))
        # Cramér's V is undefined when one dimension has a single category.
        k = min(self.__shape) - 1
        cramers_v = np.sqrt(jiji / (n * k)) if k > 0 else float("nan")
        return (
            "Medidas de asociación:\n"
            f"  Coeficiente de contingencia de Pearson: {contingency:.3f}\n"
            f"  V de Cramér: {cramers_v:.3f}"
        )

    def run_test(self, alpha):
        """Run the test at significance level ``alpha`` and print the report.

        H_0 is rejected when the observed statistic exceeds the
        ``1 - alpha`` quantile of the chi-square distribution with
        ``(rows - 1) * (columns - 1)`` degrees of freedom. When H_0 is
        rejected, the measures of association are printed as well.

        Args:
            alpha: Significance level of the test.
        """
        jiji, df = self.__observed_chi()
        c_alpha = chi2.ppf(q=1 - alpha, df=df)
        # Keep the decision as a boolean; do not branch on the message text.
        reject = jiji > c_alpha
        print("H_0: 'Las variables son independientes' v.s H_a: 'Las variables están asociadas'")
        print(f"Rechazar H_0 si: jiji > {c_alpha:.3f}")
        print(f"Valor que tomó la estadística jiji: {jiji:.3f}")
        decision = "-> Rechazamos H_0" if reject else "->No rechazamos H_0"
        print(decision + f" con nivel de significancia {alpha*100}%")

        if reject:
            # Call the method; printing the bare attribute shows a bound-method repr.
            print(self.__measures_of_association())


In [4]:
# Observed counts: one row per category in X, one column per category in Y.
X = ["Local", "Foráneo"]
Y = ["Ingeniería", "Artes y Ciencias", "Economía", "Otras"]
table = np.array(
    [
        [16, 14, 13, 13],
        [14, 6, 10, 8],
    ]
)
data = pd.DataFrame(data=table, index=X, columns=Y)


In [5]:
# Build the test object from the contingency table held in `data`.
it = IndependenceTest(data)

In [6]:
# Display the table through IndependenceTest.__str__ (dimensions + counts).
print(it)

Tabla de contingencia de 2 x 4

         Ingeniería  Artes y Ciencias  Economía  Otras
Local            16                14        13     13
Foráneo          14                 6        10      8


In [7]:
# Run the independence test with alpha = 0.05; the report is printed.
it.run_test(0.05)

H_0: 'Las variables son independientes' v.s H_a: 'Las variables están asociadas'
Rechazar H_0 si: jiji > 7.815
Valor que tomó la estadística jiji: 1.524
->No rechazamos H_0 con nivel de significancia 5.0%
