# Chi-squared and TCOR




In [3]:
# 📌 This notebook assumes that corpus processing, tokenization and BoW construction were already performed in the notebook:
# 👉 'feature-extraction/bag_of_words.ipynb'

# The variables used here (such as `BoW_tr`, `tr_txt`, `V1`, `dict_indices1`) were built there.
# If you want to re-run the pipeline from scratch, check that notebook first.

> 🔗 **Note:** The corpus loading, tokenization and construction of the Bag of Words are done in
> [`bag_of_words.ipynb`](./feature-extraction/bag_of_words.ipynb)

## TCOR
TCOR measures relationships between terms (co-occurrence), weighted by:
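- tf (co-occurrence frequency): 1 + log(freq), where freq is the co-occurrence count of the two terms (0 when they never co-occur)
- itf (inverse term frequency): log(total_terms / (co_occurring_terms + 1)), where co_occurring_terms is the number of terms that co-occur with the row term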

Then each column of the matrix is scaled to unit Euclidean length (cosine normalization).
The resulting matrix represents each word by how strongly it co-occurs with every other word.

In [2]:
def compute_tcor(BoW):
    """Build a column-normalised term co-occurrence (TCOR) matrix.

    For the co-occurrence counts ``c = BoW.T @ BoW`` each entry ``[t, i]`` is
    ``tf * itf`` with

    * ``tf = 1 + log(c[t, i])`` when ``c[t, i] > 0`` and 0 otherwise;
    * ``itf = log(terms / (n_t + 1))`` where ``n_t`` is the number of
      non-zero entries in row ``t`` of ``c``.

    Every column is then divided by its Euclidean norm; columns whose norm is
    zero are left as they are.

    Parameters
    ----------
    BoW : array-like of shape (docs, terms)
        Dense document-term matrix.

    Returns
    -------
    numpy.ndarray of shape (terms, terms), dtype float64
        The weighted, column-normalised co-occurrence matrix.
    """
    # Work in float64 so the product below cannot wrap around when BoW is
    # stored in a narrow integer dtype (e.g. uint8).
    counts = np.asarray(BoW, dtype=np.float64)
    _, terms = counts.shape

    # Co-occurrence matrix between terms.
    co_occurrence = np.dot(counts.T, counts)
    co_occurring_terms = np.count_nonzero(co_occurrence, axis=1)

    # tf part: 1 + log(freq) on positive entries, 0 everywhere else.
    TCOR = np.zeros_like(co_occurrence)
    positive = co_occurrence > 0
    TCOR[positive] = 1.0 + np.log(co_occurrence[positive])

    # itf part: one weight per row term. It is negative when
    # n_t + 1 > terms, i.e. when a term co-occurs with every term.
    itf = np.log(terms / (co_occurring_terms + 1))
    TCOR *= itf[:, np.newaxis]

    # Cosine normalisation of each column; skip all-zero columns so that no
    # division by zero happens.
    column_norms = np.sqrt(np.square(TCOR).sum(axis=0))
    scalable = column_norms > 0
    TCOR[:, scalable] /= column_norms[scalable]

    return TCOR

In [3]:
# Build the TCOR matrix from the training bag-of-words. BoW_tr is not created
# in this notebook; it must already exist in the kernel.
TCOR_base = compute_tcor(BoW_tr)
print("Matriz TCOR calculada con dimensiones:", TCOR_base.shape)

Matriz TCOR calculada con dimensiones: (5000, 5000)


## Chi squared
For each word, compute its chi-square statistic, which measures how relevant the word is for distinguishing between categories.
The statistic compares the word's observed frequency in each class with the frequency expected if the word were independent of the class.
The higher the value, the more strongly the word is associated with a specific class.

In [4]:
def chiSquare(BoW, y):
    """Compute a chi-square score for every term of a document-term matrix.

    For each term, the observed value of a class is the sum of the term's
    column over the documents of that class. The expected value is the
    term's total over all documents multiplied by the fraction of documents
    that belong to the class. The score is the sum over classes of
    ``(observed - expected) ** 2 / expected``; classes whose expected value
    is zero contribute nothing.

    Parameters
    ----------
    BoW : array-like of shape (docs, terms)
        Dense document-term matrix.
    y : array-like of shape (docs,)
        Class label of each document. Lists are accepted.

    Returns
    -------
    numpy.ndarray of shape (terms,), dtype float64
        One chi-square score per term.
    """
    BoW = np.asarray(BoW)
    # A plain list would make ``y == cls`` a single bool, so coerce it.
    y = np.asarray(y)
    docs, terms = BoW.shape

    # Classes and their document counts do not depend on the term, so they
    # are computed once instead of once per term.
    classes, class_sizes = np.unique(y, return_counts=True)
    class_prob = class_sizes / docs

    # observed[c, t]: total of term t over the documents of class c.
    observed = np.zeros((classes.size, terms))
    for row, cls in enumerate(classes):
        observed[row] = BoW[y == cls].sum(axis=0)

    # expected[c, t] = (total of term t) * P(class c).
    expected = np.outer(class_prob, observed.sum(axis=0))

    # Only divide where the expected value is non-zero; the rest stays 0.
    contributions = np.zeros_like(expected)
    np.divide(
        (observed - expected) ** 2,
        expected,
        out=contributions,
        where=expected != 0,
    )
    return contributions.sum(axis=0)

In [5]:
# Number of top-scoring terms to keep.
k = 1000
chi2s = chiSquare(BoW_tr, tr_y)
# argsort is ascending, so the last k positions hold the k highest scores.
best_idx = np.argsort(chi2s)[-k:]
# Reverse lookup: vocabulary index -> word.
dict_indices_invertido = {
    position: word for word, position in dict_indices1.items()
}
target_words = [dict_indices_invertido[position] for position in best_idx]
t_words = target_words
# Each selected word maps back to the index it was looked up from, so the
# rows can be gathered directly with the selected indices.
target_matriz = TCOR_base[best_idx]
print(target_matriz.shape)

(1000, 5000)
