In [None]:
import nltk
import numpy as np
from collections import defaultdict
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Download required NLTK data.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer
# tables used by word_tokenize from 'punkt_tab', so both are requested.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
def preprocess_text(file_path):
    """
    Load a UTF-8 text file and turn it into lowercase alphanumeric tokens.
    
    Args:
    file_path (str): Path to the text file
    
    Returns:
    list: Lowercased tokens for which str.isalnum() is true, in document order
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    
    # Stopwords are kept on purpose; only tokens containing characters that
    # are not alphanumeric (punctuation, hyphenated forms, ...) are dropped.
    return [piece for piece in word_tokenize(lowered) if piece.isalnum()]

def create_term_term_matrix(tokens, window_size=5):
    """
    Count how often each pair of terms appears within a symmetric window.
    
    For every token position, each other position at most `window_size`
    steps to the left or right adds one to the (token, neighbour) cell.
    
    Args:
    tokens (list): List of preprocessed tokens
    window_size (int): Number of positions inspected on each side of a token
    
    Returns:
    tuple: (square integer matrix of co-occurrence counts, sorted vocabulary
    list whose order matches the matrix rows and columns)
    """
    vocab = sorted(set(tokens))
    index_of = {term: position for position, term in enumerate(vocab)}
    
    size = len(vocab)
    matrix = np.zeros((size, size), dtype=int)
    
    total = len(tokens)
    for center, current in enumerate(tokens):
        row = index_of[current]
        first = max(0, center - window_size)
        stop = min(total, center + window_size + 1)
        for neighbour in range(first, stop):
            # A position never co-occurs with itself.
            if neighbour == center:
                continue
            matrix[row, index_of[tokens[neighbour]]] += 1
    
    return matrix, vocab


In [None]:

def plot_heatmap(matrix, vocab, title, top_n=50):
    """
    Plot a heatmap of the term-term matrix restricted to the top N terms.
    
    Terms are ranked by their row sum in the matrix (total co-occurrence
    count), highest first. When the vocabulary holds fewer than `top_n`
    terms, all of them are drawn and the title reports the real number.
    
    Args:
    matrix (numpy.ndarray): Term-term matrix
    vocab (list): Vocabulary list, indexed like the matrix rows/columns
    title (str): Title for the heatmap
    top_n (int): Maximum number of top terms to include in the heatmap
    
    Raises:
    ValueError: If top_n is smaller than 1
    """
    # Guard first: with top_n == 0 the slice [-0:] below would select every
    # term, and a negative value would drop an arbitrary number of them.
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")
    
    # Rank terms by total co-occurrence count and keep the highest ones
    term_frequencies = matrix.sum(axis=1)
    top_indices = term_frequencies.argsort()[-top_n:][::-1]
    
    top_matrix = matrix[top_indices][:, top_indices]
    top_vocab = [vocab[i] for i in top_indices]
    
    plt.figure(figsize=(15, 13))
    sns.heatmap(top_matrix, xticklabels=top_vocab, yticklabels=top_vocab, cmap='YlOrRd')
    # Report the number of terms actually drawn, which may be below top_n
    plt.title(f'{title} (Top {len(top_vocab)} Terms)')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Main execution: locations of the two input texts.
# Both values are placeholders and must point at real UTF-8 text files.
br_constitution_path, us_constitution_path = (
    'path/to/brazilian_constitution.txt',
    'path/to/us_constitution.txt',
)

In [None]:
# Brazilian Constitution: tokenize the file, count windowed co-occurrences
# (default window size), then draw the heatmap of the top terms.
br_tokens = preprocess_text(file_path=br_constitution_path)
br_matrix, br_vocab = create_term_term_matrix(tokens=br_tokens)
plot_heatmap(
    matrix=br_matrix,
    vocab=br_vocab,
    title='Brazilian Constitution Term-Term Matrix',
)


In [None]:
# US Constitution: tokenize the file, count windowed co-occurrences
# (default window size), then draw the heatmap of the top terms.
us_tokens = preprocess_text(file_path=us_constitution_path)
us_matrix, us_vocab = create_term_term_matrix(tokens=us_tokens)
plot_heatmap(
    matrix=us_matrix,
    vocab=us_vocab,
    title='US Constitution Term-Term Matrix',
)


In [None]:
# Corpus size summary: total token count and vocabulary size per document
br_summary = f"Brazilian Constitution: {len(br_tokens)} tokens, {len(br_vocab)} unique terms"
us_summary = f"US Constitution: {len(us_tokens)} tokens, {len(us_vocab)} unique terms"
print(br_summary)
print(us_summary)


In [None]:
# Example: Find top co-occurring terms for a specific word in Brazilian Constitution
word = 'de'  # Portuguese for 'of'
if word in br_vocab:
    word_id = br_vocab.index(word)
    co_occurrences = br_matrix[word_id]
    # Exclude the query word by name. Its self co-occurrence count only
    # reflects repeats inside the window, so it is not guaranteed to rank
    # first; slicing off the top entry could discard the real best match.
    candidates = [
        (term, count)
        for term, count in zip(br_vocab, co_occurrences)
        if term != word
    ]
    top_10 = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:10]
    print(f"\nTop 10 co-occurring terms with '{word}' in Brazilian Constitution:")
    for term, count in top_10:
        print(f"{term}: {count}")
else:
    print(f"'{word}' not found in Brazilian Constitution vocabulary")


In [None]:
# Repeat for US Constitution with an English word
word = 'of'
if word in us_vocab:
    word_id = us_vocab.index(word)
    co_occurrences = us_matrix[word_id]
    # Exclude the query word by name. Its self co-occurrence count only
    # reflects repeats inside the window, so it is not guaranteed to rank
    # first; slicing off the top entry could discard the real best match.
    candidates = [
        (term, count)
        for term, count in zip(us_vocab, co_occurrences)
        if term != word
    ]
    top_10 = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:10]
    print(f"\nTop 10 co-occurring terms with '{word}' in US Constitution:")
    for term, count in top_10:
        print(f"{term}: {count}")
else:
    print(f"'{word}' not found in US Constitution vocabulary")