# Bag of Words

The **Bag-of-Words (BoW)** representation is a basic technique in natural language processing (NLP) that converts text into a set of numbers. It calculates how often words appear in a document, ignoring the order and grammar.

In this approach, each document (e.g., a tweet) is represented as a vector. The length of the vector is equal to the size of the vocabulary, and each position in the vector represents a specific word in the vocabulary.

In [5]:
import numpy as np

In [6]:
import nltk
#nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
def get_texts_from_file(path_corpus,path_thruth):
    """
    Reads a corpus and its correspondig labels from files.
    Args:
        path_corpus (.txt): Path to the corpus file.
        path_thruth (.txt): Path to the labels file.
    Returns:
        tr_txt (list): List of tweets from the corpus.
        tr_y (list): List of labels corresponding to the tweets.
    """
    tr_txt=[]
    tr_y=[]    
    with open(path_corpus, 'r') as f_corpus, open(path_thruth, 'r') as f_thruth:
        for twitt in f_corpus:
            tr_txt.append(twitt) 
        for label in f_thruth:
            tr_y.append(int(label)) 
    return tr_txt,tr_y
tr_txt,tr_y=get_texts_from_file('./mex20_train.txt','./mex20_train_labels.txt')
tokenizer=TweetTokenizer()


In [7]:
from nltk.tokenize import RegexpTokenizer
import nltk

tokenizer = RegexpTokenizer(r'\w+') 
corpus = ' '.join(tr_txt)
tokens_regexp = tokenizer.tokenize(corpus) #tokekize the corpus using RegexpTokenizer
text_nltk = nltk.Text(tokens_regexp) 

In [10]:
def sortFreqDict(freqdict):
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux
corpus_palabras = [token for doc in tr_txt for token in tokenizer.tokenize(doc)]

fdist = nltk.FreqDist(corpus_palabras)
V = sortFreqDict(fdist)  
#V1=V[:5000]
dict_indices1= dict()
cont=0
for width,word in V:
    dict_indices1[word]=cont
    cont+=1


def build_bow_tr(tr_txt, V, dic_indices, mode="binary"):
    BoW = np.zeros((len(tr_txt), len(V)), dtype=float) 

    for i, doc in enumerate(tr_txt):
        tokens = [token.lower() for token in tokenizer.tokenize(doc)]
        fdist = nltk.FreqDist(tokens)  
        total_words = sum(fdist.values())  
        
        for word, freq in fdist.items():
            if word in dic_indices:
                index = dic_indices[word]
                if mode == "binary":
                    BoW[i, index] = 1  
                elif mode == "freq":
                    BoW[i, index] = freq / total_words  
                    
    return BoW

BoW_tr = build_bow_tr(tr_txt, V, dict_indices1, mode="freq") 
BoW_tr.shape

(5278, 14627)