In [1]:
import string
import nltk
import pandas as pd
import pickle
import sys
import numpy as np

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix, dok_matrix
from scipy.spatial.distance import minkowski, cosine

# Textual Data 

This project includes many steps of data processing
1. First the data is structured so that it can be easily processed with fewer memory
    - Only the title data will be used
2. The documents will be processed in order to generate their TF-IDF representation 
3. The SVD representation will be generated in order to compress the space used 

### Data Structuring

Chaging the tabled format of the data into an hashing structure that contains all the words and a list of sets with the words present in each document. <br>
Punctuation removed.

In [None]:
pd.read_csv("data/arxiv_data.csv").head()

In [None]:
titles = pd.read_csv("data/arxiv_data.csv").titles#.head()

In [None]:
def get_words(document):
    for word in nltk.tokenize.word_tokenize(document):
        if word not in string.punctuation:
            yield word.lower()

In [None]:
all_words = {}
docs = []
index = 0
for t in titles:
    current_doc = []
    for word in get_words(t):
        if word not in all_words:
            all_words|= {word:index}
            index+=1
        current_doc.append(word)
    docs.append(current_doc)

In [None]:
with open("data/arxiv_data.pickle", "wb") as f:
    pickle.dump((all_words, docs), f)

In [51]:
len(all_words), len(docs)

(25299, 51774)

### TF-IDF

The text data is transformed into TF-IDF structure

In [2]:
all_words, docs = pickle.load(open("data/arxiv_data.pickle", "rb"))

In [3]:
def count_words(words: list[str]) -> dict[str, int]:
    """
    Counts the ocurrence of each word in the document corpus.
    """
    return dict(zip(*np.unique(words, return_counts=True)))
    
    
def tf(word_counts: dict, i: int, TF_Matrix) -> None:
    """
    Computes the Term-Frequency vector of a document. 
    Operates in-memory in the Term-Frequency Matrix, receiving the index {i} that corresponds to the document.
    """
    counts = word_counts.values()
    if len(counts)==0: return {}
    max_value = max(counts)
    
    for word, counts in word_counts.items():
        TF_Matrix[i, all_words[word]] = counts/max_value
    
    
def calc_sparse_tf_matrix(docs: list[list[str]], TF_Matrix):
    """
    Computes the Term-Frequency Matrix
    """
    for i, doc in enumerate(docs):
        word_counts = count_words(doc)
        tf(word_counts, i, TF_Matrix)
    return TF_Matrix

In [4]:
def calc_idf(documents, IDF_Matrix):
    """
    Computes the Inverse Document Frequency in-memory.
    """
    N = len(documents)
    
    for doc in documents:
        for word in np.unique(doc):
            IDF_Matrix[ 0, all_words[word] ] += 1
    
    for i in range(IDF_Matrix.shape[1]):
        IDF_Matrix[0, i] = np.log2(N / IDF_Matrix[0, i] )
    return IDF_Matrix
    

In [5]:
def calc_tf_idf(all_words: dict[str, int],
                documents: list[list[str]]):
    
    TF = dok_matrix(np.zeros((len(docs), len(all_words))))
    IDF = dok_matrix(np.zeros((1, len(all_words))))
    
    TF = calc_sparse_tf_matrix(documents, TF).tocsr()
    IDF = calc_idf(documents, IDF).tocsr()
    
    print("TF size =", sys.getsizeof(pickle.dumps(TF))/1024**2, "MB")
    print("IDF size =", sys.getsizeof(pickle.dumps(IDF))/1024**2, "MB")
    
    TF_IDF = TF.multiply(IDF)
    return TF_IDF
    

In [6]:
TF_IDF = calc_tf_idf(all_words, docs)

TF size = 5.424110412597656 MB
IDF size = 0.28991127014160156 MB


In [7]:
sys.getsizeof(pickle.dumps(TF_IDF))/1024**2

5.424110412597656

### From the sparse TF-IDF Matrix, create the Singular Decomposition Values

The matrix is decomposed into R context values

In [8]:
U, S, V = svds(TF_IDF, k=30)

In [9]:
U.shape, S.shape, V.shape

((51774, 30), (30,), (30, 25299))

In [11]:
compressed_docs = (np.diag(S).T @ V)

In [19]:
k = 1
dists = [ cosine(
                compressed_docs[:, k], 
                compressed_docs[:, i]
            ) for i in range(compressed_docs.shape[1])
        ]
