In [192]:
# preprocess everything
import json

# Specify the path to your JSON file
json_file_path = 'combined.json'

# Open and read the JSON file
with open(json_file_path, 'r') as file:
    # Load JSON data from the file
    data = json.load(file)

latex_list = [item["latex"] for item in data]
output_list = [item["output"] for item in data]

In [230]:
from eli5.sklearn import InvertableHashingVectorizer
from eli5.sklearn import FeatureUnhasher
from sklearn.feature_extraction.text import HashingVectorizer
import math
import numpy as np

class DocumentVectorizer:
    def __init__(self):
        self.internal_vec = HashingVectorizer(tokenizer=self.tokenize)
        self.vectorizer = InvertableHashingVectorizer(self.internal_vec)
        self.feature_names = None
        
        # Initialize a dynamic vocabulary and document frequencies
        self.vocabulary = set()
        self.document_frequencies = {}
        self.idf_values = {}
        self.term_indices = {}
        
        self.total_documents = 0

    def calculate_idf(self, term, document_frequencies, total_documents):
        df = document_frequencies[term]
        idf = math.log((total_documents + 1) / (df + 1)) + 1
        return idf
        
    def fit_transform(self, documents):
        """
        Fit the vectorizer on the provided documents and transform them into feature vectors.
        """
        self.total_documents = len(documents)
        tf_matrix = self.vectorizer.fit_transform(documents)
        
        self.feature_names = self.get_feature_names()
        
        for new_doc in documents:
            new_terms = self.tokenize(new_doc)
            processed = set()
            for new_term in new_terms:
                if new_term in processed:
                    continue
                processed.add(new_term)
                if new_term not in self.vocabulary:
                    self.vocabulary.add(new_term)
                    self.document_frequencies[new_term] = 1
                else:
                    self.document_frequencies[new_term] += 1
        
        self.idf_values = {term: self.calculate_idf(term, self.document_frequencies, self.total_documents) for term in self.vocabulary}
        
        self.term_indices = {}
        
        for idx, term in enumerate(self.feature_names):
            if type(term) is list:
                for x in term:
                    self.term_indices[x['name']] = idx
                
        return self.transform_internal(tf_matrix)
        
    def transform(self, document):
        """
        Transform the provided documents into feature vectors.
        """
        if self.feature_names is None:
            raise ValueError("Vectorizer has not been fitted. Call fit_transform first.")

        tf_matrix = self.vectorizer.transform(document)
        
        return self.transform_internal(tf_matrix)

    def transform_internal(self, tf_matrix):
        tfidf_matrix = tf_matrix.copy()
        
        for term, idx in self.term_indices.items():
            tfidf_matrix[:, idx] = tf_matrix[:, idx] * self.idf_values[term]
        
        return tfidf_matrix
    
    def get_feature_names(self):
        """
        Get the feature names (terms) used by the vectorizer.
        """
        return self.vectorizer.get_feature_names()
    
    def tokenize(self, input_str):
        symbols = [' ', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', ';', ':', '\'', '\"', ',', '.', '<', '>', '/', '?', '|']

        tokens = []
        curr_token = ''

        for c in input_str:
            if c in symbols:
                if curr_token:
                    tokens.append(curr_token)
                    curr_token = ''
                if c != ' ':
                    tokens.append(c)
            elif c == '\\':
                if curr_token:
                    tokens.append(curr_token)
                    curr_token = ''
                curr_token += c
            else:
                curr_token += c

        if curr_token:
            tokens.append(curr_token)

        return tokens

## test

In [231]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(input_str):
        symbols = [' ', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', ';', ':', '\'', '\"', ',', '.', '<', '>', '/', '?', '|']

        tokens = []
        curr_token = ''

        for c in input_str:
            if c in symbols:
                if curr_token:
                    tokens.append(curr_token)
                    curr_token = ''
                if c != ' ':
                    tokens.append(c)
            elif c == '\\':
                if curr_token:
                    tokens.append(curr_token)
                    curr_token = ''
                curr_token += c
            else:
                curr_token += c

        if curr_token:
            tokens.append(curr_token)

        return tokens

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

for i in range(len(documents)):
    documents[i] = documents[i].lower()

# Create the TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize, use_idf=True, smooth_idf=True, norm='l2', sublinear_tf=False)

# Learn the vocabulary and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the vocabulary mapping
vocabulary = vectorizer.get_feature_names_out()

# Display the TF-IDF values and corresponding vocabulary
print("TF-IDF Matrix:")

print("\nVocabulary Mapping:")
for feature_index, word in enumerate(vocabulary):
    print(f"Feature {feature_index}: {word}")
    for x in tfidf_matrix.toarray():
        print(x[feature_index])

TF-IDF Matrix:

Vocabulary Mapping:
Feature 0: .
0.4252064803598164
0.3251320334825997
0.3105526673072801
0.0
Feature 1: ?
0.0
0.0
0.0
0.592769307628588
Feature 2: and
0.0
0.0
0.4865407641485108
0.0
Feature 3: document
0.4252064803598164
0.6502640669651994
0.0
0.3783569705698032
Feature 4: first
0.5252145958082508
0.0
0.0
0.4673461307572138
Feature 5: is
0.34763415945982157
0.26581674173343006
0.2538971545683301
0.30933161538012166
Feature 6: one
0.0
0.0
0.4865407641485108
0.0
Feature 7: second
0.0
0.509382158560758
0.0
0.0
Feature 8: the
0.34763415945982157
0.26581674173343006
0.2538971545683301
0.30933161538012166
Feature 9: third
0.0
0.0
0.4865407641485108
0.0
Feature 10: this
0.34763415945982157
0.26581674173343006
0.2538971545683301
0.30933161538012166


In [232]:
vectorizer = DocumentVectorizer()

# Learn the vocabulary and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the vocabulary mapping
vocabulary = vectorizer.get_feature_names()

# Display the TF-IDF values and corresponding vocabulary
print("TF-IDF Matrix:")
print()

print("\nVocabulary Mapping:")
feature_num = 0
for feature_index, word in enumerate(vocabulary):
    if type(word) is not list:
        continue
    print(f"Feature {feature_num}: {word[0]['name']}")
    for x in tfidf_matrix.toarray():
        print(x[feature_index])
    
    feature_num += 1

TF-IDF Matrix:


Vocabulary Mapping:
Feature 0: is
0.4082482904638631
0.3333333333333333
0.3779644730092272
0.4082482904638631
Feature 1: this
0.4082482904638631
0.3333333333333333
0.3779644730092272
0.4082482904638631
Feature 2: third
0.0
0.0
0.7242898166052814
0.0
Feature 3: and
0.0
0.0
-0.7242898166052814
0.0
Feature 4: the
-0.4082482904638631
-0.3333333333333333
-0.3779644730092272
-0.4082482904638631
Feature 5: document
-0.4993462638159245
-0.8154290342094731
0.0
-0.4993462638159245
Feature 6: second
0.0
0.638763577291385
0.0
0.0
Feature 7: .
-0.4993462638159245
-0.40771451710473655
-0.4623048077871099
0.0
Feature 8: one
0.0
0.0
-0.7242898166052814
0.0
Feature 9: ?
0.0
0.0
0.0
-0.7823224153193689
Feature 10: first
-0.6167919780914652
0.0
0.0
-0.6167919780914652


In [229]:
## remember that cosine similarity is dot product if we use l2 norm!

In [219]:
vectorizer = DocumentVectorizer()

# Learn the vocabulary and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the vocabulary mapping
vocabulary = vectorizer.get_feature_names()

# Display the TF-IDF values and corresponding vocabulary
print("TF-IDF Matrix:")
print()


KeyboardInterrupt

