# Playing with RAG

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
from IPython.display import display, Markdown
import json
from tqdm.auto import tqdm

from transformers import AutoTokenizer

# Draw and immediately close an empty figure before touching rcParams.
# NOTE(review): presumably a warm-up so the notebook backend is initialised
# before the defaults below are applied — confirm it is still needed.
plt.plot()
plt.close('all')

# Notebook-wide plotting defaults (plt.rcParams and mpl.rcParams are the same object).
mpl.rcParams.update({
    "figure.figsize": (20, 5),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Load model

- https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5
- https://huggingface.co/spaces/mteb/leaderboard

It is the best-ranked model that is open and has an acceptable size.

In [None]:
# Requires sentence_transformers>=2.7.0

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Two near-identical sentences used as a quick smoke test of the embedding model.
sentences = ['That is a happy person', 'That is a very happy person']

# Load the embedding model on the CPU.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run on load — confirm the source is trusted.
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5',
                            trust_remote_code=True,
                            device='cpu')
embeddings = model.encode(sentences)
# Print the cosine similarity between the two sentence embeddings.
print(cos_sim(embeddings[0], embeddings[1]))


## Load problems data

In [None]:
# Load the test problems and order the rows by problem type.
test = (
    pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_test_5.csv')
    .sort_values('type')
)
print(test.shape)
test.head()

In [None]:
# Load the train problems, keep only the 'Level 5' rows and order them by problem type.
# The filter and the sort are chained so that each step returns a new frame: calling
# sort_values(inplace=True) on the filtered frame mutates a derived slice and can
# raise SettingWithCopyWarning.
train = pd.read_csv('/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_train.csv')
train = train[train.level == 'Level 5'].sort_values('type')
print(train.shape)
train.head()

In [None]:
# Copy the problem statements out of both frames as standalone arrays.
# They follow the frames' current row order, i.e. sorted by 'type'.
test_problems = test.problem.values.copy()
train_problems = train.problem.values.copy()
# Show how many test and train problems there are.
len(test_problems), len(train_problems)

## Compute embeddings

In [None]:
# Embed every train problem statement with the sentence-embedding model.
train_embeddings = model.encode(train_problems)

In [None]:
# Embed every test problem statement with the same model.
test_embeddings = model.encode(test_problems)

It computes around 3 embeddings per second, which is fast enough for our application (only 50 embeddings need to be computed at test time).

## Study similarity matrix

In [None]:
# Cosine similarity between the test and train embeddings, converted to a NumPy array.
# The first axis is indexed by test problem and the second by train problem.
similarity = cos_sim(test_embeddings, train_embeddings).numpy()
similarity.shape

In [None]:
# Heatmap of the test-vs-train similarity matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(similarity, ax=ax)
ax.set_xlabel('Train')
ax.set_ylabel('Test')
ax.set_title('Cosine-Similarity between test and train problems');

There seems to be some diagonal structure; that is expected because the problems were sorted by problem type.

## Search for the n most similar problems

In [None]:
def show_n_most_similar_problems(idx, n=5):
    """Display one test problem followed by its most similar train problems.

    Reads the notebook-level ``similarity``, ``test``, ``train``,
    ``test_problems`` and ``train_problems`` objects.

    Args:
        idx: Position of the test problem (row of ``similarity``).
        n: How many train problems to show, most similar first.
    """
    # argsort is ascending, so reverse it before keeping the first ``n``.
    ascending_order = np.argsort(similarity[idx])
    top_train_ids = ascending_order[::-1][:n]

    test_type = test['type'].values[idx]
    display_markdown(f"### Test Problem {idx}. {test_type}\n\n {test_problems[idx]}")

    for train_id in top_train_ids:
        train_type = train['type'].values[train_id]
        score = similarity[idx, train_id]
        display_markdown(f"### Train Problem {train_id}. {train_type}, Similarity: {score:.2f}")
        display_markdown(f"{train_problems[train_id]}")

def display_markdown(text):
    """Render ``text`` as Markdown in the notebook output.

    The text is passed through ``uniform_latex_format`` before being
    wrapped in ``Markdown`` and displayed.
    """
    normalized = uniform_latex_format(text)
    display(Markdown(normalized))

def uniform_latex_format(text):
    r"""Rewrite LaTeX math delimiters in ``text`` as dollar-sign delimiters.

    ``\[`` and ``\]`` become ``' $ '``, ``\(`` and ``\)`` become ``'$'``,
    and ``\begin{align*}`` / ``\end{align*}`` become ``' $$ '``.

    Args:
        text: String that may contain LaTeX math delimiters.

    Returns:
        The string with every listed delimiter replaced.
    """
    substitutions = (
        ('\\[', ' $ '),
        ('\\]', ' $ '),
        ('\\(', '$'),
        ('\\)', '$'),
        ('\\begin{align*}', ' $$ '),
        ('\\end{align*}', ' $$ '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text


# Show the 5 train problems most similar to the test problem at position 50.
show_n_most_similar_problems(50, n=5)

It seems to be working well enough. Maybe we need something more advanced, but first we should measure how good this is on evaluation compared to using random problems.

In [None]:
# Inspect the raw text of the first test problem.
test.problem.values[0]