# Recommendation System

The goal here is to build a paper recommendation system using the embeddings spaces created by the classifiers. 

In [1]:
import torch
from classifier.article_dataset import ArticleDataset
from torch.utils.data import DataLoader
from classifier.models.mlp_classifier import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys, os
sys.path.append('../classifier')  # Add parent directory to Python path
from utils import custom_collate
from models.mlp_classifier import MLPClassifier
from models.bilstm_classifier import BiLSTMClassifier
from models.bilstmattention_classifier import BiLSTMAttentionClassifier

In [23]:
folder = "mlp"
experiment_name = "mlp_summary_fulldb"
checkpoint_path = os.path.join('..', 'classifier', 'experiments', folder, experiment_name, f'{experiment_name}.pth')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load(checkpoint_path, map_location=device)
hparams = checkpoint['hyperparameters']
checkpoint.keys(), hparams.keys()

(dict_keys(['model_state_dict', 'hyperparameters', 'dataset_filters']),
 dict_keys(['vocab_size', 'embedding_dim', 'hidden_dim', 'num_classes', 'num_hidden_layers', 'dropout', 'freeze_embeddings']))

In [24]:
model = MLPClassifier(
    vocab_size=hparams['vocab_size'],
    embedding_dim=hparams['embedding_dim'],
    hidden_dim=hparams['hidden_dim'],
    num_classes=hparams['num_classes'],
    num_hidden_layers=hparams['num_hidden_layers'],
    dropout=hparams['dropout']
).to(device)

model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

MLPClassifier(
  (embedding): Embedding(38894, 128)
  (dropout): Dropout(p=0.3, inplace=False)
  (input_layer): Linear(in_features=128, out_features=128, bias=True)
  (hidden_layers): ModuleList(
    (0): Linear(in_features=128, out_features=128, bias=True)
  )
  (output_layer): Linear(in_features=128, out_features=20, bias=True)
)

In [29]:
csv_file = "../classifier/data/articles.csv"
use_summary = True  
classification_level = "category"  # ou "sub_category"
selected_categories = None  # ou None pour toutes les catégories

dataset = ArticleDataset(csv_file, use_summary=use_summary,
                         classification_level=classification_level,
                         selected_categories=selected_categories)

filters = checkpoint.get('dataset_filters', {"min_freq": 5})
dataset.apply_filters(filters)

test_loader = DataLoader(dataset, batch_size=64, collate_fn=custom_collate)

In [30]:
word_embeddings = model.embedding.weight.detach()

In [31]:
dataloader = DataLoader(dataset, batch_size=64, collate_fn=custom_collate, shuffle=False)

In [34]:
def embed_all_in_batches():
    model.eval()
    all_emb = []
    with torch.no_grad():
        for padded_sequences, _ in dataloader:
            # move inputs to GPU
            padded_sequences = padded_sequences.to(device)
            # get embeddings on GPU
            emb = model.embedding(padded_sequences)  # [batch, seq_len, emb_dim]
            # build mask on same device
            mask = (padded_sequences != 0).unsqueeze(-1).float().to(device)
            emb = emb * mask
            # sum and average on GPU
            sum_emb = emb.sum(dim=1)
            lengths = mask.sum(dim=1)
            avg_emb = sum_emb / lengths.clamp(min=1)
            all_emb.append(avg_emb)
    # concatenation remains on GPU
    return torch.cat(all_emb, dim=0)

emb = embed_all_in_batches()

KeyboardInterrupt: 

In [35]:
emb.shape

torch.Size([1232614, 128])

## Recommendation system

now we have an embedding vector for each paper, we can use these to find similar papers.

### First, we start with the cosine similarity

In [36]:
import faiss

# normalization L2
emb_norm = emb.numpy().astype('float32')
faiss.normalize_L2(emb_norm)

# faiss index
d = emb_norm.shape[1]
index = faiss.IndexFlatIP(d)  
index.add(emb_norm)

def recommend_similar_papers(paper_idx: int, top_k: int = 5):
    query = emb_norm[[paper_idx]]
    D, I = index.search(query, top_k + 1)   
    idxs, sims = [], []
    for i, sim in zip(I[0], D[0]):
        if i != paper_idx and len(idxs) < top_k:
            idxs.append(int(i))
            sims.append(float(sim))
    return list(zip(idxs, sims))

In [42]:
# test
paper_idx = 0
recommendations = recommend_similar_papers(0, top_k=5)

print(f"Recommendations for paper: \n {dataset.data.iloc[paper_idx]['title']}")
print()
for idx, sim in recommendations:
    summary = dataset.data.iloc[idx]['title']
    print(f"title: {summary}")
    print(f"Similarity: {sim:.4f}")
    print()

Recommendations for paper: 
 Coexistence of distinct mobility edges in a 1D quasiperiodic mosaic
  model

title: Emergence of a superglass phase in the random hopping Bose-Hubbard model
Similarity: 0.6029

title: Solitonic in-gap modes in a superconductor-quantum antiferromagnet
  interface
Similarity: 0.5991

title: Bose polarons in ultracold atoms in one dimension: beyond the Fröhlich
  paradigm
Similarity: 0.5945

title: GPU Accelerated Discrete Element Method (DEM) Molecular Dynamics for
  Conservative, Faceted Particle Simulations
Similarity: 0.5916

title: Stochastic pump of interacting particles
Similarity: 0.5848



### Now we want to find the most similar paper to a given title not necessarily present in the dataset.

we embed the given using our classifier, then we find the most similar paper in the dataset using cosine similarity.

In [67]:
import re

def embed(text):
    words = re.findall(r'[a-z0-9]+', text.lower())
    idxs = [dataset.word_to_index.get(word, 0) for word in words]
    for w in words:
        if w not in dataset.word_to_index:
            print(f"Word '{w}' not found in vocabulary.")
    w_embs = word_embeddings[idxs] # [seq_len, embedding_dim]
    title = torch.mean(w_embs, dim=0)  # [embedding_dim]
    return title.cpu().numpy().astype('float32')

def recommend_by_title_embed(query_title: str, top_k: int = 5):
    q_emb = embed(query_title)
    # reshape to (1, dim) so FAISS can process it
    q_emb = q_emb.reshape(1, -1)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)   # on récupère top_k voisins
    return [(dataset.data.iloc[i]['title'], float(D[0, j]))
            for j, i in enumerate(I[0])]

ex_title = "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair"
ex_summary = """During Automated Program Repair (APR), it can be challenging to synthesize correct patches for real-world systems in general-purpose programming languages. Recent Large Language Models (LLMs) have been shown to be helpful "copilots" in assisting developers with various coding tasks, and have also been directly applied for patch synthesis. However, most LLMs treat programs as sequences of tokens, meaning that they are ignorant of the underlying semantics constraints of the target programming language. This results in plenty of statically invalid generated patches, impeding the practicality of the technique. Therefore, we propose Repilot, a general code generation framework to further copilot the AI "copilots" (i.e., LLMs) by synthesizing more valid patches during the repair process. Our key insight is that many LLMs produce outputs autoregressively (i.e., token by token), resembling human writing programs, which can be significantly boosted and guided through a Completion Engine. Repilot synergistically synthesizes a candidate patch through the interaction between an LLM and a Completion Engine, which 1) prunes away infeasible tokens suggested by the LLM and 2) proactively completes the token based on the suggestions provided by the Completion Engine. Our evaluation on a subset of the widely-used Defects4j 1.2 and 2.0 datasets shows that Repilot outperforms state-of-the-art techniques by fixing 27% and 47% more bugs, respectively. Moreover, Repilot produces more valid and correct patches than the base LLM with the same budget. While we focus on leveraging Repilot for APR in this work, the overall approach is also generalizable to other code generation tasks."""
recommendations = recommend_by_title_embed(ex_summary, top_k=10)
print(f"Recommendations for title: {ex_title}")
print()
for title, sim in recommendations:
    print(f"{title}")
    print(f"Similarity: {sim:.4f}")
    print()


Word 'however' not found in vocabulary.
Word 'ignorant' not found in vocabulary.
Word 'impeding' not found in vocabulary.
Word 'therefore' not found in vocabulary.
Word 'repilot' not found in vocabulary.
Word 'autoregressively' not found in vocabulary.
Word 'repilot' not found in vocabulary.
Word 'synergistically' not found in vocabulary.
Word 'synthesizes' not found in vocabulary.
Word 'prunes' not found in vocabulary.
Word 'completes' not found in vocabulary.
Word 'repilot' not found in vocabulary.
Word 'moreover' not found in vocabulary.
Word 'repilot' not found in vocabulary.
Word 'repilot' not found in vocabulary.
Recommendations for title: Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair

Copiloting the Copilots: Fusing Large Language Models with Completion
  Engines for Automated Program Repair
Similarity: 0.8389

TZ4Fabric: Executing Smart Contracts with ARM TrustZone
Similarity: 0.6729

Look Before You Leap: Enhancing A

## Observations 

It's useful if and only if one has a vague idea of a title and wants to find it. So it's basically a quick search engine.

Otherwise, it doesn't recommend anything useful, it's just close semanticly. 

It can also be useful if you want to get a list of papers in a broad area of research ex : recommendation system, kernel methods, etc.