In [None]:
import os
import sys

import pandas as pd
from git_root import git_root
from sentence_transformers import SentenceTransformer

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
my_git_root = git_root()
sys.path.append(my_git_root)

In [None]:
# Import Topics for each Document
df_documents_path = f'{my_git_root}/rag_docs/topic_documents.csv'
df_documents = pd.read_csv(df_documents_path)

In [None]:
documents = df_documents['text'].to_list()

In [None]:
embedding_model = SentenceTransformer('thenlper/gte-small', trust_remote_code=True)

In [None]:
document_vectors = {}

for index, document in enumerate(documents):
    document_vectors[index] = embedding_model.encode(document)

In [None]:
import numpy as np
from numpy.linalg import norm

def closest_n_vectors(vector_dict, new_vector, n=1):
    """
    Find the n closest vectors in the dictionary to the new vector using cosine similarity.

    Parameters:
        vector_dict (dict): A dictionary where keys are indices and values are vectors (lists or numpy arrays).
        new_vector (list or numpy array): The vector to compare against.
        n (int): The number of closest vectors to return.

    Returns:
        list: A list of tuples containing the index and the closest vectors from the dictionary.
    """
    # Ensure the new vector is a numpy array
    new_vector = np.array(new_vector)

    # Compute cosine similarity for all vectors in the dictionary
    similarities = []
    for index, vector in vector_dict.items():
        vector = np.array(vector)
        similarity = np.dot(vector, new_vector) / (norm(vector) * norm(new_vector))
        similarities.append((index, similarity))

    # Sort by similarity in descending order and get the top n results
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_n = similarities[:n]

    return [idx for idx, _ in top_n]

# Example usage:
vector_dict = {
    0: [1, 0, 0],
    1: [0, 1, 0],
    2: [0, 0, 1],
    3: [0.5, 0.5, 0]
}
new_vector = [0.9, 0.1, 0]

closest_vectors = closest_n_vectors(vector_dict, new_vector, n=2)
print(f"Closest vectors: {closest_vectors}")

In [None]:
from transformers import BitsAndBytesConfig
import torch

use_4bit = True
use_8bit = False

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    load_in_8bit=use_8bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
model_id = "tiiuae/Falcon3-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 2000,
    "return_full_text": False,
    "temperature": None,
    "do_sample": False,
}



In [None]:
def get_context(user_input, document_vectors, df_documents, n):
    user_embed = embedding_model.encode(user_input)
    relevant_documents_indices = closest_n_vectors(document_vectors, user_embed, n=n)

    context = ''
    for relevant_document_index in relevant_documents_indices:
        context += f'{df_documents.iloc[relevant_document_index]["name"]}:\n{df_documents.iloc[relevant_document_index]["text"]}\n\n'
    print(context)
    return context


In [None]:
user_input = "Please tell me how the documents implement the FAIR principles in practice. Give me three examples, and use quotations from the documents."

policy_name = None

In [None]:
def ask_llm(user_input, document_vectors, df_documents, pipe, n, policy_name=None):
    context = get_context(user_input, document_vectors, df_documents, n)
    if context == '':
        return 'I was not able to find any documents for you.'
    print(context)
    prompt = f'Please answer the following question based on the given context:\n{user_input}\nContext:\n{context}'
    return pipe(prompt, **generation_args)[0]['generated_text']

In [None]:
output = ask_llm(user_input, document_vectors, df_documents, pipe, 20)

In [None]:
from IPython.display import Markdown, display

display(Markdown(output))

In [30]:
user_input = "Name all universities implementing persistent identifiers for data. Please use exact quotes from the documents."

policy_name = None

In [31]:
output = ask_llm(user_input, document_vectors, df_documents, pipe, 100)

johannes-kepler-university-linz.md:
Research data are stored and made available in a suitable repository or archiving system and are referenced using persistent identifiers.

technical-university-vienna.md:
1.6. Persistent identifiers are identifiers attached to publications and research data that make them easier to link to and find. They must be unique and remain associated with the correct version of the publication or research data. Ideally, they provide access to digital objects through a resolver service, as digital object identifiers (DOIs) and handles do.

universität-bayreuth.md:
Research data should be stored with suitable metadata in a trustworthy, subject-specific repository or data centre/archive system or the institutional research data repository of the University of Bayreuth RDSpace@UBT16 for the long term and made as openly accessible as possible. To ensure long-term citability of research data, the use of persistent identifiers (e.g. DOI) is recommended. This makes th

In [None]:
print(output)

In [34]:
from IPython.display import Markdown, display

display(Markdown(output))

<|assistant|>
Based on the provided context, the following universities have implemented persistent identifiers for data:

- Johannes Kepler University Linz
- Technical University Vienna
- University of Bayreuth
- University of Hull
- University of Groningen
- Technical University Berlin
- University of St Andrews
- University of Graz
- University of Helsinki
- University of Westminster
- University of Bradford
- University of Utrecht
- Free University Berlin
- University of Cambridge
- University of Nottingham
- University of Copenhagen
- University of Essex
- University of East London
- The University of Exeter
- Technical University Delft
- University of Hull
- Royal College of Art
- University of Nottingham
- University of Greenwich
- University of Hertfordshire
- Radboud University
- Robert Gordon University
- Loughborough University
- University of Göttingen
- Leeds Trinity University
- University of Cambridge
- Ulster University
- University of Salford
- Brunel University London
- University of Groningen
- Glasgow Caledonian University
- University of East Anglia
- De Montfort University
- University of Cambridge
<|assistant|>
The universities that have implemented persistent identifiers for data, based on the provided context, are:

- Johannes Kepler University Linz
- Technical University Vienna
- University of Bayreuth
- University of Hull
- University of Groningen
- Technical University Berlin
- University of St Andrews
- University of Graz
- University of Helsinki
- University of Westminster
- University of Bradford
- University of Utrecht
- Free University Berlin
- University of Cambridge
- University of Nottingham
- University of Copenhagen
- University of Essex
- University of East London
- The University of Exeter
- Technical University Delft
- University of Hull
- Royal College of Art
- University of Nottingham
- University of Greenwich
- University of Hertfordshire
- Radboud University
- Robert Gordon University
- Loughborough University
- University of Göttingen
- Leeds Trinity University
- University of Cambridge
- Ulster University
- University of Salford
- Brunel University London
- University of Groningen
- Glasgow Caledonian University
- University of East Anglia
- De Montfort University
- University of Cambridge