# Imports

In [2]:
import os
import pickle
import sys

import polars as pl
from git_root import git_root
from sentence_transformers import SentenceTransformer

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
my_git_root = git_root()
sys.path.append(my_git_root)

# Importing Data for RAG

In [3]:
# Import embeddings for each topic

file_path = f'{my_git_root}/rag_docs/topic_embeddings.pkl'

with open(file_path, 'rb') as f:
    topic_embeddings = pickle.load(f)

In [4]:
# Import Topics for each Document
df_document_topics_path = f'{my_git_root}/rag_docs/topic_documents.csv'
df_document_topics = pl.read_csv(df_document_topics_path)

In [5]:
# Import valid data sources
df_documents_path = f'{my_git_root}/final_notebooks/data/chunked_documents_final.csv'
df_documents = pl.read_csv(df_documents_path)

In [6]:
embedding_model = SentenceTransformer('thenlper/gte-small', trust_remote_code=True)

In [7]:
user_input = "The documents in the context are excerpts from Research Data Management Policies. Please give three diverse examples of how data quality and metadata quality is handled and give a few example quotes to make your point."

In [8]:
input_embed = embedding_model.encode(user_input)

In [9]:
import numpy as np
from numpy.linalg import norm

def closest_vector(vector_dict, new_vector):
    """
    Find the vector in the dictionary closest to the new vector using cosine similarity.

    Parameters:
        vector_dict (dict): A dictionary where keys are indices and values are vectors (lists or numpy arrays).
        new_vector (list or numpy array): The vector to compare against.

    Returns:
        tuple: The index and the closest vector from the dictionary.
    """
    # Ensure the new vector is a numpy array
    new_vector = np.array(new_vector)

    # Initialize variables to store the best match
    best_index = None
    best_similarity = -1  # Cosine similarity ranges from -1 to 1

    for index, vector in vector_dict.items():
        vector = np.array(vector)

        # Compute cosine similarity: (A · B) / (||A|| ||B||)
        similarity = np.dot(vector, new_vector) / (norm(vector) * norm(new_vector))

        # Update the best match if this similarity is higher
        if similarity > best_similarity:
            best_similarity = similarity
            best_index = index

    return best_index, vector_dict[best_index]

In [10]:
def get_context(user_input, embedding_model, topic_embeddings, policy_name, df_documents):
    input_embed = embedding_model.encode(user_input)
    closest_idx, closest_vec = closest_vector(topic_embeddings, input_embed)
    df_document_topics_filtered = df_document_topics.filter(pl.col('topic')==closest_idx)
    if policy_name is not None:
        assert policy_name in df_documents, 'Could not find policy "{}"'.format(policy_name)
        df_document_topics_filtered = df_document_topics_filtered.filter(pl.col('name')==policy_name)

    g = df_document_topics_filtered.group_by('name', maintain_order=True).agg(pl.col('text')).to_dicts()

    context = ''

    for entry in g:
        context += f'{entry["name"]}:\n{", ".join(entry["text"])}\n\n'
    return context

In [11]:
from transformers import BitsAndBytesConfig
import torch

use_4bit = True
use_8bit = False

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [12]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    load_in_8bit=use_8bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
model_id = "tiiuae/Falcon3-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 2000,
    "return_full_text": False,
    "temperature": None,
    "do_sample": False,
}



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda


# First question

In [14]:
user_input = "Please tell me how the documents implement the FAIR principles in practice. Give me three examples, and use quotations from the documents."

policy_name = None

In [15]:
def ask_llm(user_input, embedding_model, topic_embeddings, df_documents, pipe, policy_name=None):
    context = get_context(user_input, embedding_model, topic_embeddings, policy_name, df_documents)
    if context == '':
        return 'I was not able to find any documents for you.'
    print(context)
    prompt = f'Please answer the following question based on the given context:\n{user_input}\nContext:\n{context}'
    return pipe(prompt, **generation_args)[0]['generated_text']

In [16]:
output = ask_llm(user_input, embedding_model, topic_embeddings, df_documents, pipe, policy_name)

eth-zurich.md:
4 FAIR Principles are internationally recognised guidelines to improve the Findability, Accessibility, Interoperability, and Reuse of digital assets. The principles emphasise machine-actionability (for details see Annex A).15, Art. 6   Publication of Research Data and Programming Code  

1 Publication  

a. Research Data and Programming Code that are considered as directly relevant for a result publication based on Community Standards must be published and deposited in a FAIR repository along with rich, openly available Metadata.  

(i) If there are limitations for sharing relevant raw data online because sharing is technically or economically not feasible, FAIR allows for publishing Metadata only which contain information on how raw data can be accessed if necessary.  

(ii) In the case of long-range data collection projects, the Research Data and Programming Code that are relevant for a result publication may be defined as a subset and may be aggregated.  

b. Research

In [17]:
from IPython.display import Markdown, display

display(Markdown(output))

<|assistant|>
The documents implement the FAIR principles in practice through several key strategies and guidelines. Here are three examples:

1. **Publication and Metadata Management**: The documents emphasize the importance of publishing research data and metadata in a way that adheres to the FAIR principles. For instance, ETH Zurich's guidelines state that research data and programming code must be published along with rich, openly available Metadata, ensuring that they are findable and accessible. This practice involves assigning unique identifiers to data and metadata, which enhances their findability, and using standardized vocabularies to ensure interoperability.

2. **Data Accessibility and Interoperability**: Helmholtz Munich's guidelines highlight the importance of making data accessible through web interfaces or dedicated data publications, which can include open distribution of data and metadata. This approach ensures that data is accessible and interoperable, as it can be accessed by both humans and machines, and is stored in a trusted repository using open file formats and common standards.

3. **Data Reuse and Licensing**: The University of Edinburgh's guidelines stress the need for research data to be reusable, which involves providing clear data usage licenses and detailed provenance information. This ensures that data can be reused ethically and scientifically, and that the conditions for access and reproduction are clearly stated, promoting the FAIR principle of reusability.

These examples illustrate how the documents operationalize the FAIR principles by focusing on findability, accessibility, interoperability, and reusability, ensuring that research data is prepared in a way that maximizes its utility and longevity.

# Second question

In [25]:
user_input = "What are standard practices for data retention and deletion in practice? Please use exact quotes from the documents."

policy_name = None

In [26]:
df_documents

name,text
str,str
"""aalto-university.md""","""Aalto University"""
"""aalto-university.md""","""Aalto University Research Data…"
"""aalto-university.md""","""The research data management p…"
"""aalto-university.md""","""The data management policy sha…"
"""aalto-university.md""","""Ownership of copyright protect…"
…,…
"""wrexham-university.md""","""FAIR Data Where it is lawful …"
"""wrexham-university.md""","""Prifysgol Wrecsam Wrexham Univ…"
"""wrexham-university.md""","""Reporting a Data Incident/Brea…"
"""wrexham-university.md""","""Other Polices, Procedures, Leg…"


In [27]:
output = ask_llm(user_input, embedding_model, topic_embeddings, df_documents, pipe, policy_name)

hasselt-university.md:
9.5 Disposal of research data  

Researchers should destroy research data that are not (no longer) relevant for (re)use, verification, valorization or data collection – at least insofar as legal, contractual or disciplinary regulations don’t deviate herefrom. Besides, researchers leaving Hasselt University should comply with yet additional measures, as set forth in §12.

lund-university.md:
Data eligible for disposal are automatically disposed of at the end of the retention period.

max-delbrück-center-for-molecular-medicine.md:
Deletion of data  

Deletion or destruction of research data and records,either after expiration of the retention period or for legal or ethical reasons, has to be carried out considering contractual obligations of third-party funders and other stakeholders, including collaboration partners.Such actions should be documented and be accessible for future audit. Backup data copies should also be deleted.Automated deletion of research data is

In [28]:
from IPython.display import Markdown, display

display(Markdown(output))

<|assistant|>
"Researchers should destroy research data that are not (no longer) relevant for (re)use, verification, valorization or data collection – at least insofar as legal, contractual or disciplinary regulations don’t deviate herefrom."

"Data eligible for disposal are automatically disposed of at the end of the retention period."

"Deletion or destruction of research data and records,either after expiration of the retention period or for legal or ethical reasons, has to be carried out considering contractual obligations of third-party funders and other stakeholders, including collaboration partners. Such actions should be documented and be accessible for future audit. Backup data copies should also be deleted.Automated deletion of research data is to be avoided. Plans outlining data deletion and destruction should be documented and agreed upon between the principal investigators or data producers with the IT department; see section 5 for more details on roles and responsibilities."

"If Research Data is to be deleted or destroyed, either because the agreed retention period has expired or for legal or ethical reasons, this should be done so in accordance with all legal, ethical, research funder and collaborator requirements with particular attention to confidentiality and security."

"Research data must be retained and disposed of securely according to the relevant retention and disposal schedule. This can be found at:, 7.10 If research data are to be deleted or destroyed, this should be done in accordance with all legal, ethical and funder requirements and only after the written approval of the Head of Division."

"If both “Hold” and “Delete” policies apply to a site, then both processes will take effect. If the retention hold is greater than the delete then files will not be deleted until after the hold period."

"Research data should be disposed of securely, in accordance with the DMP and the University’s Information Security Policy. The timing of data disposal should reflect any ethical, contractual and legal requirements, and current best practice."

"Data at the University of Suffolk are retained and disposed of according to need. The overarching principle is that data should only be retained and stored for as long as such data have a legitimate purpose, and thereafter they should be disposed of securely."

"If research data is to be deleted or destroyed, either because the agreed period of retention has expired or for legal or ethical reasons, this should be done so in accordance with all legal, ethical, research funder and collaborator requirements and with particular concern for confidentiality and security."

"Any destruction of Research Data shall be in accordance with the terms of this Policy, the retention period set out in the Data Management Plan and any legal and funder requirements, whichever is the most stringent."

"Disposal of data

Data that do not substantiate published research findings and do not have long-term value should be disposed of when they no longer serve a purpose. Data that contain personal or confidential information must be disposed of securely in accordance with University guidelines."

"Disposal of data

Data that do not substantiate published research findings and do not have long-term value should be disposed of when they no longer serve a purpose. Data that contain personal or confidential information must be disposed of securely in accordance with University guidelines."

"If research data and associated documents are to be deleted or destroyed after the retention period has expired, or for legal or ethical reasons, these measures need to take legal and ethical considerations into account. The deletion must be documented and justified. When deciding whether to retain or delete data, the interests and contractual provisions of third-party funders and other parties, in particular contributors and cooperation partners, must be taken into account. Aspects of security and confidentiality must be considered."

"The disposal and destruction of research data must be undertaken in accordance with the University's Data Protection and Data Disposal Policy. The agreed data disposal time and process should be included in the Data Management Plan and any participant-facing documents. If data have been shared with partners or transferred to third parties in the course of the project, researchers should ensure that they have deleted them unless they have a legitimate basis for retaining them."

# Question 3

In [29]:
user_input = "Under what circumstances what actors are the owners of the data? Please use exact quotes from the documents."

policy_name = None

In [30]:
df_documents

name,text
str,str
"""aalto-university.md""","""Aalto University"""
"""aalto-university.md""","""Aalto University Research Data…"
"""aalto-university.md""","""The research data management p…"
"""aalto-university.md""","""The data management policy sha…"
"""aalto-university.md""","""Ownership of copyright protect…"
…,…
"""wrexham-university.md""","""FAIR Data Where it is lawful …"
"""wrexham-university.md""","""Prifysgol Wrecsam Wrexham Univ…"
"""wrexham-university.md""","""Reporting a Data Incident/Brea…"
"""wrexham-university.md""","""Other Polices, Procedures, Leg…"


In [31]:
output = ask_llm(user_input, embedding_model, topic_embeddings, df_documents, pipe, policy_name)

bangor-university.md:
DATA PROTECTION POLICY  

Bangor University takes its responsibilities with regard to the management of the requirements of the General Data Protection Regulation (GDPR) and the Data Protection Act 2018 (the Act) very seriously. This document provides the policy framework through which this effective management can be achieved and audited., 1. Purpose & Scope  

The purpose of this policy is to ensure that the University and the University’s staff and students comply with the provisions of the GDPR and the Act and with any other relevant legislation in jurisdictions in which the University operates when processing personal data. Any infringement of the Act will be treated seriously by the University and may be considered under disciplinary procedures.  

This policy applies to staff, students, agents of the University and any authorised processors of personal data held or owned by the University, regardless of where the data is held and, in respect of automaticall

In [32]:
from IPython.display import Markdown, display

display(Markdown(output))

<|assistant|>
The actors who are the owners of the data are the University and its staff who process or use personal data. This includes honorary staff/associates, contractors, hourly paid lecturers, and any students or interns carrying out work on behalf of the University. The University is responsible for, and must be able to demonstrate, compliance with the GDPR and data protection.

# Question 4

In [33]:
user_input = "For different universities, under what circumstances what actors are the owners of the data? Please use exact quotes from the documents."

policy_name = None

In [34]:
df_documents

name,text
str,str
"""aalto-university.md""","""Aalto University"""
"""aalto-university.md""","""Aalto University Research Data…"
"""aalto-university.md""","""The research data management p…"
"""aalto-university.md""","""The data management policy sha…"
"""aalto-university.md""","""Ownership of copyright protect…"
…,…
"""wrexham-university.md""","""FAIR Data Where it is lawful …"
"""wrexham-university.md""","""Prifysgol Wrecsam Wrexham Univ…"
"""wrexham-university.md""","""Reporting a Data Incident/Brea…"
"""wrexham-university.md""","""Other Polices, Procedures, Leg…"


In [35]:
output = ask_llm(user_input, embedding_model, topic_embeddings, df_documents, pipe, policy_name)

bangor-university.md:
DATA PROTECTION POLICY  

Bangor University takes its responsibilities with regard to the management of the requirements of the General Data Protection Regulation (GDPR) and the Data Protection Act 2018 (the Act) very seriously. This document provides the policy framework through which this effective management can be achieved and audited., 1. Purpose & Scope  

The purpose of this policy is to ensure that the University and the University’s staff and students comply with the provisions of the GDPR and the Act and with any other relevant legislation in jurisdictions in which the University operates when processing personal data. Any infringement of the Act will be treated seriously by the University and may be considered under disciplinary procedures.  

This policy applies to staff, students, agents of the University and any authorised processors of personal data held or owned by the University, regardless of where the data is held and, in respect of automaticall

In [36]:
from IPython.display import Markdown, display

display(Markdown(output))

<|assistant|>
For different universities, the owners of the data are typically the universities themselves, as they are often the data controllers. This means they are responsible for processing personal data in accordance with the General Data Protection Regulation (GDPR) and other relevant data protection laws. The universities are responsible for ensuring that personal data is processed lawfully, fairly, and transparently, and that it is kept secure and not transferred to countries outside the European Economic Area (EEA) without appropriate safeguards.

For example, in the provided documents:

- **Bangor University** states that it is responsible for the use made of personal data by anyone working on its behalf, and it must ensure that personal and special category data is kept securely and returned to the University on completion of the work.

- **Utrecht University** requires researchers to adhere to the principles and requirements outlined in the GDPR and to report intended processing in the university's Data Register.

- **Karlstad University** emphasizes the need to handle personal data securely and to obtain approval from the Ethical Review Authority for the processing of sensitive personal data or information of criminal offenses.

- **University of Suffolk** commits to complying with all relevant legislation, particularly the Data Protection Act 2018 and the GDPR, and to providing support to staff handling personal data to remain compliant.

- **University of Aberdeen** must collect and manage personal data in compliance with the General Data Protection Regulation (GDPR) and the UK Data Protection Act 2018.

- **University of Groningen** is subject to the General Data Protection Regulation (GDPR), which governs the processing of personal data both inside and outside the EU/EEA.

In summary, the universities are the primary owners and controllers of the data, and they are responsible for ensuring compliance with data protection laws and regulations.