In [2]:
# Importing libraries

import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import yaml
from git_root import git_root
from transformers import AutoTokenizer

In [3]:
# adding git_root and local imports
my_git_root = git_root()
sys.path.append(my_git_root)

from src import flatten, contains_text, extract_table

In [4]:
# importing data
df_documents_path = f'{my_git_root}/data/documents_cleaned.csv'
df_documents = pd.read_csv(df_documents_path)

In [5]:
# importing settings
with open(f'{my_git_root}/settings.yaml', 'r') as file:
    settings = yaml.safe_load(file)

print(settings)

sentence_transformer_model_name = settings['sentence_transformer_model_name']
sentence_transformer_max_tokens = settings['sentence_transformer_max_tokens']

{'sentence_transformer_model_name': 'thenlper/gte-small', 'sentence_transformer_max_tokens': 512, 'llm_model_name': 'tiiuae/Falcon3-7B-Instruct'}


In [6]:
# initializing tokenizer
tokenizer = AutoTokenizer.from_pretrained(sentence_transformer_model_name)

In [7]:
df_documents

Unnamed: 0,name,text
0,aalto-university.md,# Aalto University \n\n## Aalto University Re...
1,aberystwyth-university.md,# Research Data Management Policy \n\nVersion...
2,aston-university.md,# Research Data Management Policy \n\nJuly 20...
3,bangor-university.md,# Data Protection Policy \n\n Rev Date Purpos...
4,brunel-university-london.md,# Brunel University Research Data Management P...
...,...,...
137,universität-rostock.md,# Research Data Policy of the University of Ro...
138,universität-siegen.md,# Research-Data-Policy of the University of Si...
139,universität-stuttgart.md,# Research data management policy of the Unive...
140,utrecht-university.md,# University policy framework for research dat...


In [8]:
document_names = df_documents['name'].to_list()
documents = df_documents['text'].to_list()

In [None]:
def remove_empty_chunks(document):
    return [chunk for chunk in document if bool(re.search(r'[a-zA-Z]+', chunk)) if chunk is not None]

In [9]:
def chunk_document_regex(document, regex_delimiters):
    documents_chunked = []
    for key, value in regex_delimiters.items():
        document_chunked = re.split(value, document)
        document_chunked = [chunk for chunk in document_chunked if chunk is not None]
        documents_chunked.append(document_chunked)

    documents_chunked_len = [len(document_chunked) for document_chunked in documents_chunked]
    max_index = np.argmax(documents_chunked_len)
    max_len = documents_chunked_len[max_index]
    return documents_chunked, max_index, max_len

In [20]:
documents_chunked = []
chunk_history = []

explicit_delimiters = {
        'headings': rf'\s*#{{1,{1}}}\s*',
        'list': rf'\n\s*#*\s*\d+\.(\d+\.?){{0,{1-1}}}\s+',
}

for document in documents:
     document_chunked, max_index, max_len = chunk_document_regex(document, explicit_delimiters)
     documents_chunked.append(document_chunked[max_index])
     my_delimiter = list(explicit_delimiters.keys())[max_index]
     chunk_history.append(my_delimiter)

In [23]:
chunk_history

['headings',
 'list',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'list',
 'headings',
 'list',
 'headings',
 'list',
 'headings',
 'headings',
 'list',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'headings',
 'list',
 'l

In [24]:
np.unique_counts(chunk_history)

UniqueCountsResult(values=array(['headings', 'list'], dtype='<U8'), counts=array([121,  21]))

In [21]:
for document_chunked in documents_chunked:
    for chunk in document_chunked:
        print(chunk)
        tokens = tokenizer.tokenize(chunk)
        if len(tokens) > sentence_transformer_max_tokens:
            print(chunk)
            print('_' * 100)


Aalto University

Aalto University Research Data Management Policy  

The research data management policy aims to make data management easier for the individual researcher. Managing and curating data creates competitive edge, and allocating resources for that is a strategic choice of the university. The purposes of this policy are to (1) encourage to informed decision making in research data management and (2) define the principles used in the opening of publicly funded scientific research data to achieve wide societal impact and the strategic goals of Aalto University.  

The data management policy shall be implemented through conscious and strategic decisions. Open access publishing of research data aims to make research data easily discoverable, assessable, intelligible, usable and interoperable. Where applicable, these requirements can also be applied to data other than open access published research data, and to the software needed to handle the open data. The decisions to publis

In [138]:
def chunk_document(document, chunk_history):
    result = []

    headings_level = chunk_history.count('headings') + 1
    list_level = chunk_history.count('list') + 1
    explicit_delimiters = {
        'headings': rf'\s*#{{1,{headings_level}}}\s*',
        'list': rf'\n\s*#*\s*\d+\.(\d+\.?){{0,{list_level-1}}}\s+',
    }

    implicit_delimiters = {
        'paragraphs': r'\s*\n\s*\n\s*',
    }

    fallback_delimiters = {
        'newlines': r'\s*\n\s*',
        'list_no_newlines': rf'#*\s*\d+\.(\d+\.?){{0,{list_level-1}}}\s+',
    }

    documents_chunked, max_index, max_len = chunk_document_regex(document, explicit_delimiters)

    if max_len > 1:
        my_delimiter = list(explicit_delimiters.keys())[max_index]
        print(my_delimiter)
        chunk_history.append(my_delimiter)
    else:
        documents_chunked, max_index, max_len = chunk_document_regex(document, implicit_delimiters)
        my_delimiter = list(implicit_delimiters.keys())[max_index]
        print(my_delimiter)
        if max_len > 1:
            chunk_history.append(my_delimiter)
        else:
            documents_chunked, max_index, max_len = chunk_document_regex(document, fallback_delimiters)
            my_delimiter = list(fallback_delimiters.keys())[max_index]
            print(my_delimiter)
            if max_len > 1:
                chunk_history.append(my_delimiter)
            else:
                assert False

    for chunk in documents_chunked[max_index]:
        chunk_tokens = tokenizer.tokenize(chunk)
        if len(chunk_tokens) <= sentence_transformer_max_tokens:
            result.append(chunk)
        else:
            result.append(chunk_document(chunk, chunk_history))
    return result

In [144]:
def chunk_document(document, chunk_history):
    result = []

    headings_level = chunk_history.count('headings') + 1
    list_level = chunk_history.count('list') + 1

    delimiters = {
        'explicit': {
            'headings': rf'\s*#{{1,{headings_level}}}\s*',
            'list': rf'\n\s*#*\s*\d+\.(\d+\.?){{0,{list_level-1}}}\s+',
        },
    }

    for delimiter_type in delimiters.values():
        documents_chunked, max_index, max_len = chunk_document_regex(document, delimiter_type)
        if max_len > 1:
            my_delimiter = list(delimiter_type.keys())[max_index]
            chunk_history.append(my_delimiter)
            break
    else:
        print(document)

    for chunk in documents_chunked[max_index]:
        chunk_tokens = tokenizer.tokenize(chunk)
        if len(chunk_tokens) <= sentence_transformer_max_tokens:
            result.append(chunk)
        else:
            result.append(chunk_document(chunk, chunk_history))
    return result

In [140]:
def remove_empty_chunks(document):
    return [chunk for chunk in document if bool(re.search(r'[a-zA-Z]+', chunk))]

In [145]:
names = []

documents_chunked = []

for index, document in enumerate(documents):
    document_chunked = chunk_document(document, list())
    document_chunked = flatten(document_chunked)
    document_chunked = remove_empty_chunks(document_chunked)
    names.append([document_names[index]] * len(document_chunked))
    documents_chunked.append(document_chunked)

names = flatten(names)
documents_chunked_flat = flatten(documents_chunked)

Aalto University Research Data Management Policy  

The research data management policy aims to make data management easier for the individual researcher. Managing and curating data creates competitive edge, and allocating resources for that is a strategic choice of the university. The purposes of this policy are to (1) encourage to informed decision making in research data management and (2) define the principles used in the opening of publicly funded scientific research data to achieve wide societal impact and the strategic goals of Aalto University.  

The data management policy shall be implemented through conscious and strategic decisions. Open access publishing of research data aims to make research data easily discoverable, assessable, intelligible, usable and interoperable. Where applicable, these requirements can also be applied to data other than open access published research data, and to the software needed to handle the open data. The decisions to publish software, other t

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Aalto University Research Data Management Policy  

The research data management policy aims to make data management easier for the individual researcher. Managing and curating data creates competitive edge, and allocating resources for that is a strategic choice of the university. The purposes of this policy are to (1) encourage to informed decision making in research data management and (2) define the principles used in the opening of publicly funded scientific research data to achieve wide societal impact and the strategic goals of Aalto University.  

The data management policy shall be implemented through conscious and strategic decisions. Open access publishing of research data aims to make research data easily discoverable, assessable, intelligible, usable and interoperable. Where applicable, these requirements can also be applied to data other than open access published research data, and to the software needed to handle the open data. The decisions to publish software, other t

RecursionError: maximum recursion depth exceeded

In [14]:
print(df_documents.iloc[141]['text'])

# Prifysgol Wrecsam Wrexham University

# Research Data Management Policy 2024

Approved 11th July 2024, Academic Board: 23.116.5

## Contents

Introduction
Purpose…
Scope 2
Definitions. 3
Principles . 3
Collection. 3
Ownership. 3
Organisation and Technical Measures 3
Data Management Plans and Data Protection Impact Assessments 3
Storage . 5
Data Sharing. 5
Transferring Data Outside the EU 6
Retention . 6
Disposal.. 7
Access .. 7
FAIR Data. 7
Reporting a Data Incident/Breach. 8
Other Polices, Procedures, Legislation. . 8
Wrexham University Policy and Procedure . 8
Legislation... 8

# Prifysgol Wrecsam Wrexham University

## Introduction

Wrexham University is committed to maintaining the highest standards of ethics, rigour, and integrity in all its research. It seeks to protect the dignity, rights, and welfare of all those involved in the research it produces. The University recognises its duty to safeguard the confidentiality of personal research data in keeping with its responsibilit

In [15]:
documents_chunked[141]

['Prifysgol Wrecsam Wrexham University',
 'Research Data Management Policy 2024\n\nApproved 11th July 2024, Academic Board: 23.116.5',
 'Contents\n\nIntroduction\nPurpose…\nScope 2\nDefinitions. 3\nPrinciples . 3\nCollection. 3\nOwnership. 3\nOrganisation and Technical Measures 3\nData Management Plans and Data Protection Impact Assessments 3\nStorage . 5\nData Sharing. 5\nTransferring Data Outside the EU 6\nRetention . 6\nDisposal.. 7\nAccess .. 7\nFAIR Data. 7\nReporting a Data Incident/Breach. 8\nOther Polices, Procedures, Legislation. . 8\nWrexham University Policy and Procedure . 8\nLegislation... 8',
 'Prifysgol Wrecsam Wrexham University',
 "Introduction\n\nWrexham University is committed to maintaining the highest standards of ethics, rigour, and integrity in all its research. It seeks to protect the dignity, rights, and welfare of all those involved in the research it produces. The University recognises its duty to safeguard the confidentiality of personal research data in kee

In [16]:
len(names)

3314

In [17]:
df_chunked = pd.DataFrame(data={
    'name': names,
    'text': documents_chunked,
})

ValueError: All arrays must be of the same length

In [56]:
df_chunked

Unnamed: 0,name,text
0,aalto-university.md,Aalto University
1,aalto-university.md,Aalto University Research Data Management Policy
2,aalto-university.md,The research data management policy aims to ma...
3,aalto-university.md,The data management policy shall be implemente...
4,aalto-university.md,Ownership of copyright protected research data...
...,...,...
4027,wrexham-university.md,"FAIR Data\n\nWhere it is lawful to do so, the ..."
4028,wrexham-university.md,Prifysgol Wrecsam Wrexham University\n\nPublic...
4029,wrexham-university.md,Reporting a Data Incident/Breach\n\nThe UK GDP...
4030,wrexham-university.md,"Other Polices, Procedures, Legislation\n\nThis..."


In [22]:
df_chunked.to_csv(f'{my_git_root}/data/documents_chunked.csv', index=False)

In [23]:
df_chunked.shape

(4032, 2)

In [15]:
assert False

AssertionError: 

In [35]:
index = 7

In [36]:
print(df_documents.iloc[index]['text'])

# ICBM Research Data Management Policy  

## Background & Motivation  

Data, records, and sample related information are fundamental to both high quality research and academic integrity: to verify and defend the process and outcomes of research, accurate and retrievable research data are essential. Increasing amounts of collected and produced raw data, as well as processed data, need to be retained in order to exploit their long-term value for research, teaching, and for wider exploitation for the public good.  

The purpose of data management is to maximise the academic value of research data by ensuring that such data is managed according to good practices for collection, curation, storage, management, retrieval, re-use, sharing, archiving, and access, appropriate for the data and discipline concerned.  

## Objectives  

This policy defines research data as all data, records, and sample related information created or collected for the purposes of analysis to generate original resea

In [37]:
print(df_documents.iloc[index]['name'])
print('_'*100)
for chunk in flatten(documents_chunked[index]):
    print(chunk)
    print('_'*50)


carl-von-ossietzky-universität-oldenburg.md
____________________________________________________________________________________________________

__________________________________________________
ICBM Research Data Management Policy
__________________________________________________
Background & Motivation  

Data, records, and sample related information are fundamental to both high quality research and academic integrity: to verify and defend the process and outcomes of research, accurate and retrievable research data are essential. Increasing amounts of collected and produced raw data, as well as processed data, need to be retained in order to exploit their long-term value for research, teaching, and for wider exploitation for the public good.  

The purpose of data management is to maximise the academic value of research data by ensuring that such data is managed according to good practices for collection, curation, storage, management, retrieval, re-use, sharing, archiving, and ac

In [38]:
documents_chunked_flat = flatten(documents_chunked)

In [39]:
documents_chunked_flat = [chunk for chunk in documents_chunked_flat if chunk is not None]

In [40]:
len(documents_chunked_flat)

5386

In [41]:
document_chunked_flat_len = [len(chunk) for chunk in documents_chunked_flat]

In [42]:
px.histogram(document_chunked_flat_len)

In [43]:
documents_chunked_flat_len = [len(chunk) for chunk in documents_chunked_flat]
max_index = np.argmax(documents_chunked_flat_len)

In [44]:
tokens = tokenizer.tokenize(documents_chunked_flat[max_index])
len(tokens)

502

In [45]:
print(documents_chunked_flat[max_index])

# Newcastle University Research Data Management Policy Principles & Code of Good Practice  

This document outlines the overall policy of the University and its expectations of its staff and key services. More detailed practical help and information can be found on the ‘Research Data oolkit’ website:    

olicy Principles 1. The project Principal Investigator (PI) at Newcastle University has overall responsibility for the appropriate storage, treatment (including making data sets suitable for publication) and security of research project data. PIs may delegate discrete responsibilities to other members of the project team and this should be recorded. 2. All research projects are encouraged to create a data management plan at the earliest practical stage of a project. The plan should outline a project’s approach to data, including costs and be reviewed regularly to ensure that practice remains in-line with expected standards. 3. All University staff are responsible for making themselves

In [46]:
len(documents_chunked_flat)

5386

In [47]:
documents_chunked_flat.to_csv(f'{my_git_root}/data/documents_chunked.csv')

AttributeError: 'list' object has no attribute 'to_csv'