# Reading test files:

In [None]:
import os
import json

# Load every JSON-lines file found in `path` into `data_dict`, keyed by a
# two-letter language code taken from the file name.
path = "./test"
files = os.listdir(path)
data_dict = {}

for file in files:
    file_path = os.path.join(path, file)
    # os.listdir also returns sub-directories (e.g. notebook checkpoint
    # folders); opening one of those would raise, so only regular files are read.
    if not os.path.isfile(file_path):
        continue

    # Language code = last two characters of the file-name part before the first "-".
    lang = file.split("-")[0][-2:]
    data_list = []
    # The records contain non-ASCII text, so the encoding is fixed explicitly
    # instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a JSON document.
                continue
            data = json.loads(line)
            data_list.append(data)
    data_dict[lang] = data_list

# English
print(data_dict["en"])

[{'id': 'tst-en-1', 'lang': 'EN', 'model_input': 'Did Alberto Fouillioux ever play in a world cup championship?', 'model_output_text': ' No, Albero Foulois was not in any of the FIFA World Cup finals.\n', 'model_id': 'togethercomputer/Pythia-Chat-Base-7B', 'model_output_tokens': ['ĠNo', ',', 'ĠAlber', 'o', 'ĠF', 'oul', 'ois', 'Ġwas', 'Ġnot', 'Ġin', 'Ġany', 'Ġof', 'Ġthe', 'ĠFIFA', 'ĠWorld', 'ĠCup', 'Ġfinals', '.', 'Ċ'], 'model_output_logits': [-2.2868447304, 4.7311220169, 0.1059471965, 9.1925964355, 3.3247950077, 3.5171573162, 12.4979534149, 11.4856796265, 9.6516017914, 2.8469445705, 8.0528373718, 3.4117016792, 4.6727371216, 8.3649101257, 10.0596370697, 7.0108551979, 4.6345014572, 9.6700620651, 1.7395397425, 14.6126556396]}, {'id': 'tst-en-2', 'lang': 'EN', 'model_input': 'In which city was David Sandberg born?', 'model_output_text': 'David Sandburg was born in Stockholm, Sweden.', 'model_id': 'tiiuae/falcon-7b-instruct', 'model_output_tokens': ['David', 'ĠSand', 'burg', 'Ġwas', 'Ġborn'

# Setting up the NER pipeline:

In [None]:
import torch
# Index of the current CUDA device when a GPU is visible, otherwise None.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = None

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import gc

def load_model(model_id, alt_pipeline=False):
    """Build a token-classification pipeline for the checkpoint `model_id`.

    Args:
        model_id: Checkpoint identifier passed to `AutoTokenizer.from_pretrained`
            and `AutoModelForTokenClassification.from_pretrained`.
        alt_pipeline: When False (default) a standard "ner" pipeline with
            `aggregation_strategy="max"` is created on the module-level `device`.
            When True a "generic-ner" pipeline is created with
            `trust_remote_code=True`, pinned to the CPU.

    Returns:
        The pipeline object returned by `transformers.pipeline`.
    """
    # Collect garbage first so that tensors which are only kept alive by
    # unreachable Python objects are actually freed; only then can
    # empty_cache() hand their memory back to the GPU.
    gc.collect()
    torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForTokenClassification.from_pretrained(model_id)

    if not alt_pipeline:
        nlp = pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="max",
            device=device
        )
    else:
        # NOTE(review): this branch runs code shipped with the checkpoint
        # (trust_remote_code=True) and ignores `device` - only use it with
        # checkpoints you trust.
        nlp = pipeline(
            "generic-ner", model=model,
            tokenizer=tokenizer,
            trust_remote_code=True,
            device='cpu')

    return nlp

# Entity extraction:

In [None]:
# Registry of NER checkpoints and the language codes each one is used for.
# This dictionary is the single source of truth; the per-model lists below
# are derived from it so the two can never disagree.
ner_models = {
    "model_1": {
        "id": "julian-schelb/roberta-ner-multilingual",
        "langs": ["en","de", "fr", "zh", "it", "es", "hi", "bn", "ar", "ru", "uk", "pt", "ur", "id", "ja", "ne", "nl", "tr", "ca", "bg", "zh-yue"]
    },
    "model_2": {
        "id": "richielo/small-e-czech-finetuned-ner-wikiann",
        "langs": ["cs"]
    },
    # NOTE(review): the id reads like a base checkpoint - confirm that it
    # actually ships a fine-tuned token-classification head.
    "model_3": {
        "id": "ixa-ehu/berteus-base-cased",
        "langs": ["eu"]
    },
    "model_4": {
        "id": "Kansallisarkisto/finbert-ner",
        "langs": ["fi"]
    }
}

# Per-model language lists, kept for backward compatibility. Each one is an
# independent copy of the corresponding registry entry.

# julian-schelb/roberta-ner-multilingual
model_1_langs = list(ner_models["model_1"]["langs"])

# richielo/small-e-czech-finetuned-ner-wikiann
model_2_langs = list(ner_models["model_2"]["langs"])

# ixa-ehu/berteus-base-cased
model_3_langs = list(ner_models["model_3"]["langs"])

# Kansallisarkisto/finbert-ner
model_4_langs = list(ner_models["model_4"]["langs"])

# Report, for every loaded language, which NER model (if any) covers it.
# Models are tried in the order 1..4 and the first match wins.
for lang in data_dict:
    verdict = "NO"
    for number in (1, 2, 3, 4):
        if lang in ner_models[f"model_{number}"]["langs"]:
            verdict = f"YES - model {number}"
            break
    print(f"{lang} --- {verdict}")

fi --- YES - model 4
sv --- NO
de --- YES - model 1
en --- YES - model 1
es --- YES - model 1
fa --- NO
zh --- YES - model 1
ca --- YES - model 1
fr --- YES - model 1
ar --- YES - model 1
cs --- YES - model 2
eu --- YES - model 3
it --- YES - model 1
hi --- YES - model 1


# Testing the entity extraction:

In [None]:
# Smoke test: load the multilingual NER pipeline and run it on one sentence.
nlp = load_model(model_id="julian-schelb/roberta-ner-multilingual")

testquery = "In 1998, Stockholm was named European Capital of Culture."
test = nlp(testquery)
print(test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Device set to use cpu


[{'entity_group': 'LOC', 'score': 0.99116427, 'word': 'Stockholm', 'start': 9, 'end': 18}, {'entity_group': 'ORG', 'score': 0.73406106, 'word': 'EuropeanCapitalof', 'start': 29, 'end': 48}]




In [None]:
def _model_id_for(lang):
    """Return the id of the first model in `ner_models` listing `lang`, or None."""
    for config in ner_models.values():
        if lang in config["langs"]:
            return config["id"]
    return None


def _raw_span(text, ent):
    """Return the surface text of one pipeline entity.

    The character offsets are preferred because they give the text exactly as
    it appears in `text`. When the pipeline did not provide offsets
    (`start`/`end` missing or None) the entity's `word` field is used instead.
    """
    start, end = ent.get("start"), ent.get("end")
    if start is None or end is None:
        return ent["word"]
    return text[start:end]


def _collect_entities(text, entities):
    """Turn the pipeline output for `text` into a set of entity strings.

    Each span is cut at the first "?" and newlines are replaced by spaces.
    As soon as a PER entity whose raw span contains a space is found, it is
    returned as the only entity and everything else is discarded.
    """
    found = set()
    for ent in entities:
        raw = _raw_span(text, ent)
        # The input and output texts are joined without a separator;
        # presumably the cut at "?" trims spans that run past the end of a question.
        cleaned = raw.split("?")[0].replace("\n", " ")
        if ent.get("entity_group") == "PER" and len(raw.split(" ")) > 1:
            return {cleaned}
        found.add(cleaned)
    return found


# Annotate every entry of every supported language with an "entities" list.
# Entries of languages without a matching NER model are left untouched.
current_model_id = None
for lang in data_dict:
    print(lang)
    model_id = _model_id_for(lang)
    if model_id is None:
        continue

    # Only (re)load when the required checkpoint changes, so consecutive
    # languages served by the same model share one pipeline while still
    # keeping a single model in memory at a time.
    if model_id != current_model_id:
        nlp = load_model(model_id)
        current_model_id = model_id

    for entry in data_dict[lang]:
        text = entry['model_input'] + entry["model_output_text"]
        entities_set = _collect_entities(text, nlp(text))
        print(entities_set)
        # Sorted so the stored list (and the JSON written from it) is reproducible.
        entry["entities"] = sorted(entities_set)

fi


Device set to use cpu


{'David Sandberg', 'YouTubessa', 'AtomicDave'}
{'HIFK', 'HFF', 'Axel Oxenstierna', 'Helsingin Jalkapalloklubin', 'Helsingfors Fotbollsklubb', 'vuonna 1907', 'Helsingin Jalkapalloklubi'}
set()
{'Ranskan', "Provence-Alpes-Côte d'Azurin", 'Bouches-du-Rhônen departementissa', 'Aix-en-Provencesta', "Herpy-l'Arlésienne", 'Marseillesta', 'Herpy-l’Arlésienne'}
{'Yhdysvallat', 'IIHF', 'vuoden 1931 jääkiekon maailmanmestaruuskilpailuihin', 'vuoden 1939', 'vuonna 1939', 'vuoden 1931 jääkiekon maailmanmestaruuskilpailuissa', 'Yhdysvaltoja', 'jääkiekon maailmanmestaruuskilpailuihin', 'IIHF:ään'}
{'UEFA', 'Barcelonan', 'Moskovassa', 'UEFA Super Cupin', 'venäläisen', 'Venäjällä', 'Barcelona', 'Super Cupin', 'Vuoden 2011', 'Zenitin', 'espanjalaisen', '11. elokuuta 2011', 'UEFA Super Cup', 'vuoden 2011'}
{'Moritz Volz', 'Moritz Volzin', 'Bundesliigan', 'saksalainen', 'VfB Stuttgartissa'}
{'Bert "Baldy" Olmstead', 'Bert Olmsteadia', 'Baldy'}
{'Suomessa', 'suomenkielinen'}
{'vuonna 1947', 'British Film A

In [None]:
# Debug peek at `data`. This cell does not assign `data` itself: it prints
# whatever value a previously executed cell left in that variable, so the
# output depends on the order in which the cells were run.
print(data)

{'id': 'tst-sv-1', 'lang': 'SV', 'model_input': 'I vilken stad föddes regissören David Sandberg?', 'model_output_text': 'David Sandbergs födelseort är New York.', 'model_id': 'AI-Sweden-Models/gpt-sw3-6.7b-v2-instruct-gguf', 'model_output_tokens': [' David', ' Sand', 'bergs', ' födelse', 'ort', ' är', ' New', ' York', '.', '\n'], 'model_output_logits': [-0.1427002, 0.0, 0.0, -0.3305664, 0.0, 0.0, -0.11968994, 0.0, 0.0, 0.0]}


In [None]:
import json

# Sanity check: show the first English entry.
first_english_entry = data_dict["en"][0]
print(first_english_entry)

# Persist the complete dictionary as indented JSON.
serialized = json.dumps(data_dict, indent=4)
with open("test_data_entities.json", "w") as out_file:
    out_file.write(serialized)

{'id': 'tst-en-1', 'lang': 'EN', 'model_input': 'Did Alberto Fouillioux ever play in a world cup championship?', 'model_output_text': ' No, Albero Foulois was not in any of the FIFA World Cup finals.\n', 'model_id': 'togethercomputer/Pythia-Chat-Base-7B', 'model_output_tokens': ['ĠNo', ',', 'ĠAlber', 'o', 'ĠF', 'oul', 'ois', 'Ġwas', 'Ġnot', 'Ġin', 'Ġany', 'Ġof', 'Ġthe', 'ĠFIFA', 'ĠWorld', 'ĠCup', 'Ġfinals', '.', 'Ċ'], 'model_output_logits': [-2.2868447304, 4.7311220169, 0.1059471965, 9.1925964355, 3.3247950077, 3.5171573162, 12.4979534149, 11.4856796265, 9.6516017914, 2.8469445705, 8.0528373718, 3.4117016792, 4.6727371216, 8.3649101257, 10.0596370697, 7.0108551979, 4.6345014572, 9.6700620651, 1.7395397425, 14.6126556396], 'entities': ['Alberto Fouillioux']}


# Load data:

In [None]:
import json

# Reuse `data_dict` when it is already defined in this session; otherwise
# restore it from the JSON file. Only a per-language entry count is printed:
# echoing the whole dataset floods the cell output.
if "data_dict" in globals():
    status = "EXISTS"
else:
    with open("test_data_entities.json", "r", encoding="utf-8") as f:
        data_dict = json.load(f)
    status = "DATA LOADED"

print(f"{status}: { {lang: len(entries) for lang, entries in data_dict.items()} }")

EXISTS: {'fi': [{'id': 'tst-fi-1', 'lang': 'FI', 'model_input': 'Millä nimellä David Sandberg tunnetaan YouTubessa?', 'model_output_text': 'David Sandberg tunnetaan paremmin nimellä "AtomicDave" YouTubessa. Hän on tunnettu sisällöntuottaja, joka julkaisee usein videoita, joissa hän pelaa pelejä ja tekee haasteita. Hänellä on yli miljoona tilaajaa YouTubessa, ja hän on tunnettu energisestä ja viihdyttävästä tyylistään.', 'model_id': 'LumiOpen/Poro-34B-chat', 'model_output_tokens': ['David', 'ĠSand', 'berg', 'Ġtunnetaan', 'Ġparemmin', 'ĠnimellÃ¤', 'Ġ"', 'Atomic', 'Dave', '"', 'ĠYouTub', 'essa', '.', 'ĠHÃ¤n', 'Ġon', 'Ġtunnettu', 'ĠsisÃ¤llÃ¶n', 'tu', 'ottaja', ',', 'Ġjoka', 'Ġjulkaisee', 'Ġusein', 'Ġvideoita', ',', 'Ġjoissa', 'ĠhÃ¤n', 'Ġpelaa', 'ĠpelejÃ¤', 'Ġja', 'Ġtekee', 'Ġhaasteita', '.', 'ĠHÃ¤nellÃ¤', 'Ġon', 'Ġyli', 'Ġmiljoona', 'Ġtilaa', 'jaa', 'ĠYouTub', 'essa', ',', 'Ġja', 'ĠhÃ¤n', 'Ġon', 'Ġtunnettu', 'Ġenerg', 'isestÃ¤', 'Ġja', 'ĠviihdyttÃ¤vÃ¤', 'stÃ¤', 'Ġtyy', 'list', 'Ã¤Ã¤n', '.'

# Retrieving relevant passages:

In [None]:
!pip install spacy -q
!pip install faiss-cpu -q
# !pip install faiss-gpu -q
!pip install Wikipedia -q

In [None]:
import wikipedia
import spacy
import numpy as np
import faiss
import subprocess

from sentence_transformers import SentenceTransformer
# Sentence-embedding model; further down in this cell `model.encode(...)` is
# used to embed both the Wikipedia passages and the query text.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# spaCy pipeline name used per language code. Codes marked "unavailable" are
# mapped to the English pipeline.
spacy_dict = dict(
    en="en_core_web_sm",
    ca="ca_core_news_sm",
    it="it_core_news_sm",
    sv="sv_core_news_sm",
    eu="en_core_web_sm",  # unavailable
    es="es_core_news_sm",
    fa="en_core_web_sm",  # unavailable
    cs="en_core_web_sm",  # unavailable
    hi="en_core_web_sm",  # unavailable
    de="de_core_news_sm",
    fr="fr_core_news_sm",
    zh="zh_core_web_sm",
    ar="en_core_web_sm",  # unavailable
    fi="fi_core_news_sm",
)

import sys  # needed here to run spaCy's downloader with the kernel's own interpreter

SENTENCES_PER_PASSAGE = 5   # sentences grouped into one passage
SENTENCE_OVERLAP = 2        # sentences shared by two consecutive passages
TOP_K = 5                   # passages kept per query


def _candidate_titles(ent):
    """Return de-duplicated Wikipedia title candidates for the entity `ent`.

    Candidates are the lower-cased entity, Wikipedia's suggestion (if any) and
    the first two search hits, in that order.
    """
    candidates = [ent.lower()]
    suggestion = wikipedia.suggest(ent)  # one network call, result reused
    if suggestion is not None:
        candidates.append(suggestion)
    candidates.extend(wikipedia.search(ent.lower(), results=2))
    return list(dict.fromkeys(candidates))


def _page_passages(title, sentence_splitter):
    """Fetch the page `title` and cut it into overlapping sentence windows.

    Every passage is prefixed with the page title. Raises whatever
    `wikipedia.page` or the spaCy pipeline raise.
    """
    page = wikipedia.page(title, auto_suggest=False)

    contents = page.content.replace("\n", " ")
    contents = contents.replace("=", '')  # not a perfect solution but should work fine for our purposes

    sentences = [sent.text for sent in sentence_splitter(contents).sents]
    step = SENTENCES_PER_PASSAGE - SENTENCE_OVERLAP
    return [
        page.title + ". " + " ".join(sentences[i:i + SENTENCES_PER_PASSAGE])
        for i in range(0, len(sentences), step)
    ]


def _nearest_passages(query, passages, k=TOP_K):
    """Return `(passage_index, distance)` pairs of the passages closest to `query`.

    Distances are the values returned by a flat L2 FAISS index. At most `k`
    pairs are returned, fewer when there are fewer passages; an empty passage
    list yields an empty result.
    """
    if not passages:
        return []

    # Encode all passages in one batched call.
    embeddings = np.ascontiguousarray(model.encode(passages), dtype='float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    query_embedding = np.asarray(model.encode(query), dtype='float32')
    # Asking for more neighbours than indexed vectors makes FAISS pad the
    # result with -1, which would silently select the last passage.
    k = min(k, len(passages))
    distances, indices = index.search(np.array([query_embedding]), k)
    return list(zip(indices[0], distances[0]))


# For every language: retrieve Wikipedia passages for each entry's entities,
# rank them against the entry's input + output text and write the result to
# retrieved_passages_<lang>_v2.json.
spacy_pipelines = {}  # spaCy pipelines already downloaded and loaded, by name

for lang in data_dict:
    spacy_name = spacy_dict[lang]
    if spacy_name not in spacy_pipelines:
        # Several languages share one pipeline; download and load each only once.
        subprocess.run([sys.executable, "-m", "spacy", "download", spacy_name, "-q"])
        spacy_pipelines[spacy_name] = spacy.load(spacy_name)
    nlp = spacy_pipelines[spacy_name]

    wikipedia.set_lang(lang)

    output = {}

    for data in data_dict[lang]:
        print("-"*100)
        print(f"ID: {data['id']}, Query: {data['model_output_text']}")

        all_passages = []
        # Entries of languages without an NER model carry no "entities" key.
        for ent in data.get('entities', []):
            try:
                ent_options = _candidate_titles(ent)
            except Exception as err:
                # Best effort: a failed lookup only costs this entity.
                # `Exception` (not a bare except) lets Ctrl-C stop the run.
                print(f"Skipping entity {ent!r}: {err}")
                continue
            print(ent_options)

            for opt in ent_options:
                try:
                    all_passages.extend(_page_passages(opt, nlp))
                except Exception:
                    # Many candidates are not valid page titles; skipping them
                    # is the intended best-effort behaviour.
                    continue

        # Remove potential duplicates (keeping first-seen order for reproducibility)
        all_passages = list(dict.fromkeys(all_passages))

        query = data['model_input'] + data['model_output_text']
        top_passages = []
        for rank, (idx, distance) in enumerate(_nearest_passages(query, all_passages), start=1):
            top_passages.append([all_passages[idx], float(distance)])
            print(f"Rank: {rank}, Index: {idx}, Distance: {distance}, Passage: {all_passages[idx]}")

        if not all_passages:
            print(f"No passages found for {data['id']}")

        output[data['id']] = {
            "input": data['model_input'],
            "query": data['model_output_text'],
            "results": top_passages
            }

    with open(f"retrieved_passages_{lang}_v2.json", "w") as f:
        json.dump(output, f, indent=4)



----------------------------------------------------------------------------------------------------
ID: tst-fi-1, Query: David Sandberg tunnetaan paremmin nimellä "AtomicDave" YouTubessa. Hän on tunnettu sisällöntuottaja, joka julkaisee usein videoita, joissa hän pelaa pelejä ja tekee haasteita. Hänellä on yli miljoona tilaajaa YouTubessa, ja hän on tunnettu energisestä ja viihdyttävästä tyylistään.
['David F. Sandberg', 'David Sandberg', 'david sandberg']
['youtubessa', 'Youtube', 'Youtube Music']
['atomicdave']
Rank: 1, Index: 16, Distance: 5.381505966186523, Passage: David F. Sandberg. David F. Sandberg (s. 1981 Jönköping, Ruotsi), joka tunnetaan YouTubessa nimellä ponysmasher, on ruotsalainen elokuvaohjaaja.     Elämä ja ura  Sandberg aloitti uransa tekemällä omia lyhytelokuviaan ja levittämällä niitä YouTubessa. Hänen ensimmäinen pitkä elokuvansa oli Lights Out (2016), joka perustuu hänen samannimiseen lyhytelokuvaansa vuodelta 2013. Sandbergin puoliso on ruotsalainen näyttelijä 



  lis = BeautifulSoup(html).find_all('li')


['Axel Oxenstierna', 'axel oxenstierna', 'Axel']
['helsingin jalkapalloklubin', 'Helsingin Jalkapalloklubin historia', 'Helsingin Jalkapalloklubi']
['Helsingin Jalkapalloklubin historia', 'helsingfors fotbollsklubb', 'Helsingin Jalkapalloklubi']
['vuonna 1907', 'Luettelo vuonna 1907 kuolleista henkilöistä', 'Eduskuntavaalit 1907']
['Helsingin Jalkapalloklubi (naiset)', 'helsingin jalkapalloklubi', 'Helsingin Jalkapalloklubi']
Rank: 1, Index: 377, Distance: 4.873684406280518, Passage: Helsingin Jalkapalloklubi. Vuoden 1907 alussa Franz Fredrik Wathén esitti ajatuksensa julkisuudessa, ja saman vuoden toukokuussa tehtiin periaatteellinen päätös perustaa Helsingin Potkupalloklubi eli Helsingfors Sparkbollsklubb. Kesäkuussa seuran virallinen perustamiskokous pidettiin Kaisaniemessä ja uuden seuran nimeksi otettiin Helsingin Jalkapalloklubi eli Helsingfors Fotbollsklubb. Ensimmäisen jalkapallo-ottelunsa HJK pelasi vielä samana vuonna Tammisaaressa paikallista tehdasjoukkuetta vastaan. HJK ol

KeyboardInterrupt: 

# Data inspection:

In [None]:
# Show each [passage, distance] pair on its own line.
if top_passages:
    print(*top_passages, sep="\n")

["Stockholm. See also   Holmium—a chemical element named after Stockholm List of people from Stockholm Outline of Stockholm Ports of the Baltic Sea Stockholm syndrome    References     External links   Stockholm—official website Stockholm Visitors Board—the official visitors' guide Selma Lagerlöf's account of the history of Stockholm, in Ch. VII of The Wonderful Adventures of Nils Tunnelbana Karta 2023 – Tunnelbanan Stockholm", 19.223408]
["David F. Sandberg. He is currently attached to direct and produce the 2025 film adaptation of the video game Until Dawn.     Early life  David F. Sandberg was born in Jönköping on 21 January 1981. His younger brother, Joakim, is a video game developer best known as the creator of Iconoclasts. He attended Torpa School, where he met his future wife Lotta Losten when they were both around 11 years old, and they later attended Junedal School together. He grew up watching films and making his own films using his father's VHS-C camera.", 20.349953]
["Stoc