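### requirements
+ (nb_black) — optional, only used to auto-format the cells
+ nltk
+ sentence_transformers
+ pandas
+ numpy
+ scikit-learn
+ torch
+ secrets (part of the Python standard library, no installation needed)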

In [14]:
# format cells using black
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

# evaluate cosine_sim scores within certain threshold ranges

In [15]:
import os
import pickle
import re
import math
import pandas as pd
import numpy as np
import nltk
import string
import secrets # module for secure random numbers --not pseudo-random like "module: random"
import torch
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple

<IPython.core.display.Javascript object>

In [16]:
def get_sentences(text: str) -> List[str]:
    """Split ``text`` into sentences using NLTK's German sentence tokenizer.

    Args:
        text: The raw text to segment.

    Returns:
        The sentences in the order ``sent_tokenize`` yields them.
    """
    sentences = sent_tokenize(text, language="german")
    return sentences

def remove_short_sentences(sentences: List[str], min_size: int) -> List[str]:
    """Drop every sentence that has fewer than ``min_size`` tokens.

    Args:
        sentences: Sentences to filter.
        min_size: Minimum number of tokens (as produced by the German
            ``word_tokenize``) a sentence needs in order to be kept.

    Returns:
        The surviving sentences, in their original order.
    """
    kept = []
    for sentence in sentences:
        token_count = len(word_tokenize(sentence, language="german"))
        if token_count >= min_size:
            kept.append(sentence)
    return kept

def remove_punctuation_sent(sentences: List[str]) -> List[str]:
    """Remove ASCII punctuation from every sentence.

    Each sentence is tokenized with the German ``word_tokenize``; every
    character contained in ``string.punctuation`` is deleted from each token,
    tokens that end up empty are discarded, and the remaining tokens are
    joined with single spaces.

    Args:
        sentences: Sentences to clean.

    Returns:
        One cleaned string per input sentence (possibly an empty string),
        in the same order.
    """
    # Translation table that deletes every character in string.punctuation.
    strip_table = str.maketrans("", "", string.punctuation)
    cleaned_sentences = []
    for sentence in sentences:
        stripped_tokens = (
            token.translate(strip_table)
            for token in word_tokenize(sentence, language="german")
        )
        # Tokens that consisted only of punctuation are now "" and are skipped.
        cleaned_sentences.append(" ".join(tok for tok in stripped_tokens if tok))
    return cleaned_sentences

<IPython.core.display.Javascript object>

In [17]:
# Display the set of characters that remove_punctuation_sent() deletes from each token.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

<IPython.core.display.Javascript object>

In [18]:
# Locate the pickled NDB/Wikipedia DataFrame relative to the current working directory.
path = os.path.abspath("")
data_df_rel = "data/df/"
data_df_ndb_abs = os.path.join(path, data_df_rel, "df_ndb_wikipedia.pkl")

# Load the corpus; the cells below read its gnd, ndb_n, ndb_text,
# wikipedia_title and wikipedia_text columns.
corpus = pd.read_pickle(data_df_ndb_abs)

# Number of non-null entries in the first column of the corpus.
first_column_name = corpus.columns[0]
corpus_rows = corpus[first_column_name].count()

# Last expression of the cell: render the DataFrame as the cell output.
corpus

Unnamed: 0,ndb_n,ndb_sfz,gnd,ndb_sex,ndb_name,wikipedia_title,ndb_text,wikipedia_text
0,n01-001-01,sfz45545,118643525,1,"Aachen Hans von (Johann von Achen, Hans Ach)",Hans von Aachen,"Karel van Mander (Schilderboeck, Haarlem 1604 ...",Hans von Aachen (* 1552 in Köln; † 4. März 161...
1,n01-001-02,sfz15,118500015,1,"Aal (Anguilla), Johannes",Johannes Aal,A.Aal verfocht zur Zeit der Reformation die al...,Johannes Aal (* um 1500 in Bremgarten AG; † 28...
2,n01-002-01,sfz17,104198273,1,dall' Abaco Evaristo Felice,Evaristo Felice Dall’Abaco,A.Abaco kam aus Verona über Modena 1704 an den...,Evaristo Felice Dall’Abaco (* 12. Juli 1675 in...
3,n01-002-02,sfz19,100002307,1,Abbadie Jacques,Jacques Abbadie,A.Abbadie studierte auf den Akademien von Saum...,"Jacques Abbadie, auch James, Jacobus oder Jaco..."
4,n01-002-03,sfz7153,118646419,1,Abbe Ernst Carl,Ernst Abbe,"Die Einsicht der Lehrer, die die außerordentli...",Ernst Karl Abbe [ˈabə] (* 23. Januar 1840 in ...
...,...,...,...,...,...,...,...,...
20479,n27-908-01,sfz141086,1116026791,2,Westphal-Hellbusch Sigrid Hellbusch,Sigrid Westphal-Hellbusch,Nach der Reifeprüfung am I. Oberlyzeum in Berl...,Sigrid Westphal-Hellbusch (* 10. Juni 1915 in ...
20480,n27-909-01,sfz60626,117327328,1,Westphalen Ferdinand,Ferdinand von Westphalen,W. Westphalen besuchte das Gymnasium in Salzw...,Ferdinand Otto Wilhelm Henning von Westphalen ...
20481,n27-910-01,sfz85259,117327573,1,Westrumb Friedrich,Johann Friedrich Westrumb,W. Westrumb begann 1764 eine Lehre in der kgl...,Johann Friedrich Westrumb (* 2. Dezember 1751 ...
20482,n27-911-01,sfz141099,1051891450,1,Wetter Ernst,Ernst Wetter,Nach der Volksschule absolvierte W. Wetter d...,Ernst Wetter (* 27. August 1877 in Töss (heute...


<IPython.core.display.Javascript object>

In [48]:
models = [
    "distiluse-base-multilingual-cased-v1",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
]
threshold_step = 0.01
# Lower bounds of the similarity bins, highest first (0.99 down to 0.60).
thresholds = np.arange(0.6, 1, threshold_step).tolist()[::-1]
examples_per_threshold_max = 5
# OS-entropy-backed generator (same randomness source as secrets.randbelow),
# used to draw random visiting orders without replacement.
secure_rng = secrets.SystemRandom()


def _load_biography_sentences(gnd):
    """Look up one biography in ``corpus`` and preprocess both of its texts.

    Both texts are split into sentences, stripped of punctuation and
    filtered to sentences with at least 5 tokens.

    Args:
        gnd: Value to match against the ``gnd`` column of ``corpus``.

    Returns:
        Tuple ``(wikipedia_title, ndb_volume, ndb_sentences, wikipedia_sentences)``
        where ``ndb_volume`` is characters 1-2 of the ``ndb_n`` identifier.

    Raises:
        IndexError: If ``corpus`` has no row with this ``gnd``.
    """
    # One boolean-mask scan of the corpus instead of one per column.
    rows = corpus.loc[corpus["gnd"] == gnd]
    wikipedia_title = rows["wikipedia_title"].values[0]
    ndb_volume = rows["ndb_n"].values[0][1:3]
    ndb_text = rows["ndb_text"].values[0]  # pandas Series -> array -> str
    wikipedia_text = rows["wikipedia_text"].values[0]  # pandas Series -> array -> str
    # sentences -> remove punctuation -> remove sentences that are too short (<5 words)
    ndb_sentences = remove_short_sentences(
        remove_punctuation_sent(get_sentences(ndb_text)), min_size=5
    )
    wikipedia_sentences = remove_short_sentences(
        remove_punctuation_sent(get_sentences(wikipedia_text)), min_size=5
    )
    return wikipedia_title, ndb_volume, ndb_sentences, wikipedia_sentences


for model in models:
    st_model = SentenceTransformer(model)
    p = os.path.join(os.getcwd(), "data/reuse/models/" + model + ".pkl")
    print("\n\n\n")
    print(f"# {model =}")
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # result files that were produced by this project.
    with open(p, "rb") as f:
        d = pickle.load(f)
    # The file is closed here; everything below works on the in-memory object.
    for threshold in thresholds:
        threshold_upper = threshold + threshold_step
        print("\n\n")
        print(f"## threshold range = [{threshold:.2f},{threshold_upper:.2f}]", "\n")
        # get random version (index 0-4) :: [0,5)
        version_dict = d[model][secrets.randbelow(5)]
        version_list = list(version_dict.items())
        # Visit the biographies in random order -> removes the alphabetic order
        # bias (ndb volumes were published over a time span of decades).
        # Every biography is visited at most once, so the search always
        # terminates, even when a bin holds fewer than
        # examples_per_threshold_max examples.
        secure_rng.shuffle(version_list)
        example_findings = 0
        for gnd, t in version_list:
            if example_findings >= examples_per_threshold_max:
                break
            # ndb <- wiki
            # t[0] :: ndb sentences with max wiki cosine_sim numeric matchup value
            ndb_scores = t[0]
            if not ndb_scores:
                continue
            ndb_sentences_amount = len(ndb_scores)
            biography = None  # loaded lazily, at most once per biography
            # Try this biography's ndb sentences in random order, each index once.
            for random_idx in secure_rng.sample(
                range(ndb_sentences_amount), ndb_sentences_amount
            ):
                simval = ndb_scores[random_idx]
                if not (simval >= threshold and simval <= threshold_upper):
                    continue
                if biography is None:
                    biography = _load_biography_sentences(gnd)
                (
                    wikipedia_title,
                    ndb_volume,
                    ndb_sent_clean_long,
                    wikipedia_sent_clean_long,
                ) = biography
                # The stored scores must line up with the re-created sentence list.
                assert len(ndb_scores) == len(ndb_sent_clean_long)
                if not wikipedia_sent_clean_long:
                    # Nothing to compare against; cosine_similarity would
                    # raise on an empty matrix. Go to the next biography.
                    break
                # get ndb sentence for index random_idx
                ndb_sentence = ndb_sent_clean_long[random_idx]
                sentences = [ndb_sentence] + wikipedia_sent_clean_long
                sentences_vectors = st_model.encode(sentences)
                # Similarity of the ndb sentence to every wikipedia sentence.
                cos_scores = cosine_similarity(
                    [sentences_vectors[0]], sentences_vectors[1:]
                )[0]  # list of list -> list
                # (score, wikipedia sentence index) pairs that fall into the current bin
                cos_scores_idx_filtered = [
                    (score, wiki_idx)
                    for wiki_idx, score in enumerate(cos_scores)
                    if score >= threshold and score <= threshold_upper
                ]
                if not cos_scores_idx_filtered:
                    # Recomputed scores do not land in this bin; try another sentence.
                    continue
                example_findings += 1
                print("")
                print(f"### example = {example_findings}/{examples_per_threshold_max}; gnd = {gnd}; volume = {ndb_volume}/27; wikipedia title= {wikipedia_title}; format = ndb_sent/wikipedia_sent/cos_sim_score")
                print(ndb_sentence)
                # Pick one of the matching wikipedia sentences at random.
                cosv, cosv_idx = secrets.choice(cos_scores_idx_filtered)
                # get wikipedia sentence for index cosv_idx
                wikipedia_sentence = wikipedia_sent_clean_long[cosv_idx]
                print(wikipedia_sentence)
                print(f"cosine_similarity_score = {cosv}")
                # Left blank on purpose: to be filled in by hand during annotation.
                print("local text reuse properties = ")
                print("local text reuse category = ")
                break  # goto next biography





# model ='distiluse-base-multilingual-cased-v1'



## threshold range = [0.99,1.00] 


### example = 1/5; gnd = 115531726; volume = 12/27; wikipedia title= Karl Kolbielski; format = ndb_sent/wikipedia_sent/cos_sim_score
Aus Hamburg brachte er den aus Manchester stammenden John Thornton nach Wien der für die österrösterreichischen Baumwolltextil und Spinnmaschinenindustrie bedeutend wurde
Aus Hamburg brachte er den aus Manchester stammenden Johann Thornton nach Wien der für die österreichischen Baumwolltextil und Spinnmaschinenindustrie bedeutend wurde
cosine_similarity_score = 0.9914122223854065
local text reuse properties = 
local text reuse category = 

### example = 2/5; gnd = 117051845; volume = 10/27; wikipedia title= Albert Jäger; format = ndb_sent/wikipedia_sent/cos_sim_score
1825 trat er in das Benediktinerstift Marienberg Vinschgau ein
1825 trat er in das Benediktinerstift Marienberg im Vinschgau ein
cosine_similarity_score = 0.9929158687591553
local text reuse properties 

<IPython.core.display.Javascript object>