In [1]:
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from evaluation import load_model
import torch

# The notebook pins everything to the CPU. Previously this was done by
# overwriting a CUDA-detecting assignment on the very next line, which made the
# detection dead code; the override is now an explicit switch.
# Set FORCE_CPU to False to use CUDA when it is available.
FORCE_CPU = True
device = "cpu" if FORCE_CPU or not torch.cuda.is_available() else "cuda"

# Single seed shared by both dataset builders below.
SEED = 42

tokenizer = AutoTokenizer.from_pretrained('google/mt5-base', legacy=False)
from src.model_training.datasets.experiments_sanitize.complete_sanitization import DefinitionTestSet, DefinitionDataset

# Build the test split and the train/validation splits with the same tokenizer and seed.
dataset_test = DefinitionTestSet.create_dataset(tokenizer, shuffle=True, seed=SEED, subset_test=-1, cache=False)
dataset_train, dataset_val = DefinitionDataset.create_dataset(tokenizer, shuffle=True, seed=SEED)



Map:   0%|          | 0/35738 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

1530


Map:   0%|          | 0/288148 [00:00<?, ? examples/s]

Filter:   0%|          | 0/288022 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

13131


Filter:   0%|          | 0/288013 [00:00<?, ? examples/s]

Map:   0%|          | 0/35664 [00:00<?, ? examples/s]

Filter:   0%|          | 0/35652 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

1505


Filter:   0%|          | 0/35652 [00:00<?, ? examples/s]

In [3]:
# Display the test split's summary (feature names and row count).
dataset_test

Dataset({
    features: ['title', 'context_word', 'context_sentence', 'gt', 'input_ids', 'attention_mask', 'labels', 'prompt'],
    num_rows: 34189
})

In [5]:
# Print every training example whose "gt" field has fewer than three elements.
for data in dataset_train:
    if len(data["gt"]) >= 3:
        continue
    print(data)

In [39]:
def analyze_dataset(dataset, threshold=0.75, verbose=True):
    """Score (title, context_word) pairs whose HanTa analyses disagree.

    Each row is handled as follows:

    * identical ``title`` and ``context_word``: skipped;
    * either string empty: recorded in ``empty_pairs``;
    * otherwise both strings are analysed with the HanTa tagger and the first
      element of each analysis is compared (in this notebook's examples that
      element is the lemma). If the two analyses match, or one analysis equals
      the other surface form, the pair is skipped;
    * all remaining pairs are embedded with BERT and compared by cosine
      similarity.

    The BERT checkpoint and the HanTa model are loaded on every call.

    Args:
        dataset: Object for which ``dataset["title"]`` and
            ``dataset["context_word"]`` yield parallel sequences of strings.
        threshold: Scored pairs with similarity strictly below this value go
            to ``bad_pairs``; all other scored pairs go to ``good_pairs``.
        verbose: When true (the default, matching the previous behaviour),
            print ``(index, title, context_word, similarity)`` for every
            scored pair.

    Returns:
        Tuple ``(empty_pairs, good_pairs, bad_pairs)``. ``empty_pairs`` holds
        ``(title, context_word)`` tuples; the other two hold
        ``(index, title, context_word)`` tuples.
    """
    from HanTa import HanoverTagger as ht
    from transformers import BertModel, AutoTokenizer
    import torch

    # Keep the checkpoint id and the loaded model under separate names.
    bert_model_name = "dbmdz/bert-base-german-uncased"
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    bert_model = BertModel.from_pretrained(bert_model_name)
    bert_model.to(device)

    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

    def get_embedding(word):
        """Return the mean of BERT's last hidden state over the sequence axis."""
        inputs = bert_tokenizer(word, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # last_hidden_state is (batch_size, sequence_length, hidden_size).
        # NOTE(review): the mean runs over every position the tokenizer emits;
        # if special tokens are added they are presumably included — confirm.
        return outputs.last_hidden_state.mean(dim=1).squeeze()

    def cosine_similarity(embedding1, embedding2):
        """Return the cosine similarity of two 1-D embeddings as a float."""
        return torch.nn.functional.cosine_similarity(
            embedding1.unsqueeze(0), embedding2.unsqueeze(0)
        ).item()

    empty_pairs = []
    bad_pairs = []
    good_pairs = []

    titles = dataset["title"]
    context_words = dataset["context_word"]
    # tqdm cannot infer a length from enumerate()/zip(), so wrap the zip and
    # pass the total explicitly; otherwise the bar only shows "0it".
    total = len(titles) if hasattr(titles, "__len__") else None
    progress = tqdm(zip(titles, context_words), total=total)

    for i, (title, context_word) in enumerate(progress):
        if title == context_word:
            continue
        if title == "" or context_word == "":
            empty_pairs.append((title, context_word))
            continue

        title_analysis = str(tagger.analyze(title)[0])
        context_analysis = str(tagger.analyze(context_word)[0])
        # Skip pairs the tagger already links: same analysis, or one word's
        # analysis equal to the other word's surface form.
        if (
            title_analysis == context_analysis
            or title == context_analysis
            or title_analysis == context_word
        ):
            continue

        similarity = cosine_similarity(get_embedding(title), get_embedding(context_word))
        if verbose:
            print((i, title, context_word, similarity))
        if similarity < threshold:
            bad_pairs.append((i, title, context_word))
        else:
            good_pairs.append((i, title, context_word))
    return empty_pairs, good_pairs, bad_pairs

In [28]:
from HanTa import HanoverTagger as ht
# Module-level HanTa tagger built from the 'morphmodel_ger.pgz' model; used by hantag() below.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

def hantag(title, context_word):
    """Print the word pair, then the first element of the tagger's analysis of each word.

    Relies on the module-level ``tagger``.
    """
    words = (title, context_word)
    print(words)
    analyses = tuple(tagger.analyze(word)[0] for word in words)
    print(analyses)

In [38]:
# Read one (title, context_word) pair from the test split ...
index = 9
a, b = dataset_test[index]['title'], dataset_test[index]['context_word']
# ... then override it with a hand-picked pair: the dataset lookup above has no
# effect on what hantag() receives while this line is present.
a, b = 'Fensterbank', 'Petersilie'
hantag(a, b)

('Fensterbank', 'Petersilie')
('Fensterbank', 'Petersili')


In [40]:
# Run the pair analysis on the test split with the default threshold.
empty_test, good_test, bad_test = analyze_dataset(dataset_test)

0it [00:00, ?it/s]

(1, 'schauern', 'schauerte', 0.7936298847198486)
(22, 'digitalisieren', 'digitalisierten', 0.9997422695159912)
(61, 'RheinlandPfälzerin', 'RheinlandPfälzerinnen', 0.902657151222229)
(103, 'Bibliografin', 'Bibliografinnen', 0.8803393244743347)
(202, 'Steinkauz', 'Steinkäuze', 0.8465087413787842)
(204, 'Schturmowik', 'Sturmowiks', 0.800934910774231)
(270, 'Spind', 'Spinden', 0.8661656379699707)
(339, 'tschechern', 'tschechert', 0.8270033597946167)
(348, 'Mündelgeld', 'Mündelgelder', 0.7688379287719727)
(371, 'Vormutter', 'Vormütter', 1.0000001192092896)
(414, 'Gefolgsmann', 'Gefolgsleute', 0.9322413206100464)
(443, 'besprayen', 'besprayt', 0.7845319509506226)
(490, 'bullern', 'bullert', 0.904456377029419)
(492, 'Sequoia', 'Sequoias', 0.8860237002372742)
(533, 'cyber', 'Cyber', 1.0000001192092896)
(557, 'aufknien', 'aufkniete', 0.8528078198432922)
(604, 'verbaseln', 'verbaselten', 0.8622446060180664)
(617, 'Alphabet', 'Alphabeten', 0.7795814275741577)
(650, 'Rechtsgelehrter', 'Rechtsgeleh

KeyboardInterrupt: 

In [None]:
# Run the pair analysis on all three splits with the default threshold.
# Each call loads the BERT checkpoint and the HanTa model again inside analyze_dataset.
empty_test, good_test, bad_test = analyze_dataset(dataset_test)
empty_train, good_train, bad_train = analyze_dataset(dataset_train)
empty_val, good_val, bad_val = analyze_dataset(dataset_val)


In [6]:
# Report the number of empty / good / bad pairs per split (test, train, validation).
print(f"Empty: {len(empty_test)}, Good: {len(good_test)}, Bad: {len(bad_test)}")
print(f"Empty: {len(empty_train)}, Good: {len(good_train)}, Bad: {len(bad_train)}")
print(f"Empty: {len(empty_val)}, Good: {len(good_val)}, Bad: {len(bad_val)}")

Empty: 0, Good: 343, Bad: 0
Empty: 0, Good: 2892, Bad: 0
Empty: 0, Good: 331, Bad: 0


In [7]:
# Training pairs where the title or the context word was an empty string.
# NOTE(review): the saved output lists three pairs while the count printed in
# the previous cell is 0 — the outputs presumably come from different runs; confirm.
empty_train

[('fahen', ''), ('trans', ''), ('niederdeutsch', '')]

In [8]:
# Percentages computed from hard-coded counts.
# NOTE(review): none of these literals is derived from a variable in this
# notebook. Presumably each line is (flagged pairs / split size) for the test,
# train and validation splits from an earlier run — confirm and replace with
# len(...) expressions.
print(525/12966*100)
print(4611/103827*100)
print(473/12833*100)

4.049051365108746
4.44104134762634
3.6858100210395075


In [9]:
# Display the training pairs classified as "good" as (index, title, context_word) tuples.
good_train

[(5, 'adjektivieren', 'Adjektivierens'),
 (25, 'übereinanderstapeln', 'Übereinanderstapeln'),
 (29, 'SeungOk', 'SungOk'),
 (61, 'Exerzitium', 'Exerzitien'),
 (143, 'Seeklima', 'Seeklimate'),
 (153, 'Nadelstich', 'Nadelstiche'),
 (177, 'Membran', 'Membranen'),
 (199, 'Schurf', 'Schürfe'),
 (214, 'gelangen', 'gelangten'),
 (252, 'Desubstantivum', 'Desubstantiva'),
 (279, 'Fingerabdruck', 'Fingerabdrücke'),
 (322, 'Großfigur', 'Großfiguren'),
 (359, 'blaken', 'blakten'),
 (364, 'Coronatoter', 'Coronatote'),
 (427, 'Urchristin', 'Urchristinnen'),
 (430, 'weitererzählen', 'hat'),
 (439, 'Polnisch', 'Polnische'),
 (441, 'Liegestuhl', 'Liegestühle'),
 (551, 'Kolonialbeamter', 'Kolonialbeamten'),
 (583, 'Umschweif', 'Umschweife'),
 (606, 'versprenkeln', 'versprenkel'),
 (634, 'Tribünenplatz', 'Tribünenplätze'),
 (663, 'slippen', 'slippte'),
 (665, 'klumpen', 'klumpte'),
 (692, 'Mittelfranzösisch', 'Mittelfranzösische'),
 (693, 'äugeln', 'Äugeln'),
 (730, 'Denkakt', 'Denkakte'),
 (774, 'Diureti

In [5]:
# Number of entries in the test split's "title" column.
len(dataset_test["title"])

12966

In [None]:
from transformers import BertModel, BertTokenizer
import torch

# Load the BERT model and tokenizer that the module-level get_embedding() reads.
# NOTE(review): this checkpoint id differs from the
# "dbmdz/bert-base-german-uncased" id used inside analyze_dataset — confirm
# which one is intended.
bert_model = BertModel.from_pretrained('google-bert/bert-base-german-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-german-uncased')
# GPU transfer is left disabled:
# bert_model.cuda()

def get_embedding(word):
    """Embed ``word`` with the module-level BERT model.

    The word is tokenized with ``bert_tokenizer``, run through ``bert_model``
    with gradients disabled, and the last hidden state is averaged over the
    sequence axis. Singleton dimensions are squeezed out of the result.

    NOTE(review): the average covers every position the tokenizer emits; if
    special tokens are added they are presumably included — confirm.
    """
    encoded = bert_tokenizer(word, return_tensors='pt')
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # (batch_size, sequence_length, hidden_size) -> mean over the sequence axis.
    return hidden_states.mean(dim=1).squeeze()

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Each embedding gets a leading axis of size one before the comparison, and
    ``.item()`` unwraps the single resulting value.
    """
    batched_first = embedding1[None]
    batched_second = embedding2[None]
    score = torch.nn.functional.cosine_similarity(batched_first, batched_second)
    return score.item()


# Similarity of every scored pair, keyed by (index, title, context_word).
similarity_scores = {}

# These must be lists: the loop below calls .append() on them, which raised
# AttributeError when they were initialised as dicts.
bad_pairs = []
good_pairs = []

# NOTE(review): the loop expects (index, title, context_word) triples; the
# wrong_pairs preview elsewhere in this notebook shows 2-tuples — confirm
# which version of wrong_pairs this cell is meant to consume.
for i, title, context_word in tqdm(wrong_pairs):
    word1_embedding = get_embedding(title)
    word2_embedding = get_embedding(context_word)

    # Calculate the cosine similarity between the two words
    similarity = cosine_similarity(word1_embedding, word2_embedding)
    similarity_scores[(i, title, context_word)] = similarity
    # Pairs strictly below 0.75 count as bad, everything else as good.
    if similarity < 0.75:
        bad_pairs.append((i, title, context_word))
    else:
        good_pairs.append((i, title, context_word))



In [30]:
import gensim

In [36]:
trained_model = gensim.models.KeyedVectors.load_word2vec_format("../german.model", binary=True)
# Unit-normalise the vectors in place. init_sims(replace=True) is deprecated in
# gensim 4 and emits a warning; unit_normalize_all() is the direct replacement
# for the destructive normalisation. Older gensim releases lack that method,
# so fall back to the original call there.
if hasattr(trained_model, "unit_normalize_all"):
    trained_model.unit_normalize_all()
else:
    trained_model.init_sims(replace=True)

  trained_model.init_sims(replace=True)


In [37]:
# Preview the first ten entries of wrong_pairs.
# NOTE(review): wrong_pairs is not assigned in any cell of this notebook —
# presumably it is left over from an earlier kernel session; confirm and add
# the cell that builds it.
wrong_pairs[0:10]

[('Hohlwelttheorie', 'Hohlwelttheorien'),
 ('herzukommen', 'kamen'),
 ('Protokoll errichten', 'wurden'),
 ('Elementarereignis', 'Elementarereignisse'),
 ('Mola', 'Molakana'),
 ('zusammensacken', 'sackt'),
 ('Abkauf', 'Abkaufe'),
 ('Büchertisch', 'Büchertische'),
 ('ausgucken', 'guckte'),
 ('bruchlanden', 'bruchlandet')]

In [44]:
# Score each pair with the word2vec model. A pair is only scored when both of
# its words are in the model; all other pairs are collected in not_existing.
scores, not_existing = {}, []
for pair in wrong_pairs:
    if pair[0] in trained_model and pair[1] in trained_model:
        scores[pair] = trained_model.similarity(*pair)
    else:
        not_existing.append(pair)

In [48]:
# Display the word2vec similarity of every scored pair.
scores

{('herzukommen', 'kamen'): 0.21078113,
 ('Walross', 'Walrosse'): 0.559744,
 ('Rechtsgelehrter', 'Rechtsgelehrten'): 0.5164976,
 ('Leute', 'ironisch'): 0.4003573,
 ('Feststoff', 'Natronlauge'): 0.7216188,
 ('Somalia', 'Das'): 0.18570444,
 ('absetzen', 'setzt'): 0.29549432,
 ('aufgeben', 'gibt'): 0.3369042,
 ('Tapa', 'Tapas'): 0.39488146,
 ('rausfliegen', 'flog'): 0.34729895,
 ('hellgrau', 'hellgrauen'): 0.7123754,
 ('Ranke', 'Ranken'): 0.34429857,
 ('Bazillus', 'Bazillen'): 0.6139989,
 ('Symposion', 'Symposien'): 0.50672805,
 ('Drive', 'Drives'): 0.574168,
 ('Sims', 'Simse'): 0.34383586,
 ('zugehen', 'Geht'): 0.46973208,
 ('wegkommen', 'komm'): 0.36368445,
 ('emporsteigen', 'stiegen'): 0.26267195,
 ('anstecken', 'steckte'): 0.3337894,
 ('vorstehen', 'steht'): 0.26066333,
 ('Fensterbank', 'Petersilie'): 0.45396656,
 ('unterziehen', 'ziehen'): 0.31653014,
 ('modellieren', 'Modellieren'): 0.54722416,
 ('Zyklus', 'Zyklen'): 0.5734326,
 ('zusammenziehen', 'zog'): 0.22634603,
 ('berechtigen',

In [32]:
# Spot-check the word2vec similarity of a singular/plural pair.
trained_model.similarity('Spielverlust', 'Spielverluste')

0.99999994

In [9]:
# spaCy is not imported by any other cell in this notebook, so this cell raised
# NameError on a fresh kernel; import it alongside its only use.
import spacy

# Load the 'de_dep_news_trf' spaCy pipeline.
nlp = spacy.load('de_dep_news_trf')

In [11]:
# Run the spaCy pipeline on a single word and display the resulting document.
nlp('Spielverlust')

Spielverlust

In [12]:
from HanTa import HanoverTagger as ht

# Build the HanTa tagger from the 'morphmodel_ger.pgz' model.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')


In [15]:
# Print the first element of the tagger's analysis of a plural form.
print(tagger.analyze('Spielverluste')[0])

Spielverlust


In [42]:
# Check whether the word2vec model contains this word.
'Hohlwelttheorie' in trained_model

False

In [109]:
from transformers import BertModel, BertTokenizer
import torch

# Load the BERT model and tokenizer that get_embedding() below reads.
bert_model = BertModel.from_pretrained('google-bert/bert-base-german-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-german-uncased')
# GPU transfer is left disabled:
# bert_model.cuda()
# The two words to compare (tokenization happens inside get_embedding).
word1 = "test"
word2 = "hohlwelttheorie"

def get_embedding(word):
    """Return the sequence-averaged last hidden state of BERT for ``word``.

    Uses the module-level ``bert_tokenizer`` and ``bert_model``; the forward
    pass runs without gradient tracking, and singleton dimensions are squeezed
    out of the averaged tensor.

    NOTE(review): every position the tokenizer emits contributes to the
    average; if special tokens are added they are presumably included — confirm.
    """
    tokenized = bert_tokenizer(word, return_tensors='pt')

    # Inference only, so skip gradient bookkeeping.
    with torch.no_grad():
        model_output = bert_model(**tokenized)

    # last_hidden_state is (batch_size, sequence_length, hidden_size);
    # average across the sequence axis.
    sequence_mean = torch.mean(model_output.last_hidden_state, dim=1)
    return torch.squeeze(sequence_mean)
# Embed both words; these module-level tensors are reused by later cells.
word1_embedding = get_embedding(word1)
word2_embedding = get_embedding(word2)

import torch.nn.functional as F

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    A leading axis of size one is added to each embedding before calling
    ``F.cosine_similarity``; ``.item()`` extracts the single resulting value.
    """
    first_row = torch.unsqueeze(embedding1, 0)
    second_row = torch.unsqueeze(embedding2, 0)
    return F.cosine_similarity(first_row, second_row).item()

# Calculate the cosine similarity between the two words (a single Python float).
similarity = cosine_similarity(word1_embedding, word2_embedding)

print(similarity)
# NOTE(review): the tensor below is not what this cell prints — it looks like
# pasted output from a different snippet; presumably safe to delete.
# tensor([0.9665, 0.7953, 0.9809], grad_fn=<SumBackward1>)

0.7517393231391907


In [99]:
# get_embedding() squeezes its result down to a 1-D tensor, for which the
# default dim=1 is out of range. dim=-1 reduces over the last (hidden) axis and
# works for both 1-D and 2-D embeddings.
torch.cosine_similarity(word1_embedding, word2_embedding, dim=-1)

tensor([0.7517])

In [122]:
# Score every pair with BERT embeddings. similarity_scores keeps all results;
# bad_pairs / good_pairs split them at 0.75 (strictly below is "bad").
similarity_scores, bad_pairs, good_pairs = {}, {}, {}

for pair in tqdm(wrong_pairs):
    word1_embedding = get_embedding(pair[0])
    word2_embedding = get_embedding(pair[1])

    similarity = cosine_similarity(word1_embedding, word2_embedding)
    similarity_scores[pair] = similarity
    (bad_pairs if similarity < 0.75 else good_pairs)[pair] = similarity


  0%|          | 0/867 [00:00<?, ?it/s]

In [118]:
# Display the pairs classified as "bad" together with their similarity.
bad_pairs

{('anfreunden', 'sich'): 0.6428645849227905,
 ('jemanden auf den Arm nehmen', 'hat'): 0.5350866913795471,
 ('revanchieren', 'sich'): 0.6730716228485107,
 ('nicht von gestern sein', 'sind'): 0.591154932975769,
 ('Somalia', 'Das'): 0.6057213544845581,
 ('anspießen', 'wurde'): 0.5769628882408142,
 ('Gewinn', 'haben'): 0.6792787909507751,
 ('glasieren', 'habe'): 0.6123042106628418,
 ('wiederverwerten', 'werden'): 0.6857559680938721,
 ('umholzen', 'hatte'): 0.698523759841919,
 ('wegloben', 'lobte'): 0.6728106737136841,
 ('Abfallsortieranlage', 'wwwrosenbauercom'): 0.690182089805603,
 ('weitmaschig', 'weitmaſchige'): 0.6720466017723083,
 ('ausbüxen', 'sind'): 0.5914334654808044,
 ('fortgießen', 'fortgegoſſen'): 0.6367546319961548,
 ('mithelfen', 'hatte'): 0.6454353332519531,
 ('in der Luft liegen', 'lag'): 0.6879861950874329,
 ('gute Miene zum bösen Spiel machen', 'macht'): 0.6883102655410767,
 ('emporsteigen', 'stiegen'): 0.6665600538253784,
 ('anstecken', 'steckte'): 0.6426550149917603,
 (

In [124]:
# Print the "good" pairs whose similarity is still below 0.8.
for k, v in good_pairs.items():
    if not v < 0.8:
        continue
    print(k)

('herzukommen', 'kamen')
('zusammensacken', 'sackt')
('ausgucken', 'guckte')
('zecken', 'gezeckt')
('aufheitern', 'heiterte')
('fortwollen', 'wollte')
('rausfliegen', 'flog')
('zurückmüssen', 'mußten')
('ausfressen', 'frisst')
('kohlen', 'kohlte')
('zugehen', 'Geht')
('herausrupfen', 'haben')
('nummerisch', 'Nummerisch')
('weit', 'weiter')
('ankotzen', 'kotzt')
('Individualstil', 'Indivudualstil')
('danebenliegen', 'lagen')
('ribbeln', 'Ribbeln')
('unterziehen', 'ziehen')
('modellieren', 'Modellieren')
('einfressen', 'frisst')
('berechtigen', 'berechtigt')
('rausfliegen', 'fliegt')
('seitlich', 'Seitlich')
('zerdrücken', 'zerdrückte')
('aufpflanzen', 'aufgepflanzten')
('behängen', 'werden')
('aufgeben', 'gab')
('Tübbing', 'tübbinge')
('herumbekommen', 'hat')
('zurückscheuen', 'scheut')
('dahinschmelzen', 'schmolz')
('Satinkleid', 'hauchzarten')
('betriebsam', 'betriebsamste')
('einbremsen', 'Einbremsen')
('anschimmeln', 'schimmeln')
('Trema', 'Diastema')
('hervorstehen', 'steht')
('bes

In [123]:
# Report how many pairs ended up in each bucket.
print(f"good pairs: {len(good_pairs)}")
print(f"bad pairs: {len(bad_pairs)}")

good pairs: 516
bad pairs: 183


In [107]:
# Sanity check: cosine similarity of two identical (1, 1) tensors.
torch.cosine_similarity(torch.tensor([[1.0]]), torch.tensor([[1.0]]))

tensor([1.])