In [1]:
from datasets import load_from_disk, disable_caching
import numpy as np
from gensim.models import KeyedVectors
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

def load_glove_model(glove_file):
    model = KeyedVectors.load_word2vec_format(glove_file, no_header=True, binary=False)
    return model

def preprocess_sentence(sentence, stop_words):
    tokens = word_tokenize(sentence.lower())
    tokens = [token for token in tokens if token not in string.punctuation and token not in stop_words]
    return set(tokens)

def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def calculate_jaccard_similarity(sentences):
    stop_words = set(stopwords.words('english'))
    processed_sentences = [preprocess_sentence(sent, stop_words) for sent in sentences]
    n = len(processed_sentences)
    similarities = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            sim = jaccard_similarity(processed_sentences[i], processed_sentences[j])
            similarities[i, j] = sim
            similarities[j, i] = sim
    avg_similarity = np.sum(similarities) / (n * (n - 1))
    return avg_similarity

def calculate_jaccard_similarity_group(sentences1, sentences2):
    stop_words = set(stopwords.words('english'))
    processed_sentences1 = [preprocess_sentence(sent, stop_words) for sent in sentences1]
    processed_sentences2 = [preprocess_sentence(sent, stop_words) for sent in sentences2]

    total_similarity = 0
    count = 0

    for sent1 in processed_sentences1:
        for sent2 in processed_sentences2:
            sim = jaccard_similarity(sent1, sent2)
            total_similarity += sim
            count += 1

    avg_similarity = total_similarity / count if count > 0 else 0
    return avg_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dol28\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dol28\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
disable_caching()

In [3]:
ds = load_from_disk("../../datasets/ManualDataset")
ds = ds["train"].to_pandas()
ds_s = ds["original_text"].unique().tolist()

ds_zero = load_from_disk("../../datasets/ZeroShotDataset").to_pandas()
ds_zero_s = ds_zero["original_text"].unique().tolist()

ds_few = load_from_disk("../../datasets/TenShotDataset").to_pandas()
ds_few_s = ds_few["original_text"].unique().tolist()

ds_two = load_from_disk("../../datasets/TwoStageDataset").to_pandas()
ds_two_s = ds_two["original_text"].unique().tolist()

In [4]:
calculate_jaccard_similarity(ds_s)

0.019331977543731452

In [5]:
calculate_jaccard_similarity(ds_zero_s)

0.059935549630957735

In [6]:
calculate_jaccard_similarity(ds_few_s)

0.036234583270342985

In [7]:
calculate_jaccard_similarity(ds_two_s)

0.023632145589610014

In [8]:
calculate_jaccard_similarity_group(ds_s, ds_zero_s)

0.023224097514402416

In [9]:
calculate_jaccard_similarity_group(ds_s, ds_few_s)

0.022722382926315655

In [10]:
calculate_jaccard_similarity_group(ds_s, ds_two_s)

0.010858853504418002